From 51843fd7d043428b5ef3bb77cc683e5339b2d95e Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 22 May 2023 12:15:11 +0200
Subject: [PATCH 001/199] Refactor full determinism (#3485)

* up
* fix more
* Apply suggestions from code review
* fix more
* fix more
* Check it
* Remove 16:8
* fix more
* fix more
* fix more
* up
* up
* Test only stable diffusion
* Test only two files
* up
* Try out spinning up processes that can be killed
* up
* Apply suggestions from code review
* up
* up
---
 src/diffusers/training_utils.py               | 23 +-----------------
 src/diffusers/utils/testing_utils.py          | 18 ++++++++++++++
 tests/models/test_layers_utils.py             |  3 ---
 tests/models/test_models_unet_1d.py           |  3 ---
 tests/models/test_models_unet_2d.py           |  5 ++--
 tests/models/test_models_unet_2d_condition.py |  5 ++--
 tests/models/test_models_unet_3d_condition.py |  5 ++--
 tests/models/test_models_vae.py               |  4 ++--
 tests/models/test_models_vq.py                |  4 ++--
 tests/others/test_ema.py                      |  5 ++--
 .../altdiffusion/test_alt_diffusion.py        |  5 ++--
 .../test_alt_diffusion_img2img.py             |  5 ++--
 .../audio_diffusion/test_audio_diffusion.py   |  5 ++--
 tests/pipelines/audioldm/test_audioldm.py     |  4 ++--
 tests/pipelines/controlnet/test_controlnet.py |  5 ++--
 .../controlnet/test_controlnet_img2img.py     |  5 ++--
 .../controlnet/test_controlnet_inpaint.py     |  5 ++--
 .../dance_diffusion/test_dance_diffusion.py   |  4 ++--
 tests/pipelines/ddim/test_ddim.py             |  4 ++--
 tests/pipelines/ddpm/test_ddpm.py             |  4 ++--
 tests/pipelines/dit/test_dit.py               |  4 ++--
 tests/pipelines/karras_ve/test_karras_ve.py   |  4 ++--
 .../latent_diffusion/test_latent_diffusion.py | 11 +++++++--
 .../test_latent_diffusion_superresolution.py  |  4 ++--
 .../test_latent_diffusion_uncond.py           |  4 ++--
 .../paint_by_example/test_paint_by_example.py |  5 ++--
 tests/pipelines/pndm/test_pndm.py             |  4 ++--
 tests/pipelines/repaint/test_repaint.py       | 13 +++++++---
 .../score_sde_ve/test_score_sde_ve.py         |  4 ++--
 .../test_semantic_diffusion.py                |  4 ++--
 .../test_spectrogram_diffusion.py             |  4 ++--
 .../stable_diffusion/test_cycle_diffusion.py  |  5 ++--
 .../stable_diffusion/test_stable_diffusion.py | 24 +++++++++++++++----
 .../test_stable_diffusion_image_variation.py  |  5 ++--
 .../test_stable_diffusion_img2img.py          |  5 ++--
 .../test_stable_diffusion_inpaint.py          |  5 ++--
 .../test_stable_diffusion_inpaint_legacy.py   |  5 ++--
 ...st_stable_diffusion_instruction_pix2pix.py |  5 ++--
 .../test_stable_diffusion_k_diffusion.py      |  4 ++--
 .../test_stable_diffusion_model_editing.py    |  5 ++--
 .../test_stable_diffusion_panorama.py         |  5 ++--
 .../test_stable_diffusion_pix2pix_zero.py     |  5 ++--
 .../test_stable_diffusion_sag.py              |  5 ++--
 .../test_stable_diffusion.py                  |  5 ++--
 .../test_stable_diffusion_depth.py            |  5 ++--
 .../test_stable_diffusion_diffedit.py         |  5 ++--
 .../test_stable_diffusion_inpaint.py          |  5 ++--
 .../test_stable_diffusion_latent_upscale.py   |  4 ++--
 .../test_stable_diffusion_upscale.py          |  4 ++--
 .../test_stable_diffusion_v_pred.py           |  5 ++--
 .../test_safe_diffusion.py                    |  3 ---
 .../stable_unclip/test_stable_unclip.py       |  5 ++--
 .../test_stable_unclip_img2img.py             |  4 ++--
 tests/pipelines/test_pipelines.py             |  6 ++---
 tests/pipelines/test_pipelines_common.py      |  3 ---
 .../text_to_video/test_text_to_video.py       |  4 ++--
 tests/pipelines/unclip/test_unclip.py         |  5 ++--
 .../unclip/test_unclip_image_variation.py     |  5 ++--
 58 files changed, 158 insertions(+), 170 deletions(-)

diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py
index 1a3abb49a065..df9c7e882682 100644
--- a/src/diffusers/training_utils.py
+++ b/src/diffusers/training_utils.py
@@ -1,7 +1,6 @@ import contextlib import copy -import os -import random +from random import random from typing import Any, Dict, Iterable, Optional, Union import numpy as np @@ -14,26 +13,6 @@ import transformers -def enable_full_determinism(seed: int): - """ - Helper function for reproducible behavior during distributed training. See - - https://pytorch.org/docs/stable/notes/randomness.html for pytorch - """ - # set seed first - set_seed(seed) - - # Enable PyTorch deterministic mode. This potentially requires either the environment - # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set, - # depending on the CUDA version, so we set them both here - os.environ["CUDA_LAUNCH_BLOCKING"] = "1" - os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" - torch.use_deterministic_algorithms(True) - - # Enable CUDNN deterministic mode - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False - - def set_seed(seed: int): """ Args: diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 4ad7d97b4462..93d0ef5b7b5f 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -514,3 +514,21 @@ def __exit__(self, *exc): def __repr__(self): return f"captured: {self.out}\n" + + +def enable_full_determinism(): + """ + Helper function for reproducible behavior during distributed training. See + - https://pytorch.org/docs/stable/notes/randomness.html for pytorch + """ + # Enable PyTorch deterministic mode. This potentially requires either the environment + # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set, + # depending on the CUDA version, so we set them both here + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" + torch.use_deterministic_algorithms(True) + + # Enable CUDNN deterministic mode + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.backends.cuda.matmul.allow_tf32 = False diff --git a/tests/models/test_layers_utils.py b/tests/models/test_layers_utils.py index 98fa1afcbb9d..b438b2ddb4af 100644 --- a/tests/models/test_layers_utils.py +++ b/tests/models/test_layers_utils.py @@ -27,9 +27,6 @@ from diffusers.utils import torch_device -torch.backends.cuda.matmul.allow_tf32 = False - - class EmbeddingsTests(unittest.TestCase): def test_timestep_embeddings(self): embedding_dim = 256 diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py index 78f759cb1a24..9fb1a61011e3 100644 --- a/tests/models/test_models_unet_1d.py +++ b/tests/models/test_models_unet_1d.py @@ -23,9 +23,6 @@ from .test_modeling_common import ModelTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False - - class UNet1DModelTests(ModelTesterMixin, unittest.TestCase): model_class = UNet1DModel diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py index 8f9a6b813f19..92a5664daa2b 100644 --- a/tests/models/test_models_unet_2d.py +++ b/tests/models/test_models_unet_2d.py @@ -21,13 +21,14 @@ from diffusers import UNet2DModel from diffusers.utils import floats_tensor, logging, slow, torch_all_close, torch_device +from diffusers.utils.testing_utils import enable_full_determinism from .test_modeling_common import ModelTesterMixin logger = logging.get_logger(__name__) -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) + +enable_full_determinism() class Unet2DModelTests(ModelTesterMixin, unittest.TestCase): diff --git 
a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index d3ca5ea3048e..43a487a32b43 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -33,13 +33,14 @@ torch_device, ) from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import enable_full_determinism from .test_modeling_common import ModelTesterMixin logger = logging.get_logger(__name__) -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) + +enable_full_determinism() def create_lora_layers(model, mock_weights: bool = True): diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py index 08863adfeaac..928f6bcbe960 100644 --- a/tests/models/test_models_unet_3d_condition.py +++ b/tests/models/test_models_unet_3d_condition.py @@ -29,13 +29,14 @@ torch_device, ) from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import enable_full_determinism from .test_modeling_common import ModelTesterMixin +enable_full_determinism() + logger = logging.get_logger(__name__) -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) def create_lora_layers(model, mock_weights: bool = True): diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py index 9a3e49cdfbc0..fe27e138f5fa 100644 --- a/tests/models/test_models_vae.py +++ b/tests/models/test_models_vae.py @@ -22,12 +22,12 @@ from diffusers import AutoencoderKL from diffusers.utils import floats_tensor, load_hf_numpy, require_torch_gpu, slow, torch_all_close, torch_device from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.testing_utils import enable_full_determinism from .test_modeling_common import ModelTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase): diff --git a/tests/models/test_models_vq.py b/tests/models/test_models_vq.py index f0be6f6a6d64..8ea6ef77ce63 100644 --- a/tests/models/test_models_vq.py +++ b/tests/models/test_models_vq.py @@ -19,12 +19,12 @@ from diffusers import VQModel from diffusers.utils import floats_tensor, torch_device +from diffusers.utils.testing_utils import enable_full_determinism from .test_modeling_common import ModelTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class VQModelTests(ModelTesterMixin, unittest.TestCase): diff --git a/tests/others/test_ema.py b/tests/others/test_ema.py index 5526aadc4757..32f7ae8a9a8e 100644 --- a/tests/others/test_ema.py +++ b/tests/others/test_ema.py @@ -20,11 +20,10 @@ from diffusers import UNet2DConditionModel from diffusers.training_utils import EMAModel -from diffusers.utils.testing_utils import skip_mps, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps, torch_device -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class EMAModelTests(unittest.TestCase): diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py index 9237f7435b95..6842d29dc6c0 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -26,14 +26,13 @@ 
RobertaSeriesModelWithTransformation, ) from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class AltDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py index 35a4e91284cd..61457e6ca01f 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion_img2img.py @@ -33,11 +33,10 @@ RobertaSeriesModelWithTransformation, ) from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class AltDiffusionImg2ImgPipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/tests/pipelines/audio_diffusion/test_audio_diffusion.py index a848bd031797..8c20f011cb86 100644 --- a/tests/pipelines/audio_diffusion/test_audio_diffusion.py +++ b/tests/pipelines/audio_diffusion/test_audio_diffusion.py @@ -30,11 +30,10 @@ UNet2DModel, ) from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class PipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 566b2c2d2cd0..0825fc36a266 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -37,13 +37,13 @@ UNet2DConditionModel, ) from diffusers.utils import slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class AudioLDMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 0453bb38e1ee..97b5e20f3c14 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -32,7 +32,7 @@ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import ( TEXT_TO_IMAGE_BATCH_PARAMS, @@ 
-41,8 +41,7 @@ from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class ControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index b83a8af2778b..9d3b10aa8283 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -35,7 +35,7 @@ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel from diffusers.utils import floats_tensor, load_image, load_numpy, randn_tensor, slow, torch_device from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import ( TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, @@ -44,8 +44,7 @@ from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class ControlNetImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index 786b0e608ef0..155286630c04 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -35,7 +35,7 @@ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel from diffusers.utils import floats_tensor, load_image, load_numpy, randn_tensor, slow, torch_device from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import ( TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, @@ -44,8 +44,7 @@ from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class ControlNetInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/dance_diffusion/test_dance_diffusion.py b/tests/pipelines/dance_diffusion/test_dance_diffusion.py index 361839043c9f..0ba86daa61fc 100644 --- a/tests/pipelines/dance_diffusion/test_dance_diffusion.py +++ b/tests/pipelines/dance_diffusion/test_dance_diffusion.py @@ -21,13 +21,13 @@ from diffusers import DanceDiffusionPipeline, IPNDMScheduler, UNet1DModel from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps from ..pipeline_params import UNCONDITIONAL_AUDIO_GENERATION_BATCH_PARAMS, UNCONDITIONAL_AUDIO_GENERATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class DanceDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/ddim/test_ddim.py 
b/tests/pipelines/ddim/test_ddim.py index e997ae45d975..0861d7daab29 100644 --- a/tests/pipelines/ddim/test_ddim.py +++ b/tests/pipelines/ddim/test_ddim.py @@ -19,13 +19,13 @@ import torch from diffusers import DDIMPipeline, DDIMScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class DDIMPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/ddpm/test_ddpm.py b/tests/pipelines/ddpm/test_ddpm.py index 5e3e47cb74fb..a3c290215114 100644 --- a/tests/pipelines/ddpm/test_ddpm.py +++ b/tests/pipelines/ddpm/test_ddpm.py @@ -19,10 +19,10 @@ import torch from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class DDPMPipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/dit/test_dit.py b/tests/pipelines/dit/test_dit.py index d8098178f339..4937915696b4 100644 --- a/tests/pipelines/dit/test_dit.py +++ b/tests/pipelines/dit/test_dit.py @@ -21,7 +21,7 @@ from diffusers import AutoencoderKL, DDIMScheduler, DiTPipeline, DPMSolverMultistepScheduler, Transformer2DModel from diffusers.utils import is_xformers_available, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import ( CLASS_CONDITIONED_IMAGE_GENERATION_BATCH_PARAMS, @@ -30,7 +30,7 @@ from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class DiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/karras_ve/test_karras_ve.py b/tests/pipelines/karras_ve/test_karras_ve.py index 391e61a2b9c9..142058bcd710 100644 --- a/tests/pipelines/karras_ve/test_karras_ve.py +++ b/tests/pipelines/karras_ve/test_karras_ve.py @@ -19,10 +19,10 @@ import torch from diffusers import KarrasVePipeline, KarrasVeScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch, slow, torch_device -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class KarrasVePipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion.py b/tests/pipelines/latent_diffusion/test_latent_diffusion.py index 05ff4162e5c6..88dc8ef9b17b 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion.py @@ -21,13 +21,20 @@ from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import AutoencoderKL, DDIMScheduler, LDMTextToImagePipeline, UNet2DConditionModel -from diffusers.utils.testing_utils import load_numpy, nightly, require_torch_gpu, slow, torch_device +from diffusers.utils.testing_utils import ( + enable_full_determinism, + load_numpy, + nightly, + 
require_torch_gpu, + slow, + torch_device, +) from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class LDMTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py index f1aa2f08efba..d21ead543af8 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_superresolution.py @@ -21,10 +21,10 @@ from diffusers import DDIMScheduler, LDMSuperResolutionPipeline, UNet2DModel, VQModel from diffusers.utils import PIL_INTERPOLATION, floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import require_torch +from diffusers.utils.testing_utils import enable_full_determinism, require_torch -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class LDMSuperResolutionPipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py b/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py index aa7b33730d18..ff8670ea2950 100644 --- a/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py +++ b/tests/pipelines/latent_diffusion/test_latent_diffusion_uncond.py @@ -20,10 +20,10 @@ from transformers import CLIPTextConfig, CLIPTextModel from diffusers import DDIMScheduler, LDMPipeline, UNet2DModel, VQModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch, slow, torch_device -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class LDMPipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/paint_by_example/test_paint_by_example.py b/tests/pipelines/paint_by_example/test_paint_by_example.py index 80ba3f5ed37f..14c16644889e 100644 --- a/tests/pipelines/paint_by_example/test_paint_by_example.py +++ b/tests/pipelines/paint_by_example/test_paint_by_example.py @@ -25,14 +25,13 @@ from diffusers import AutoencoderKL, PaintByExamplePipeline, PNDMScheduler, UNet2DConditionModel from diffusers.pipelines.paint_by_example import PaintByExampleImageEncoder from diffusers.utils import floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import IMAGE_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_GUIDED_IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class PaintByExamplePipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/pndm/test_pndm.py b/tests/pipelines/pndm/test_pndm.py index bed5fea561dc..c2595713933c 100644 --- a/tests/pipelines/pndm/test_pndm.py +++ b/tests/pipelines/pndm/test_pndm.py @@ -19,10 +19,10 @@ import torch from diffusers import PNDMPipeline, PNDMScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch, slow, torch_device -torch.backends.cuda.matmul.allow_tf32 = False 
+enable_full_determinism() class PNDMPipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/repaint/test_repaint.py b/tests/pipelines/repaint/test_repaint.py index 59968eaf101c..e372cf979ebb 100644 --- a/tests/pipelines/repaint/test_repaint.py +++ b/tests/pipelines/repaint/test_repaint.py @@ -20,14 +20,21 @@ import torch from diffusers import RePaintPipeline, RePaintScheduler, UNet2DModel -from diffusers.utils.testing_utils import load_image, load_numpy, nightly, require_torch_gpu, skip_mps, torch_device +from diffusers.utils.testing_utils import ( + enable_full_determinism, + load_image, + load_numpy, + nightly, + require_torch_gpu, + skip_mps, + torch_device, +) from ..pipeline_params import IMAGE_INPAINTING_BATCH_PARAMS, IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class RepaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/score_sde_ve/test_score_sde_ve.py b/tests/pipelines/score_sde_ve/test_score_sde_ve.py index 036ecc3f6bf3..32505253f6c7 100644 --- a/tests/pipelines/score_sde_ve/test_score_sde_ve.py +++ b/tests/pipelines/score_sde_ve/test_score_sde_ve.py @@ -19,10 +19,10 @@ import torch from diffusers import ScoreSdeVePipeline, ScoreSdeVeScheduler, UNet2DModel -from diffusers.utils.testing_utils import require_torch, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch, slow, torch_device -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class ScoreSdeVeipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py b/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py index ba42b1fe9c5f..9e810616dc56 100644 --- a/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py +++ b/tests/pipelines/semantic_stable_diffusion/test_semantic_diffusion.py @@ -25,10 +25,10 @@ from diffusers import AutoencoderKL, DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler, UNet2DConditionModel from diffusers.pipelines.semantic_stable_diffusion import SemanticStableDiffusionPipeline as StableDiffusionPipeline from diffusers.utils import floats_tensor, nightly, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class SafeDiffusionPipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py index 3ec6f681be79..cc8690eb87ca 100644 --- a/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py +++ b/tests/pipelines/spectrogram_diffusion/test_spectrogram_diffusion.py @@ -22,13 +22,13 @@ from diffusers import DDPMScheduler, MidiProcessor, SpectrogramDiffusionPipeline from diffusers.pipelines.spectrogram_diffusion import SpectrogramContEncoder, SpectrogramNotesEncoder, T5FilmDecoder from diffusers.utils import require_torch_gpu, skip_mps, slow, torch_device -from diffusers.utils.testing_utils import require_note_seq, require_onnxruntime +from diffusers.utils.testing_utils import enable_full_determinism, require_note_seq, require_onnxruntime from ..pipeline_params import TOKENS_TO_AUDIO_GENERATION_BATCH_PARAMS, TOKENS_TO_AUDIO_GENERATION_PARAMS 
from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() MIDI_FILE = "./tests/fixtures/elise_format0.mid" diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py index 3d6bfff1bbd1..a1ae3d2d0e7c 100644 --- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py @@ -23,14 +23,13 @@ from diffusers import AutoencoderKL, CycleDiffusionPipeline, DDIMScheduler, UNet2DConditionModel from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class CycleDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 1f52a09b672b..aec4436710b9 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -15,11 +15,16 @@ import gc +import os +import signal +import subprocess +import sys import tempfile import time import unittest import numpy as np +import pytest import torch from huggingface_hub import hf_hub_download from packaging import version @@ -39,15 +44,25 @@ ) from diffusers.models.attention_processor import AttnProcessor from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu +from diffusers.utils.testing_utils import CaptureLogger, enable_full_determinism, require_torch_gpu from ...models.test_models_unet_2d_condition import create_lora_layers from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +@pytest.fixture(autouse=True) +def process_fixture(): + # This will be run before each test + command = [sys.executable, os.path.abspath(__file__)] + process = subprocess.Popen(command) + enable_full_determinism() + yield process + # This will be run after each test + try: + os.kill(process.pid, signal.SIGTERM) # or signal.SIGKILL + except ProcessLookupError: + pass class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): @@ -551,8 +566,7 @@ def test_inference_batch_single_identical(self): @slow @require_torch_gpu class StableDiffusionPipelineSlowTests(unittest.TestCase): - def tearDown(self): - super().tearDown() + def setUp(self): gc.collect() torch.cuda.empty_cache() diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index 0ce55ae78ae0..c35d84de9802 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ 
b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -30,14 +30,13 @@ UNet2DConditionModel, ) from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusionImageVariationPipelineFastTests( diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 4afc16d9b65f..8ab252b9be80 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -34,7 +34,7 @@ ) from diffusers.image_processor import VaeImageProcessor from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, @@ -44,8 +44,7 @@ from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 5c2d9d7c44f7..44de277ead07 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -33,15 +33,14 @@ ) from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ...models.test_models_unet_2d_condition import create_lora_layers from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusionInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py index 8647041fbb6f..fa00a0d201af 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py @@ -34,11 +34,10 @@ VQModel, ) from diffusers.utils import floats_tensor, load_image, nightly, slow, torch_device -from diffusers.utils.testing_utils import load_numpy, preprocess_image, require_torch_gpu +from 
diffusers.utils.testing_utils import enable_full_determinism, load_numpy, preprocess_image, require_torch_gpu -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusionInpaintLegacyPipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index 99a069493885..fbff6c554967 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -32,14 +32,13 @@ UNet2DConditionModel, ) from diffusers.utils import floats_tensor, load_image, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusionInstructPix2PixPipelineFastTests( diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py index 546b1d21252c..4eccb871a0cb 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_k_diffusion.py @@ -21,10 +21,10 @@ from diffusers import StableDiffusionKDiffusionPipeline from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() @slow diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index b448dbef1ebe..cba20417bca0 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -29,14 +29,13 @@ UNet2DConditionModel, ) from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() @skip_mps diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 61708b36bfee..02a15b2a29dc 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -30,14 +30,13 @@ UNet2DConditionModel, ) from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps from ..pipeline_params import 
TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() @skip_mps diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index 90cc85646462..98f5910ab313 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -33,14 +33,13 @@ UNet2DConditionModel, ) from diffusers.utils import floats_tensor, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import load_image, load_pt, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, load_image, load_pt, require_torch_gpu, skip_mps from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() @skip_mps diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py index 7cb8ab409a9b..2b0f0bfc11a6 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py @@ -27,14 +27,13 @@ UNet2DConditionModel, ) from diffusers.utils import slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusionSAGPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index bc4ab7d66431..3f9867783b33 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -33,14 +33,13 @@ logging, ) from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu +from diffusers.utils.testing_utils import CaptureLogger, enable_full_determinism, require_torch_gpu from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusion2PipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index ae1eefa68242..08ac29868971 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ 
b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -49,14 +49,13 @@ slow, torch_device, ) -from diffusers.utils.testing_utils import require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() @skip_mps diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py index c9da7b06893f..8df5b6da846c 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py @@ -33,14 +33,13 @@ UNet2DConditionModel, ) from diffusers.utils import load_image, slow -from diffusers.utils.testing_utils import floats_tensor, require_torch_gpu, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusionDiffEditPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index 77242add93e9..10d8561f0126 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -24,14 +24,13 @@ from diffusers import AutoencoderKL, PNDMScheduler, StableDiffusionInpaintPipeline, UNet2DConditionModel from diffusers.utils import floats_tensor, load_image, load_numpy, torch_device -from diffusers.utils.testing_utils import require_torch_gpu, slow +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusion2InpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index 539b4b1cc350..561536a44ea0 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -29,13 +29,13 @@ UNet2DConditionModel, ) from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from 
..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class StableDiffusionLatentUpscalePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py index 747809a4fb2e..7100e5023a5d 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_upscale.py @@ -24,10 +24,10 @@ from diffusers import AutoencoderKL, DDIMScheduler, DDPMScheduler, StableDiffusionUpscalePipeline, UNet2DConditionModel from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu -torch.backends.cuda.matmul.allow_tf32 = False +enable_full_determinism() class StableDiffusionUpscalePipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index a874cbb7e0c5..d1a2c856659f 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -30,11 +30,10 @@ UNet2DConditionModel, ) from diffusers.utils import load_numpy, slow, torch_device -from diffusers.utils.testing_utils import require_torch_gpu +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableDiffusion2VPredictionPipelineFastTests(unittest.TestCase): diff --git a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py index c614fa48055e..09e31aacfbc9 100644 --- a/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py +++ b/tests/pipelines/stable_diffusion_safe/test_safe_diffusion.py @@ -28,9 +28,6 @@ from diffusers.utils.testing_utils import require_torch_gpu -torch.backends.cuda.matmul.allow_tf32 = False - - class SafeDiffusionPipelineFastTests(unittest.TestCase): def tearDown(self): # clean up the VRAM after each test diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py index 78775a938b5b..8b4a065cd4bf 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -13,14 +13,13 @@ UNet2DConditionModel, ) from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer -from diffusers.utils.testing_utils import load_numpy, require_torch_gpu, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, require_torch_gpu, slow, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableUnCLIPPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git 
a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index dcd4300b85c1..35cae61242c4 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -18,6 +18,7 @@ from diffusers.pipelines.stable_diffusion.stable_unclip_image_normalizer import StableUnCLIPImageNormalizer from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import ( + enable_full_determinism, floats_tensor, load_image, load_numpy, @@ -35,8 +36,7 @@ ) -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class StableUnCLIPImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index df1a3b6ac7bb..a9abb0b4fb62 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -65,6 +65,7 @@ ) from diffusers.utils.testing_utils import ( CaptureLogger, + enable_full_determinism, get_tests_dir, load_numpy, require_compel, @@ -73,8 +74,7 @@ ) -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() class DownloadTests(unittest.TestCase): @@ -700,7 +700,6 @@ def test_local_custom_pipeline_file(self): def test_download_from_git(self): # Because adaptive_avg_pool2d_backward_cuda # does not have a deterministic implementation. - torch.use_deterministic_algorithms(False) clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id) @@ -722,7 +721,6 @@ def test_download_from_git(self): image = pipeline("a prompt", num_inference_steps=2, output_type="np").images[0] assert image.shape == (512, 512, 3) - torch.use_deterministic_algorithms(True) def test_save_pipeline_change_config(self): pipe = DiffusionPipeline.from_pretrained( diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index f23e850f4d54..3984ed76edce 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -18,9 +18,6 @@ from diffusers.utils.testing_utils import require_torch, torch_device -torch.backends.cuda.matmul.allow_tf32 = False - - def to_np(tensor): if isinstance(tensor, torch.Tensor): tensor = tensor.detach().cpu().numpy() diff --git a/tests/pipelines/text_to_video/test_text_to_video.py b/tests/pipelines/text_to_video/test_text_to_video.py index 212becbb6729..8b4bae2275e5 100644 --- a/tests/pipelines/text_to_video/test_text_to_video.py +++ b/tests/pipelines/text_to_video/test_text_to_video.py @@ -27,13 +27,13 @@ UNet3DConditionModel, ) from diffusers.utils import load_numpy, skip_mps, slow +from diffusers.utils.testing_utils import enable_full_determinism from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineTesterMixin -torch.backends.cuda.matmul.allow_tf32 = False -torch.use_deterministic_algorithms(True) +enable_full_determinism() @skip_mps diff --git a/tests/pipelines/unclip/test_unclip.py b/tests/pipelines/unclip/test_unclip.py index 5357e5b0e7ef..393c3ba1635d 100644 --- a/tests/pipelines/unclip/test_unclip.py +++ b/tests/pipelines/unclip/test_unclip.py @@ -23,14 +23,13 @@ from diffusers import PriorTransformer, UnCLIPPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel from 
diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
 from diffusers.utils import load_numpy, nightly, slow, torch_device
-from diffusers.utils.testing_utils import require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps
 
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
 
 
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
 
 
 class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
diff --git a/tests/pipelines/unclip/test_unclip_image_variation.py b/tests/pipelines/unclip/test_unclip_image_variation.py
index ded162102dd6..75a26250807b 100644
--- a/tests/pipelines/unclip/test_unclip_image_variation.py
+++ b/tests/pipelines/unclip/test_unclip_image_variation.py
@@ -37,14 +37,13 @@
 )
 from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
 from diffusers.utils import floats_tensor, load_numpy, slow, torch_device
-from diffusers.utils.testing_utils import load_image, require_torch_gpu, skip_mps
+from diffusers.utils.testing_utils import enable_full_determinism, load_image, require_torch_gpu, skip_mps
 
 from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference
 
 
-torch.backends.cuda.matmul.allow_tf32 = False
-torch.use_deterministic_algorithms(True)
+enable_full_determinism()
 
 
 class UnCLIPImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

From 6dd3871ae05bd2a7d52c637e14db17887868aee9 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Mon, 22 May 2023 15:32:39 +0200
Subject: [PATCH 002/199] Fix DPM single (#3413)

* Fix DPM single
* add test
* fix one more bug
* Apply suggestions from code review
Co-authored-by: StAlKeR7779
---------
Co-authored-by: StAlKeR7779
---
 .../scheduling_dpmsolver_singlestep.py        | 19 ++++++++++++++++++-
 tests/schedulers/test_scheduler_dpm_single.py | 16 ++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
index 9307db89d8d7..8ddd30b0a192 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
@@ -21,9 +21,13 @@
 import torch
 
 from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import logging
 from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, SchedulerOutput
 
 
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
     """
@@ -251,7 +255,14 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic
         self.timesteps = torch.from_numpy(timesteps).to(device)
         self.model_outputs = [None] * self.config.solver_order
         self.sample = None
-        self.orders = self.get_order_list(num_inference_steps)
+
+        if not self.config.lower_order_final and num_inference_steps % self.config.solver_order != 0:
+            logger.warn(
+                "Changing scheduler {self.config} to have `lower_order_final` set to True to handle uneven amount of inference steps. Please make sure to always use an even number of `num_inference steps when using `lower_order_final=True`."
+            )
+            self.register_to_config(lower_order_final=True)
+
+        self.order_list = self.get_order_list(num_inference_steps)
 
     # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
     def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor:
@@ -597,6 +608,12 @@ def step(
         self.model_outputs[-1] = model_output
 
         order = self.order_list[step_index]
+
+        # For img2img denoising might start with order>1 which is not possible
+        # In this case make sure that the first two steps are both order=1
+        while self.model_outputs[-order] is None:
+            order -= 1
+
         # For single-step solvers, we use the initial value at each time with order = 1.
         if order == 1:
             self.sample = sample
diff --git a/tests/schedulers/test_scheduler_dpm_single.py b/tests/schedulers/test_scheduler_dpm_single.py
index fd7395e794c7..18a706a1f59b 100644
--- a/tests/schedulers/test_scheduler_dpm_single.py
+++ b/tests/schedulers/test_scheduler_dpm_single.py
@@ -116,6 +116,22 @@ def full_loop(self, scheduler=None, **config):
 
         return sample
 
+    def test_full_uneven_loop(self):
+        scheduler = DPMSolverSinglestepScheduler(**self.get_scheduler_config())
+        num_inference_steps = 50
+        model = self.dummy_model()
+        sample = self.dummy_sample_deter
+        scheduler.set_timesteps(num_inference_steps)
+
+        # make sure that the first t is uneven
+        for i, t in enumerate(scheduler.timesteps[3:]):
+            residual = model(sample, t)
+            sample = scheduler.step(residual, t, sample).prev_sample
+
+        result_mean = torch.mean(torch.abs(sample))
+
+        assert abs(result_mean.item() - 0.2574) < 1e-3
+
     def test_timesteps(self):
         for timesteps in [25, 50, 100, 999, 1000]:
             self.check_over_configs(num_train_timesteps=timesteps)

From 194b0a425dfa0bcdb048ab8f37d1668682c1a91b Mon Sep 17 00:00:00 2001
From: Isotr0py <41363108+Isotr0py@users.noreply.github.com>
Date: Mon, 22 May 2023 22:43:56 +0800
Subject: [PATCH 003/199] Add `use_Karras_sigmas` to DPMSolverSinglestepScheduler (#3476)

* add use_karras_sigmas
* add karras test
* add doc
---
 .../scheduling_dpmsolver_singlestep.py        | 52 +++++++++++++++++++
 tests/schedulers/test_scheduler_dpm_single.py | 12 +++++
 2 files changed, 64 insertions(+)

diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
index 8ddd30b0a192..7fa8eabb5a15 100644
--- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
+++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py
@@ -117,6 +117,10 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
         lower_order_final (`bool`, default `True`):
             whether to use lower-order solvers in the final steps. For singlestep schedulers, we recommend to enable
             this to use up all the function evaluations.
+        use_karras_sigmas (`bool`, *optional*, defaults to `False`):
+            This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the
+            noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence
+            of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf.
         lambda_min_clipped (`float`, default `-inf`):
             the clipping threshold for the minimum value of lambda(t) for numerical stability. This is critical for
             cosine (squaredcos_cap_v2) noise schedule.
@@ -150,6 +154,7 @@ def __init__( algorithm_type: str = "dpmsolver++", solver_type: str = "midpoint", lower_order_final: bool = True, + use_karras_sigmas: Optional[bool] = False, lambda_min_clipped: float = -float("inf"), variance_type: Optional[str] = None, ): @@ -197,6 +202,7 @@ def __init__( self.model_outputs = [None] * solver_order self.sample = None self.order_list = self.get_order_list(num_train_timesteps) + self.use_karras_sigmas = use_karras_sigmas def get_order_list(self, num_inference_steps: int) -> List[int]: """ @@ -252,6 +258,14 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic .copy() .astype(np.int64) ) + + if self.use_karras_sigmas: + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + log_sigmas = np.log(sigmas) + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + timesteps = np.flip(timesteps).copy().astype(np.int64) + self.timesteps = torch.from_numpy(timesteps).to(device) self.model_outputs = [None] * self.config.solver_order self.sample = None @@ -299,6 +313,44 @@ def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: return sample + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t + def _sigma_to_t(self, sigma, log_sigmas): + # get log sigma + log_sigma = np.log(sigma) + + # get distribution + dists = log_sigma - log_sigmas[:, np.newaxis] + + # get sigmas range + low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2) + high_idx = low_idx + 1 + + low = log_sigmas[low_idx] + high = log_sigmas[high_idx] + + # interpolate sigmas + w = (low - log_sigma) / (low - high) + w = np.clip(w, 0, 1) + + # transform interpolation to time range + t = (1 - w) * low_idx + w * high_idx + t = t.reshape(sigma.shape) + return t + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras + def _convert_to_karras(self, in_sigmas: torch.FloatTensor, num_inference_steps) -> torch.FloatTensor: + """Constructs the noise schedule of Karras et al. 
(2022).""" + + sigma_min: float = in_sigmas[-1].item() + sigma_max: float = in_sigmas[0].item() + + rho = 7.0 # 7.0 is the value used in the paper + ramp = np.linspace(0, 1, num_inference_steps) + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + def convert_model_output( self, model_output: torch.FloatTensor, timestep: int, sample: torch.FloatTensor ) -> torch.FloatTensor: diff --git a/tests/schedulers/test_scheduler_dpm_single.py b/tests/schedulers/test_scheduler_dpm_single.py index 18a706a1f59b..66be3d5d00ad 100644 --- a/tests/schedulers/test_scheduler_dpm_single.py +++ b/tests/schedulers/test_scheduler_dpm_single.py @@ -215,12 +215,24 @@ def test_full_loop_no_noise(self): assert abs(result_mean.item() - 0.2791) < 1e-3 + def test_full_loop_with_karras(self): + sample = self.full_loop(use_karras_sigmas=True) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_mean.item() - 0.2248) < 1e-3 + def test_full_loop_with_v_prediction(self): sample = self.full_loop(prediction_type="v_prediction") result_mean = torch.mean(torch.abs(sample)) assert abs(result_mean.item() - 0.1453) < 1e-3 + def test_full_loop_with_karras_and_v_prediction(self): + sample = self.full_loop(prediction_type="v_prediction", use_karras_sigmas=True) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_mean.item() - 0.0649) < 1e-3 + def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] scheduler_config = self.get_scheduler_config(thresholding=True, dynamic_thresholding_ratio=0) From 0160e5146f00ad541a857a16ecc1512e4f6e39bb Mon Sep 17 00:00:00 2001 From: w4ffl35 Date: Mon, 22 May 2023 08:44:36 -0600 Subject: [PATCH 004/199] Adds local_files_only bool to prevent forced online connection (#3486) --- .../pipelines/stable_diffusion/convert_from_ckpt.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 42e8ae7cafd2..ff9e03d29347 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -727,8 +727,8 @@ def _copy_layers(hf_layers, pt_layers): return hf_model -def convert_ldm_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") +def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False): + text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) keys = list(checkpoint.keys()) @@ -992,6 +992,7 @@ def download_from_original_stable_diffusion_ckpt( controlnet: Optional[bool] = None, load_safety_checker: bool = True, pipeline_class: DiffusionPipeline = None, + local_files_only=False ) -> DiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` @@ -1037,6 +1038,8 @@ def download_from_original_stable_diffusion_ckpt( Whether to load the safety checker or not. Defaults to `True`. pipeline_class (`str`, *optional*, defaults to `None`): The pipeline class to use. Pass `None` to determine automatically. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. 
""" @@ -1292,7 +1295,7 @@ def download_from_original_stable_diffusion_ckpt( feature_extractor=feature_extractor, ) elif model_type == "FrozenCLIPEmbedder": - text_model = convert_ldm_clip_checkpoint(checkpoint) + text_model = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only) tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") if load_safety_checker: From a2874af2971d1b262371d9a6fae653662c4a5e95 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 22 May 2023 16:44:48 +0200 Subject: [PATCH 005/199] make style --- src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index ff9e03d29347..7ba1bbd996db 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -992,7 +992,7 @@ def download_from_original_stable_diffusion_ckpt( controlnet: Optional[bool] = None, load_safety_checker: bool = True, pipeline_class: DiffusionPipeline = None, - local_files_only=False + local_files_only=False, ) -> DiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` From 229fd8cbca989b675ed9ad30676b323eebc24fbc Mon Sep 17 00:00:00 2001 From: Seongsu Park Date: Mon, 22 May 2023 23:46:16 +0900 Subject: [PATCH 006/199] [Docs] Korean translation (optimization, training) (#3488) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat) optimization kr translation * fix) typo, italic setting * feat) dreambooth, text2image kr * feat) lora kr * fix) LoRA * fix) fp16 fix * fix) doc-builder style * fix) fp16 일부 단어 수정 * fix) fp16 style fix * fix) opt, training docs update * feat) toctree update * feat) toctree update --------- Co-authored-by: Chanran Kim --- docs/source/ko/_toctree.yml | 207 ++-------- docs/source/ko/optimization/fp16.mdx | 410 +++++++++++++++++++ docs/source/ko/optimization/habana.mdx | 71 ++++ docs/source/ko/optimization/mps.mdx | 71 ++++ docs/source/ko/optimization/onnx.mdx | 65 +++ docs/source/ko/optimization/open_vino.mdx | 39 ++ docs/source/ko/optimization/xformers.mdx | 36 ++ docs/source/ko/training/dreambooth.mdx | 475 ++++++++++++++++++++++ docs/source/ko/training/lora.mdx | 128 ++++++ docs/source/ko/training/text2image.mdx | 224 ++++++++++ 10 files changed, 1550 insertions(+), 176 deletions(-) create mode 100644 docs/source/ko/optimization/fp16.mdx create mode 100644 docs/source/ko/optimization/habana.mdx create mode 100644 docs/source/ko/optimization/mps.mdx create mode 100644 docs/source/ko/optimization/onnx.mdx create mode 100644 docs/source/ko/optimization/open_vino.mdx create mode 100644 docs/source/ko/optimization/xformers.mdx create mode 100644 docs/source/ko/training/dreambooth.mdx create mode 100644 docs/source/ko/training/lora.mdx create mode 100644 docs/source/ko/training/text2image.mdx diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index a1c0c690eb94..2fec3af66525 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -3,191 +3,46 @@ title: "🧨 Diffusers" - local: quicktour title: "훑어보기" + - local: in_translation + title: Stable Diffusion - local: installation title: "설치" title: "시작하기" + - sections: - sections: - local: in_translation - title: "Loading Pipelines, Models, and Schedulers" - 
- local: in_translation - title: "Using different Schedulers" - - local: in_translation - title: "Configuring Pipelines, Models, and Schedulers" - - local: in_translation - title: "Loading and Adding Custom Pipelines" - title: "불러오기 & 허브 (번역 예정)" - - sections: - - local: in_translation - title: "Unconditional Image Generation" - - local: in_translation - title: "Text-to-Image Generation" - - local: in_translation - title: "Text-Guided Image-to-Image" + title: 개요 - local: in_translation - title: "Text-Guided Image-Inpainting" + title: Unconditional 이미지 생성 - local: in_translation - title: "Text-Guided Depth-to-Image" + title: Textual Inversion + - local: training/dreambooth + title: DreamBooth + - local: training/text2image + title: Text-to-image + - local: training/lora + title: Low-Rank Adaptation of Large Language Models (LoRA) - local: in_translation - title: "Reusing seeds for deterministic generation" + title: ControlNet - local: in_translation - title: "Community Pipelines" - - local: in_translation - title: "How to contribute a Pipeline" - title: "추론을 위한 파이프라인 (번역 예정)" - - sections: - - local: in_translation - title: "Reinforcement Learning" - - local: in_translation - title: "Audio" - - local: in_translation - title: "Other Modalities" - title: "Taking Diffusers Beyond Images" - title: "Diffusers 사용법 (번역 예정)" -- sections: - - local: in_translation - title: "Memory and Speed" - - local: in_translation - title: "xFormers" - - local: in_translation - title: "ONNX" - - local: in_translation - title: "OpenVINO" - - local: in_translation - title: "MPS" - - local: in_translation - title: "Habana Gaudi" - title: "최적화/특수 하드웨어 (번역 예정)" -- sections: - - local: in_translation - title: "Overview" - - local: in_translation - title: "Unconditional Image Generation" - - local: in_translation - title: "Textual Inversion" - - local: in_translation - title: "Dreambooth" - - local: in_translation - title: "Text-to-image fine-tuning" - title: "학습 (번역 예정)" + title: InstructPix2Pix 학습 + title: 학습 - sections: - local: in_translation - title: "Stable Diffusion" - - local: in_translation - title: "Philosophy" - - local: in_translation - title: "How to contribute?" 
- title: "개념 설명 (번역 예정)" -- sections: - - sections: - - local: in_translation - title: "Models" - - local: in_translation - title: "Diffusion Pipeline" - - local: in_translation - title: "Logging" - - local: in_translation - title: "Configuration" - - local: in_translation - title: "Outputs" - title: "Main Classes" - - - sections: - - local: in_translation - title: "Overview" - - local: in_translation - title: "AltDiffusion" - - local: in_translation - title: "Cycle Diffusion" - - local: in_translation - title: "DDIM" - - local: in_translation - title: "DDPM" - - local: in_translation - title: "Latent Diffusion" - - local: in_translation - title: "Unconditional Latent Diffusion" - - local: in_translation - title: "PaintByExample" - - local: in_translation - title: "PNDM" - - local: in_translation - title: "Score SDE VE" - - sections: - - local: in_translation - title: "Overview" - - local: in_translation - title: "Text-to-Image" - - local: in_translation - title: "Image-to-Image" - - local: in_translation - title: "Inpaint" - - local: in_translation - title: "Depth-to-Image" - - local: in_translation - title: "Image-Variation" - - local: in_translation - title: "Super-Resolution" - title: "Stable Diffusion" - - local: in_translation - title: "Stable Diffusion 2" - - local: in_translation - title: "Safe Stable Diffusion" - - local: in_translation - title: "Stochastic Karras VE" - - local: in_translation - title: "Dance Diffusion" - - local: in_translation - title: "UnCLIP" - - local: in_translation - title: "Versatile Diffusion" - - local: in_translation - title: "VQ Diffusion" - - local: in_translation - title: "RePaint" - - local: in_translation - title: "Audio Diffusion" - title: "파이프라인 (번역 예정)" - - sections: - - local: in_translation - title: "Overview" - - local: in_translation - title: "DDIM" - - local: in_translation - title: "DDPM" - - local: in_translation - title: "Singlestep DPM-Solver" - - local: in_translation - title: "Multistep DPM-Solver" - - local: in_translation - title: "Heun Scheduler" - - local: in_translation - title: "DPM Discrete Scheduler" - - local: in_translation - title: "DPM Discrete Scheduler with ancestral sampling" - - local: in_translation - title: "Stochastic Kerras VE" - - local: in_translation - title: "Linear Multistep" - - local: in_translation - title: "PNDM" - - local: in_translation - title: "VE-SDE" - - local: in_translation - title: "IPNDM" - - local: in_translation - title: "VP-SDE" - - local: in_translation - title: "Euler scheduler" - - local: in_translation - title: "Euler Ancestral Scheduler" - - local: in_translation - title: "VQDiffusionScheduler" - - local: in_translation - title: "RePaint Scheduler" - title: "스케줄러 (번역 예정)" - - sections: - - local: in_translation - title: "RL Planning" - title: "Experimental Features" - title: "API (번역 예정)" + title: 개요 + - local: optimization/fp16 + title: 메모리와 속도 + - local: in_translation + title: Torch2.0 지원 + - local: optimization/xformers + title: xFormers + - local: optimization/onnx + title: ONNX + - local: optimization/open_vino + title: OpenVINO + - local: optimization/mps + title: MPS + - local: optimization/habana + title: Habana Gaudi + title: 최적화/특수 하드웨어 \ No newline at end of file diff --git a/docs/source/ko/optimization/fp16.mdx b/docs/source/ko/optimization/fp16.mdx new file mode 100644 index 000000000000..593860581be3 --- /dev/null +++ b/docs/source/ko/optimization/fp16.mdx @@ -0,0 +1,410 @@ + + +# 메모리와 속도 + +메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다. 
+일반적으로, memory-efficient attention을 위해 [xFormers](https://github.com/facebookresearch/xformers) 사용을 추천하기 때문에, 추천하는 [설치 방법](xformers)을 보고 설치해 보세요. + +다음 설정이 성능과 메모리에 미치는 영향에 대해 설명합니다. + +| | 지연시간 | 속도 향상 | +| ---------------- | ------- | ------- | +| 별도 설정 없음 | 9.50s | x1 | +| cuDNN auto-tuner | 9.37s | x1.01 | +| fp16 | 3.61s | x2.63 | +| Channels Last 메모리 형식 | 3.30s | x2.88 | +| traced UNet | 3.21s | x2.96 | +| memory-efficient attention | 2.63s | x3.61 | + + + NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다. + + +## cuDNN auto-tuner 활성화하기 + +[NVIDIA cuDNN](https://developer.nvidia.com/cudnn)은 컨볼루션을 계산하는 많은 알고리즘을 지원합니다. Autotuner는 짧은 벤치마크를 실행하고 주어진 입력 크기에 대해 주어진 하드웨어에서 최고의 성능을 가진 커널을 선택합니다. + +**컨볼루션 네트워크**를 활용하고 있기 때문에 (다른 유형들은 현재 지원되지 않음), 다음 설정을 통해 추론 전에 cuDNN autotuner를 활성화할 수 있습니다: + +```python +import torch + +torch.backends.cudnn.benchmark = True +``` + +### fp32 대신 tf32 사용하기 (Ampere 및 이후 CUDA 장치들에서) + +Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다. +기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다. +네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다. +이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다. +그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다. +추론하기 전에 다음을 추가하기만 하면 됩니다: + +```python +import torch + +torch.backends.cuda.matmul.allow_tf32 = True +``` + +## 반정밀도 가중치 + +더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 로드하고 실행할 수 있습니다. +여기에는 `fp16`이라는 브랜치에 저장된 float16 버전의 가중치를 불러오고, 그 때 `float16` 유형을 사용하도록 PyTorch에 지시하는 작업이 포함됩니다. + +```Python +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + + torch_dtype=torch.float16, +) +pipe = pipe.to("cuda") + +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt).images[0] +``` + + + 어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다. + + +## 추가 메모리 절약을 위한 슬라이스 어텐션 + +추가 메모리 절약을 위해, 한 번에 모두 계산하는 대신 단계적으로 계산을 수행하는 슬라이스 버전의 어텐션(attention)을 사용할 수 있습니다. + + + Attention slicing은 모델이 하나 이상의 어텐션 헤드를 사용하는 한, 배치 크기가 1인 경우에도 유용합니다. + 하나 이상의 어텐션 헤드가 있는 경우 *QK^T* 어텐션 매트릭스는 상당한 양의 메모리를 절약할 수 있는 각 헤드에 대해 순차적으로 계산될 수 있습니다. + + +각 헤드에 대해 순차적으로 어텐션 계산을 수행하려면, 다음과 같이 추론 전에 파이프라인에서 [`~StableDiffusionPipeline.enable_attention_slicing`]를 호출하면 됩니다: + +```Python +import torch +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + + torch_dtype=torch.float16, +) +pipe = pipe.to("cuda") + +prompt = "a photo of an astronaut riding a horse on mars" +pipe.enable_attention_slicing() +image = pipe(prompt).images[0] +``` + +추론 시간이 약 10% 느려지는 약간의 성능 저하가 있지만 이 방법을 사용하면 3.2GB 정도의 작은 VRAM으로도 Stable Diffusion을 사용할 수 있습니다! + + +## 더 큰 배치를 위한 sliced VAE 디코드 + +제한된 VRAM에서 대규모 이미지 배치를 디코딩하거나 32개 이상의 이미지가 포함된 배치를 활성화하기 위해, 배치의 latent 이미지를 한 번에 하나씩 디코딩하는 슬라이스 VAE 디코드를 사용할 수 있습니다. + +이를 [`~StableDiffusionPipeline.enable_attention_slicing`] 또는 [`~StableDiffusionPipeline.enable_xformers_memory_efficient_attention`]과 결합하여 메모리 사용을 추가로 최소화할 수 있습니다. + +VAE 디코드를 한 번에 하나씩 수행하려면 추론 전에 파이프라인에서 [`~StableDiffusionPipeline.enable_vae_slicing`]을 호출합니다. 
예를 들어: + +```Python +import torch +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + + torch_dtype=torch.float16, +) +pipe = pipe.to("cuda") + +prompt = "a photo of an astronaut riding a horse on mars" +pipe.enable_vae_slicing() +images = pipe([prompt] * 32).images +``` + +다중 이미지 배치에서 VAE 디코드가 약간의 성능 향상이 이루어집니다. 단일 이미지 배치에서는 성능 영향은 없습니다. + + + +## 메모리 절약을 위해 가속 기능을 사용하여 CPU로 오프로딩 + +추가 메모리 절약을 위해 가중치를 CPU로 오프로드하고 순방향 전달을 수행할 때만 GPU로 로드할 수 있습니다. + +CPU 오프로딩을 수행하려면 [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]를 호출하기만 하면 됩니다: + +```Python +import torch +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + + torch_dtype=torch.float16, +) + +prompt = "a photo of an astronaut riding a horse on mars" +pipe.enable_sequential_cpu_offload() +image = pipe(prompt).images[0] +``` + +그러면 메모리 소비를 3GB 미만으로 줄일 수 있습니다. + +참고로 이 방법은 전체 모델이 아닌 서브모듈 수준에서 작동합니다. 이는 메모리 소비를 최소화하는 가장 좋은 방법이지만 프로세스의 반복적 특성으로 인해 추론 속도가 훨씬 느립니다. 파이프라인의 UNet 구성 요소는 여러 번 실행됩니다('num_inference_steps' 만큼). 매번 UNet의 서로 다른 서브모듈이 순차적으로 온로드된 다음 필요에 따라 오프로드되므로 메모리 이동 횟수가 많습니다. + + +또 다른 최적화 방법인 모델 오프로딩을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다. + + +또한 ttention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다. + + +```Python +import torch +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + + torch_dtype=torch.float16, +) + +prompt = "a photo of an astronaut riding a horse on mars" +pipe.enable_sequential_cpu_offload() +pipe.enable_attention_slicing(1) + +image = pipe(prompt).images[0] +``` + +**참고**: 'enable_sequential_cpu_offload()'를 사용할 때, 미리 파이프라인을 CUDA로 이동하지 **않는** 것이 중요합니다.그렇지 않으면 메모리 소비의 이득이 최소화됩니다. 더 많은 정보를 위해 [이 이슈](https://github.com/huggingface/diffusers/issues/1934)를 보세요. + + +## 빠른 추론과 메모리 메모리 절약을 위한 모델 오프로딩 + +[순차적 CPU 오프로딩](#sequential_offloading)은 이전 섹션에서 설명한 것처럼 많은 메모리를 보존하지만 필요에 따라 서브모듈을 GPU로 이동하고 새 모듈이 실행될 때 즉시 CPU로 반환되기 때문에 추론 속도가 느려집니다. + +전체 모델 오프로딩은 각 모델의 구성 요소인 _modules_을 처리하는 대신, 전체 모델을 GPU로 이동하는 대안입니다. 이로 인해 추론 시간에 미치는 영향은 미미하지만(파이프라인을 'cuda'로 이동하는 것과 비교하여) 여전히 약간의 메모리를 절약할 수 있습니다. + +이 시나리오에서는 파이프라인의 주요 구성 요소 중 하나만(일반적으로 텍스트 인코더, unet 및 vae) GPU에 있고, 나머지는 CPU에서 대기할 것입니다. +여러 반복을 위해 실행되는 UNet과 같은 구성 요소는 더 이상 필요하지 않을 때까지 GPU에 남아 있습니다. + +이 기능은 아래와 같이 파이프라인에서 `enable_model_cpu_offload()`를 호출하여 활성화할 수 있습니다. + +```Python +import torch +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, +) + +prompt = "a photo of an astronaut riding a horse on mars" +pipe.enable_model_cpu_offload() +image = pipe(prompt).images[0] +``` + +이는 추가적인 메모리 절약을 위한 attention slicing과도 호환됩니다. + +```Python +import torch +from diffusers import StableDiffusionPipeline + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, +) + +prompt = "a photo of an astronaut riding a horse on mars" +pipe.enable_model_cpu_offload() +pipe.enable_attention_slicing(1) + +image = pipe(prompt).images[0] +``` + + +이 기능을 사용하려면 'accelerate' 버전 0.17.0 이상이 필요합니다. + + +## Channels Last 메모리 형식 사용하기 + +Channels Last 메모리 형식은 차원 순서를 보존하는 메모리에서 NCHW 텐서 배열을 대체하는 방법입니다. +Channels Last 텐서는 채널이 가장 조밀한 차원이 되는 방식으로 정렬됩니다(일명 픽셀당 이미지를 저장). +현재 모든 연산자 Channels Last 형식을 지원하는 것은 아니라 성능이 저하될 수 있으므로, 사용해보고 모델에 잘 작동하는지 확인하는 것이 좋습니다. 
+ + +예를 들어 파이프라인의 UNet 모델이 channels Last 형식을 사용하도록 설정하려면 다음을 사용할 수 있습니다: + +```python +print(pipe.unet.conv_out.state_dict()["weight"].stride()) # (2880, 9, 3, 1) +pipe.unet.to(memory_format=torch.channels_last) # in-place 연산 +# 2번째 차원에서 스트라이드 1을 가지는 (2880, 1, 960, 320)로, 연산이 작동함을 증명합니다. +print(pipe.unet.conv_out.state_dict()["weight"].stride()) +``` + +## 추적(tracing) + +추적은 모델을 통해 예제 입력 텐서를 통해 실행되는데, 해당 입력이 모델의 레이어를 통과할 때 호출되는 작업을 캡처하여 실행 파일 또는 'ScriptFunction'이 반환되도록 하고, 이는 just-in-time 컴파일로 최적화됩니다. + +UNet 모델을 추적하기 위해 다음을 사용할 수 있습니다: + +```python +import time +import torch +from diffusers import StableDiffusionPipeline +import functools + +# torch 기울기 비활성화 +torch.set_grad_enabled(False) + +# 변수 설정 +n_experiments = 2 +unet_runs_per_experiment = 50 + + +# 입력 불러오기 +def generate_inputs(): + sample = torch.randn(2, 4, 64, 64).half().cuda() + timestep = torch.rand(1).half().cuda() * 999 + encoder_hidden_states = torch.randn(2, 77, 768).half().cuda() + return sample, timestep, encoder_hidden_states + + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, +).to("cuda") +unet = pipe.unet +unet.eval() +unet.to(memory_format=torch.channels_last) # Channels Last 메모리 형식 사용 +unet.forward = functools.partial(unet.forward, return_dict=False) # return_dict=False을 기본값으로 설정 + +# 워밍업 +for _ in range(3): + with torch.inference_mode(): + inputs = generate_inputs() + orig_output = unet(*inputs) + +# 추적 +print("tracing..") +unet_traced = torch.jit.trace(unet, inputs) +unet_traced.eval() +print("done tracing") + + +# 워밍업 및 그래프 최적화 +for _ in range(5): + with torch.inference_mode(): + inputs = generate_inputs() + orig_output = unet_traced(*inputs) + + +# 벤치마킹 +with torch.inference_mode(): + for _ in range(n_experiments): + torch.cuda.synchronize() + start_time = time.time() + for _ in range(unet_runs_per_experiment): + orig_output = unet_traced(*inputs) + torch.cuda.synchronize() + print(f"unet traced inference took {time.time() - start_time:.2f} seconds") + for _ in range(n_experiments): + torch.cuda.synchronize() + start_time = time.time() + for _ in range(unet_runs_per_experiment): + orig_output = unet(*inputs) + torch.cuda.synchronize() + print(f"unet inference took {time.time() - start_time:.2f} seconds") + +# 모델 저장 +unet_traced.save("unet_traced.pt") +``` + +그 다음, 파이프라인의 `unet` 특성을 다음과 같이 추적된 모델로 바꿀 수 있습니다. + +```python +from diffusers import StableDiffusionPipeline +import torch +from dataclasses import dataclass + + +@dataclass +class UNet2DConditionOutput: + sample: torch.FloatTensor + + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, +).to("cuda") + +# jitted unet 사용 +unet_traced = torch.jit.load("unet_traced.pt") + + +# pipe.unet 삭제 +class TracedUNet(torch.nn.Module): + def __init__(self): + super().__init__() + self.in_channels = pipe.unet.in_channels + self.device = pipe.unet.device + + def forward(self, latent_model_input, t, encoder_hidden_states): + sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0] + return UNet2DConditionOutput(sample=sample) + + +pipe.unet = TracedUNet() + +with torch.inference_mode(): + image = pipe([prompt] * 1, num_inference_steps=50).images[0] +``` + + +## Memory-efficient attention + +어텐션 블록의 대역폭을 최적화하는 최근 작업으로 GPU 메모리 사용량이 크게 향상되고 향상되었습니다. +@tridao의 가장 최근의 플래시 어텐션: [code](https://github.com/HazyResearch/flash-attention), [paper](https://arxiv.org/pdf/2205.14135.pdf). 
+ +배치 크기 1(프롬프트 1개)의 512x512 크기로 추론을 실행할 때 몇 가지 Nvidia GPU에서 얻은 속도 향상은 다음과 같습니다: + +| GPU | 기준 어텐션 FP16 | 메모리 효율적인 어텐션 FP16 | +|------------------ |--------------------- |--------------------------------- | +| NVIDIA Tesla T4 | 3.5it/s | 5.5it/s | +| NVIDIA 3060 RTX | 4.6it/s | 7.8it/s | +| NVIDIA A10G | 8.88it/s | 15.6it/s | +| NVIDIA RTX A6000 | 11.7it/s | 21.09it/s | +| NVIDIA TITAN RTX | 12.51it/s | 18.22it/s | +| A100-SXM4-40GB | 18.6it/s | 29.it/s | +| A100-SXM-80GB | 18.7it/s | 29.5it/s | + +이를 활용하려면 다음을 만족해야 합니다: + - PyTorch > 1.12 + - Cuda 사용 가능 + - [xformers 라이브러리를 설치함](xformers) +```python +from diffusers import StableDiffusionPipeline +import torch + +pipe = StableDiffusionPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + torch_dtype=torch.float16, +).to("cuda") + +pipe.enable_xformers_memory_efficient_attention() + +with torch.inference_mode(): + sample = pipe("a small cat") + +# 선택: 이를 비활성화 하기 위해 다음을 사용할 수 있습니다. +# pipe.disable_xformers_memory_efficient_attention() +``` diff --git a/docs/source/ko/optimization/habana.mdx b/docs/source/ko/optimization/habana.mdx new file mode 100644 index 000000000000..0f076245fb1c --- /dev/null +++ b/docs/source/ko/optimization/habana.mdx @@ -0,0 +1,71 @@ + + +# Habana Gaudi에서 Stable Diffusion을 사용하는 방법 + +🤗 Diffusers는 🤗 [Optimum Habana](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion)를 통해서 Habana Gaudi와 호환됩니다. + +## 요구 사항 + +- Optimum Habana 1.4 또는 이후, [여기](https://huggingface.co/docs/optimum/habana/installation)에 설치하는 방법이 있습니다. +- SynapseAI 1.8. + + +## 추론 파이프라인 + +Gaudi에서 Stable Diffusion 1 및 2로 이미지를 생성하려면 두 인스턴스를 인스턴스화해야 합니다: +- [`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline)이 포함된 파이프라인. 이 파이프라인은 *텍스트-이미지 생성*을 지원합니다. +- [`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler)이 포함된 스케줄러. 이 스케줄러는 Habana Gaudi에 최적화되어 있습니다. + +파이프라인을 초기화할 때, HPU에 배포하기 위해 `use_habana=True`를 지정해야 합니다. +또한 가능한 가장 빠른 생성을 위해 `use_hpu_graphs=True`로 **HPU 그래프**를 활성화해야 합니다. +마지막으로, [Hugging Face Hub](https://huggingface.co/Habana)에서 다운로드할 수 있는 [Gaudi configuration](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config)을 지정해야 합니다. + +```python +from optimum.habana import GaudiConfig +from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline + +model_name = "stabilityai/stable-diffusion-2-base" +scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") +pipeline = GaudiStableDiffusionPipeline.from_pretrained( + model_name, + scheduler=scheduler, + use_habana=True, + use_hpu_graphs=True, + gaudi_config="Habana/stable-diffusion", +) +``` + +파이프라인을 호출하여 하나 이상의 프롬프트에서 배치별로 이미지를 생성할 수 있습니다. + +```python +outputs = pipeline( + prompt=[ + "High quality photo of an astronaut riding a horse in space", + "Face of a yellow cat, high resolution, sitting on a park bench", + ], + num_images_per_prompt=10, + batch_size=4, +) +``` + +더 많은 정보를 얻기 위해, Optimum Habana의 [문서](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion)와 공식 Github 저장소에 제공된 [예시](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)를 확인하세요. 
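As a small, assumed follow-up to the Gaudi pipeline call shown above, the generated images can be written to disk. This presumes the returned `outputs` object exposes an `images` list of PIL images, as the standard Stable Diffusion pipelines do.

```python
# Save each image returned by the pipeline call above
for i, image in enumerate(outputs.images):
    image.save(f"generated_{i}.png")
```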
+ + +## 벤치마크 + +다음은 [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) Gaudi 구성(혼합 정밀도 bf16/fp32)을 사용하는 Habana first-generation Gaudi 및 Gaudi2의 지연 시간입니다: + +| | Latency (배치 크기 = 1) | Throughput (배치 크기 = 8) | +| ---------------------- |:------------------------:|:---------------------------:| +| first-generation Gaudi | 4.29s | 0.283 images/s | +| Gaudi2 | 1.54s | 0.904 images/s | diff --git a/docs/source/ko/optimization/mps.mdx b/docs/source/ko/optimization/mps.mdx new file mode 100644 index 000000000000..cd04d6d1103d --- /dev/null +++ b/docs/source/ko/optimization/mps.mdx @@ -0,0 +1,71 @@ + + +# Apple Silicon (M1/M2)에서 Stable Diffusion을 사용하는 방법 + +Diffusers는 Stable Diffusion 추론을 위해 PyTorch `mps`를 사용해 Apple 실리콘과 호환됩니다. 다음은 Stable Diffusion이 있는 M1 또는 M2 컴퓨터를 사용하기 위해 따라야 하는 단계입니다. + +## 요구 사항 + +- Apple silicon (M1/M2) 하드웨어의 Mac 컴퓨터. +- macOS 12.6 또는 이후 (13.0 또는 이후 추천). +- Python arm64 버전 +- PyTorch 2.0(추천) 또는 1.13(`mps`를 지원하는 최소 버전). Yhttps://pytorch.org/get-started/locally/의 지침에 따라 `pip` 또는 `conda`로 설치할 수 있습니다. + + +## 추론 파이프라인 + +아래 코도는 익숙한 `to()` 인터페이스를 사용하여 `mps` 백엔드로 Stable Diffusion 파이프라인을 M1 또는 M2 장치로 이동하는 방법을 보여줍니다. + + + + +**PyTorch 1.13을 사용 중일 때 ** 추가 일회성 전달을 사용하여 파이프라인을 "프라이밍"하는 것을 추천합니다. 이것은 발견한 이상한 문제에 대한 임시 해결 방법입니다. 첫 번째 추론 전달은 후속 전달와 약간 다른 결과를 생성합니다. 이 전달은 한 번만 수행하면 되며 추론 단계를 한 번만 사용하고 결과를 폐기해도 됩니다. + + + +이전 팁에서 설명한 것들을 포함한 여러 문제를 해결하므로 PyTorch 2 이상을 사용하는 것이 좋습니다. + + +```python +# `huggingface-cli login`에 로그인되어 있음을 확인 +from diffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") +pipe = pipe.to("mps") + +# 컴퓨터가 64GB 이하의 RAM 램일 때 추천 +pipe.enable_attention_slicing() + +prompt = "a photo of an astronaut riding a horse on mars" + +# 처음 "워밍업" 전달 (위 설명을 보세요) +_ = pipe(prompt, num_inference_steps=1) + +# 결과는 워밍업 전달 후의 CPU 장치의 결과와 일치합니다. +image = pipe(prompt).images[0] +``` + +## 성능 추천 + +M1/M2 성능은 메모리 압력에 매우 민감합니다. 시스템은 필요한 경우 자동으로 스왑되지만 스왑할 때 성능이 크게 저하됩니다. + + +특히 컴퓨터의 시스템 RAM이 64GB 미만이거나 512 × 512픽셀보다 큰 비표준 해상도에서 이미지를 생성하는 경우, 추론 중에 메모리 압력을 줄이고 스와핑을 방지하기 위해 *어텐션 슬라이싱*을 사용하는 것이 좋습니다. 어텐션 슬라이싱은 비용이 많이 드는 어텐션 작업을 한 번에 모두 수행하는 대신 여러 단계로 수행합니다. 일반적으로 범용 메모리가 없는 컴퓨터에서 ~20%의 성능 영향을 미치지만 64GB 이상이 아닌 경우 대부분의 Apple Silicon 컴퓨터에서 *더 나은 성능*이 관찰되었습니다. + +```python +pipeline.enable_attention_slicing() +``` + +## Known Issues + +- 여러 프롬프트를 배치로 생성하는 것은 [충돌이 발생하거나 안정적으로 작동하지 않습니다](https://github.com/huggingface/diffusers/issues/363). 우리는 이것이 [PyTorch의 `mps` 백엔드](https://github.com/pytorch/pytorch/issues/84039)와 관련이 있다고 생각합니다. 이 문제는 해결되고 있지만 지금은 배치 대신 반복 방법을 사용하는 것이 좋습니다. \ No newline at end of file diff --git a/docs/source/ko/optimization/onnx.mdx b/docs/source/ko/optimization/onnx.mdx new file mode 100644 index 000000000000..d52110b8c1fb --- /dev/null +++ b/docs/source/ko/optimization/onnx.mdx @@ -0,0 +1,65 @@ + + + +# 추론을 위해 ONNX 런타임을 사용하는 방법 + +🤗 Diffusers는 ONNX Runtime과 호환되는 Stable Diffusion 파이프라인을 제공합니다. 이를 통해 ONNX(CPU 포함)를 지원하고 PyTorch의 가속 버전을 사용할 수 없는 모든 하드웨어에서 Stable Diffusion을 실행할 수 있습니다. + +## 설치 + +다음 명령어로 ONNX Runtime를 지원하는 🤗 Optimum를 설치합니다: + +``` +pip install optimum["onnxruntime"] +``` + +## Stable Diffusion 추론 + +아래 코드는 ONNX 런타임을 사용하는 방법을 보여줍니다. `StableDiffusionPipeline` 대신 `OnnxStableDiffusionPipeline`을 사용해야 합니다. +PyTorch 모델을 불러오고 즉시 ONNX 형식으로 변환하려는 경우 `export=True`로 설정합니다. 
+ +```python +from optimum.onnxruntime import ORTStableDiffusionPipeline + +model_id = "runwayml/stable-diffusion-v1-5" +pipe = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True) +prompt = "a photo of an astronaut riding a horse on mars" +images = pipe(prompt).images[0] +pipe.save_pretrained("./onnx-stable-diffusion-v1-5") +``` + +파이프라인을 ONNX 형식으로 오프라인으로 내보내고 나중에 추론에 사용하려는 경우, +[`optimum-cli export`](https://huggingface.co/docs/optimum/main/en/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) 명령어를 사용할 수 있습니다: + +```bash +optimum-cli export onnx --model runwayml/stable-diffusion-v1-5 sd_v15_onnx/ +``` + +그 다음 추론을 수행합니다: + +```python +from optimum.onnxruntime import ORTStableDiffusionPipeline + +model_id = "sd_v15_onnx" +pipe = ORTStableDiffusionPipeline.from_pretrained(model_id) +prompt = "a photo of an astronaut riding a horse on mars" +images = pipe(prompt).images[0] +``` + +Notice that we didn't have to specify `export=True` above. + +[Optimum 문서](https://huggingface.co/docs/optimum/)에서 더 많은 예시를 찾을 수 있습니다. + +## 알려진 이슈들 + +- 여러 프롬프트를 배치로 생성하면 너무 많은 메모리가 사용되는 것 같습니다. 이를 조사하는 동안, 배치 대신 반복 방법이 필요할 수도 있습니다. diff --git a/docs/source/ko/optimization/open_vino.mdx b/docs/source/ko/optimization/open_vino.mdx new file mode 100644 index 000000000000..cb279909f618 --- /dev/null +++ b/docs/source/ko/optimization/open_vino.mdx @@ -0,0 +1,39 @@ + + +# 추론을 위한 OpenVINO 사용 방법 + +🤗 [Optimum](https://github.com/huggingface/optimum-intel)은 OpenVINO와 호환되는 Stable Diffusion 파이프라인을 제공합니다. +이제 다양한 Intel 프로세서에서 OpenVINO Runtime으로 쉽게 추론을 수행할 수 있습니다. ([여기](https://docs.openvino.ai/latest/openvino_docs_OV_UG_supported_plugins_Supported_Devices.html)서 지원되는 전 기기 목록을 확인하세요). + +## 설치 + +다음 명령어로 🤗 Optimum을 설치합니다: + +``` +pip install optimum["openvino"] +``` + +## Stable Diffusion 추론 + +OpenVINO 모델을 불러오고 OpenVINO 런타임으로 추론을 실행하려면 `StableDiffusionPipeline`을 `OVStableDiffusionPipeline`으로 교체해야 합니다. PyTorch 모델을 불러오고 즉시 OpenVINO 형식으로 변환하려는 경우 `export=True`로 설정합니다. + +```python +from optimum.intel.openvino import OVStableDiffusionPipeline + +model_id = "runwayml/stable-diffusion-v1-5" +pipe = OVStableDiffusionPipeline.from_pretrained(model_id, export=True) +prompt = "a photo of an astronaut riding a horse on mars" +images = pipe(prompt).images[0] +``` + +[Optimum 문서](https://huggingface.co/docs/optimum/intel/inference#export-and-inference-of-stable-diffusion-models)에서 (정적 reshaping과 모델 컴파일 등의) 더 많은 예시들을 찾을 수 있습니다. diff --git a/docs/source/ko/optimization/xformers.mdx b/docs/source/ko/optimization/xformers.mdx new file mode 100644 index 000000000000..a8b9408fbe50 --- /dev/null +++ b/docs/source/ko/optimization/xformers.mdx @@ -0,0 +1,36 @@ + + +# xFormers 설치하기 + +추론과 학습 모두에 [xFormers](https://github.com/facebookresearch/xformers)를 사용하는 것이 좋습니다. +자체 테스트로 어텐션 블록에서 수행된 최적화가 더 빠른 속도와 적은 메모리 소비를 확인했습니다. + +2023년 1월에 출시된 xFormers 버전 '0.0.16'부터 사전 빌드된 pip wheel을 사용하여 쉽게 설치할 수 있습니다: + +```bash +pip install xformers +``` + + + +xFormers PIP 패키지에는 최신 버전의 PyTorch(xFormers 0.0.16에 1.13.1)가 필요합니다. 이전 버전의 PyTorch를 사용해야 하는 경우 [프로젝트 지침](https://github.com/facebookresearch/xformers#installing-xformers)의 소스를 사용해 xFormers를 설치하는 것이 좋습니다. + + + +xFormers를 설치하면, [여기](fp16#memory-efficient-attention)서 설명한 것처럼 'enable_xformers_memory_efficient_attention()'을 사용하여 추론 속도를 높이고 메모리 소비를 줄일 수 있습니다. + + + +[이 이슈](https://github.com/huggingface/diffusers/issues/2234#issuecomment-1416931212)에 따르면 xFormers `v0.0.16`에서 GPU를 사용한 학습(파인 튜닝 또는 Dreambooth)을 할 수 없습니다. 해당 문제가 발견되면. 해당 코멘트를 참고해 development 버전을 설치하세요. 
+ + diff --git a/docs/source/ko/training/dreambooth.mdx b/docs/source/ko/training/dreambooth.mdx new file mode 100644 index 000000000000..cc282d9d24f8 --- /dev/null +++ b/docs/source/ko/training/dreambooth.mdx @@ -0,0 +1,475 @@ + + +# DreamBooth + +[DreamBooth](https://arxiv.org/abs/2208.12242)는 한 주제에 대한 적은 이미지(3~5개)만으로도 stable diffusion과 같이 text-to-image 모델을 개인화할 수 있는 방법입니다. 이를 통해 모델은 다양한 장면, 포즈 및 장면(뷰)에서 피사체에 대해 맥락화(contextualized)된 이미지를 생성할 수 있습니다. + +![프로젝트 블로그에서의 DreamBooth 예시](https://dreambooth.github.io/DreamBooth_files/teaser_static.jpg) +project's blog. +프로젝트 블로그에서의 Dreambooth 예시 + + +이 가이드는 다양한 GPU, Flax 사양에 대해 [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) 모델로 DreamBooth를 파인튜닝하는 방법을 보여줍니다. 더 깊이 파고들어 작동 방식을 확인하는 데 관심이 있는 경우, 이 가이드에 사용된 DreamBooth의 모든 학습 스크립트를 [여기](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth)에서 찾을 수 있습니다. + +스크립트를 실행하기 전에 라이브러리의 학습에 필요한 dependencies를 설치해야 합니다. 또한 `main` GitHub 브랜치에서 🧨 Diffusers를 설치하는 것이 좋습니다. + +```bash +pip install git+https://github.com/huggingface/diffusers +pip install -U -r diffusers/examples/dreambooth/requirements.txt +``` + +xFormers는 학습에 필요한 요구 사항은 아니지만, 가능하면 [설치](../optimization/xformers)하는 것이 좋습니다. 학습 속도를 높이고 메모리 사용량을 줄일 수 있기 때문입니다. + +모든 dependencies을 설정한 후 다음을 사용하여 [🤗 Accelerate](https://github.com/huggingface/accelerate/) 환경을 다음과 같이 초기화합니다: + +```bash +accelerate config +``` + +별도 설정 없이 기본 🤗 Accelerate 환경을 설치하려면 다음을 실행합니다: + +```bash +accelerate config default +``` + +또는 현재 환경이 노트북과 같은 대화형 셸을 지원하지 않는 경우 다음을 사용할 수 있습니다: + +```py +from accelerate.utils import write_basic_config + +write_basic_config() +``` + +## 파인튜닝 + + + +DreamBooth 파인튜닝은 하이퍼파라미터에 매우 민감하고 과적합되기 쉽습니다. 적절한 하이퍼파라미터를 선택하는 데 도움이 되도록 다양한 권장 설정이 포함된 [심층 분석](https://huggingface.co/blog/dreambooth)을 살펴보는 것이 좋습니다. + + + + + +[몇 장의 강아지 이미지들](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ)로 DreamBooth를 시도해봅시다. +이를 다운로드해 디렉터리에 저장한 다음 `INSTANCE_DIR` 환경 변수를 해당 경로로 설정합니다: + + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export INSTANCE_DIR="path_to_training_images" +export OUTPUT_DIR="path_to_saved_model" +``` + +그런 다음, 다음 명령을 사용하여 학습 스크립트를 실행할 수 있습니다 (전체 학습 스크립트는 [여기](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py)에서 찾을 수 있습니다): + +```bash +accelerate launch train_dreambooth.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --instance_prompt="a photo of sks dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=1 \ + --learning_rate=5e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=400 +``` + + + +TPU에 액세스할 수 있거나 더 빠르게 훈련하고 싶다면 [Flax 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_flax.py)를 사용해 볼 수 있습니다. Flax 학습 스크립트는 gradient checkpointing 또는 gradient accumulation을 지원하지 않으므로, 메모리가 30GB 이상인 GPU가 필요합니다. + +스크립트를 실행하기 전에 요구 사항이 설치되어 있는지 확인하십시오. 
+ +```bash +pip install -U -r requirements.txt +``` + +그러면 다음 명령어로 학습 스크립트를 실행시킬 수 있습니다: + +```bash +export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" +export INSTANCE_DIR="path-to-instance-images" +export OUTPUT_DIR="path-to-save-model" + +python train_dreambooth_flax.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --instance_prompt="a photo of sks dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --learning_rate=5e-6 \ + --max_train_steps=400 +``` + + + +### Prior-preserving(사전 보존) loss를 사용한 파인튜닝 + +과적합과 language drift를 방지하기 위해 사전 보존이 사용됩니다(관심이 있는 경우 [논문](https://arxiv.org/abs/2208.12242)을 참조하세요). 사전 보존을 위해 동일한 클래스의 다른 이미지를 학습 프로세스의 일부로 사용합니다. 좋은 점은 Stable Diffusion 모델 자체를 사용하여 이러한 이미지를 생성할 수 있다는 것입니다! 학습 스크립트는 생성된 이미지를 우리가 지정한 로컬 경로에 저장합니다. + +저자들에 따르면 사전 보존을 위해 `num_epochs * num_samples`개의 이미지를 생성하는 것이 좋습니다. 200-300개에서 대부분 잘 작동합니다. + + + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export INSTANCE_DIR="path_to_training_images" +export CLASS_DIR="path_to_class_images" +export OUTPUT_DIR="path_to_saved_model" + +accelerate launch train_dreambooth.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --class_data_dir=$CLASS_DIR \ + --output_dir=$OUTPUT_DIR \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=1 \ + --learning_rate=5e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --num_class_images=200 \ + --max_train_steps=800 +``` + + +```bash +export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" +export INSTANCE_DIR="path-to-instance-images" +export CLASS_DIR="path-to-class-images" +export OUTPUT_DIR="path-to-save-model" + +python train_dreambooth_flax.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --class_data_dir=$CLASS_DIR \ + --output_dir=$OUTPUT_DIR \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --learning_rate=5e-6 \ + --num_class_images=200 \ + --max_train_steps=800 +``` + + + +## 텍스트 인코더와 and UNet로 파인튜닝하기 + +해당 스크립트를 사용하면 `unet`과 함께 `text_encoder`를 파인튜닝할 수 있습니다. 실험에서(자세한 내용은 [🧨 Diffusers를 사용해 DreamBooth로 Stable Diffusion 학습하기](https://huggingface.co/blog/dreambooth) 게시물을 확인하세요), 특히 얼굴 이미지를 생성할 때 훨씬 더 나은 결과를 얻을 수 있습니다. + + + +텍스트 인코더를 학습시키려면 추가 메모리가 필요해 16GB GPU로는 동작하지 않습니다. 이 옵션을 사용하려면 최소 24GB VRAM이 필요합니다. 
+ + + +`--train_text_encoder` 인수를 학습 스크립트에 전달하여 `text_encoder` 및 `unet`을 파인튜닝할 수 있습니다: + + + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export INSTANCE_DIR="path_to_training_images" +export CLASS_DIR="path_to_class_images" +export OUTPUT_DIR="path_to_saved_model" + +accelerate launch train_dreambooth.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --train_text_encoder \ + --instance_data_dir=$INSTANCE_DIR \ + --class_data_dir=$CLASS_DIR \ + --output_dir=$OUTPUT_DIR \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --use_8bit_adam + --gradient_checkpointing \ + --learning_rate=2e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --num_class_images=200 \ + --max_train_steps=800 +``` + + +```bash +export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" +export INSTANCE_DIR="path-to-instance-images" +export CLASS_DIR="path-to-class-images" +export OUTPUT_DIR="path-to-save-model" + +python train_dreambooth_flax.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --train_text_encoder \ + --instance_data_dir=$INSTANCE_DIR \ + --class_data_dir=$CLASS_DIR \ + --output_dir=$OUTPUT_DIR \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --learning_rate=2e-6 \ + --num_class_images=200 \ + --max_train_steps=800 +``` + + + +## LoRA로 파인튜닝하기 + +DreamBooth에서 대규모 모델의 학습을 가속화하기 위한 파인튜닝 기술인 LoRA(Low-Rank Adaptation of Large Language Models)를 사용할 수 있습니다. 자세한 내용은 [LoRA 학습](training/lora#dreambooth) 가이드를 참조하세요. + +### 학습 중 체크포인트 저장하기 + +Dreambooth로 훈련하는 동안 과적합하기 쉬우므로, 때때로 학습 중에 정기적인 체크포인트를 저장하는 것이 유용합니다. 중간 체크포인트 중 하나가 최종 모델보다 더 잘 작동할 수 있습니다! 체크포인트 저장 기능을 활성화하려면 학습 스크립트에 다음 인수를 전달해야 합니다: + +```bash + --checkpointing_steps=500 +``` + +이렇게 하면 `output_dir`의 하위 폴더에 전체 학습 상태가 저장됩니다. 하위 폴더 이름은 접두사 `checkpoint-`로 시작하고 지금까지 수행된 step 수입니다. 예시로 `checkpoint-1500`은 1500 학습 step 후에 저장된 체크포인트입니다. + +#### 저장된 체크포인트에서 훈련 재개하기 + +저장된 체크포인트에서 훈련을 재개하려면, `--resume_from_checkpoint` 인수를 전달한 다음 사용할 체크포인트의 이름을 지정하면 됩니다. 특수 문자열 `"latest"`를 사용하여 저장된 마지막 체크포인트(즉, step 수가 가장 많은 체크포인트)에서 재개할 수도 있습니다. 예를 들어 다음은 1500 step 후에 저장된 체크포인트에서부터 학습을 재개합니다: + +```bash + --resume_from_checkpoint="checkpoint-1500" +``` + +원하는 경우 일부 하이퍼파라미터를 조정할 수 있습니다. + +#### 저장된 체크포인트를 사용하여 추론 수행하기 + +저장된 체크포인트는 훈련 재개에 적합한 형식으로 저장됩니다. 여기에는 모델 가중치뿐만 아니라 옵티마이저, 데이터 로더 및 학습률의 상태도 포함됩니다. + +**`"accelerate>=0.16.0"`**이 설치된 경우 다음 코드를 사용하여 중간 체크포인트에서 추론을 실행합니다. + +```python +from diffusers import DiffusionPipeline, UNet2DConditionModel +from transformers import CLIPTextModel +import torch + +# 학습에 사용된 것과 동일한 인수(model, revision)로 파이프라인을 로드합니다. +model_id = "CompVis/stable-diffusion-v1-4" + +unet = UNet2DConditionModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/checkpoint-100/unet") + +# `args.train_text_encoder`로 학습한 경우면 텍스트 인코더를 꼭 불러오세요 +text_encoder = CLIPTextModel.from_pretrained("/sddata/dreambooth/daruma-v2-1/checkpoint-100/text_encoder") + +pipeline = DiffusionPipeline.from_pretrained(model_id, unet=unet, text_encoder=text_encoder, dtype=torch.float16) +pipeline.to("cuda") + +# 추론을 수행하거나 저장하거나, 허브에 푸시합니다. 
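# 예시 (프롬프트는 임의로 가정한 값): 복원된 파이프라인으로 곧바로 이미지를 생성해 볼 수 있습니다.
image = pipeline("A photo of sks dog in a bucket", num_inference_steps=25).images[0]
image.save("sks-dog-from-checkpoint.png")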
+pipeline.save_pretrained("dreambooth-pipeline") +``` + +If you have **`"accelerate<0.16.0"`** installed, you need to convert it to an inference pipeline first: + +```python +from accelerate import Accelerator +from diffusers import DiffusionPipeline + +# 학습에 사용된 것과 동일한 인수(model, revision)로 파이프라인을 로드합니다. +model_id = "CompVis/stable-diffusion-v1-4" +pipeline = DiffusionPipeline.from_pretrained(model_id) + +accelerator = Accelerator() + +# 초기 학습에 `--train_text_encoder`가 사용된 경우 text_encoder를 사용합니다. +unet, text_encoder = accelerator.prepare(pipeline.unet, pipeline.text_encoder) + +# 체크포인트 경로로부터 상태를 복원합니다. 여기서는 절대 경로를 사용해야 합니다. +accelerator.load_state("/sddata/dreambooth/daruma-v2-1/checkpoint-100") + +# unwrapped 모델로 파이프라인을 다시 빌드합니다.(.unet and .text_encoder로의 할당도 작동해야 합니다) +pipeline = DiffusionPipeline.from_pretrained( + model_id, + unet=accelerator.unwrap_model(unet), + text_encoder=accelerator.unwrap_model(text_encoder), +) + +# 추론을 수행하거나 저장하거나, 허브에 푸시합니다. +pipeline.save_pretrained("dreambooth-pipeline") +``` + +## 각 GPU 용량에서의 최적화 + +하드웨어에 따라 16GB에서 8GB까지 GPU에서 DreamBooth를 최적화하는 몇 가지 방법이 있습니다! + +### xFormers + +[xFormers](https://github.com/facebookresearch/xformers)는 Transformers를 최적화하기 위한 toolbox이며, 🧨 Diffusers에서 사용되는[memory-efficient attention](https://facebookresearch.github.io/xformers/components/ops.html#module-xformers.ops) 메커니즘을 포함하고 있습니다. [xFormers를 설치](./optimization/xformers)한 다음 학습 스크립트에 다음 인수를 추가합니다: + +```bash + --enable_xformers_memory_efficient_attention +``` + +xFormers는 Flax에서 사용할 수 없습니다. + +### 그래디언트 없음으로 설정 + +메모리 사용량을 줄일 수 있는 또 다른 방법은 [기울기 설정](https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html)을 0 대신 `None`으로 하는 것입니다. 그러나 이로 인해 특정 동작이 변경될 수 있으므로 문제가 발생하면 이 인수를 제거해 보십시오. 학습 스크립트에 다음 인수를 추가하여 그래디언트를 `None`으로 설정합니다. + +```bash + --set_grads_to_none +``` + +### 16GB GPU + +Gradient checkpointing과 [bitsandbytes](https://github.com/TimDettmers/bitsandbytes)의 8비트 옵티마이저의 도움으로, 16GB GPU에서 dreambooth를 훈련할 수 있습니다. bitsandbytes가 설치되어 있는지 확인하세요: + +```bash +pip install bitsandbytes +``` + +그 다음, 학습 스크립트에 `--use_8bit_adam` 옵션을 명시합니다: + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export INSTANCE_DIR="path_to_training_images" +export CLASS_DIR="path_to_class_images" +export OUTPUT_DIR="path_to_saved_model" + +accelerate launch train_dreambooth.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --class_data_dir=$CLASS_DIR \ + --output_dir=$OUTPUT_DIR \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=2 --gradient_checkpointing \ + --use_8bit_adam \ + --learning_rate=5e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --num_class_images=200 \ + --max_train_steps=800 +``` + +### 12GB GPU + +12GB GPU에서 DreamBooth를 실행하려면 gradient checkpointing, 8비트 옵티마이저, xFormers를 활성화하고 그래디언트를 `None`으로 설정해야 합니다. 
+ +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export INSTANCE_DIR="path-to-instance-images" +export CLASS_DIR="path-to-class-images" +export OUTPUT_DIR="path-to-save-model" + +accelerate launch train_dreambooth.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --class_data_dir=$CLASS_DIR \ + --output_dir=$OUTPUT_DIR \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=1 --gradient_checkpointing \ + --use_8bit_adam \ + --enable_xformers_memory_efficient_attention \ + --set_grads_to_none \ + --learning_rate=2e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --num_class_images=200 \ + --max_train_steps=800 +``` + +### 8GB GPU에서 학습하기 + +8GB GPU에 대해서는 [DeepSpeed](https://www.deepspeed.ai/)를 사용해 일부 텐서를 VRAM에서 CPU 또는 NVME로 오프로드하여 더 적은 GPU 메모리로 학습할 수도 있습니다. + +🤗 Accelerate 환경을 구성하려면 다음 명령을 실행하세요: + +```bash +accelerate config +``` + +환경 구성 중에 DeepSpeed를 사용할 것을 확인하세요. +그러면 DeepSpeed stage 2, fp16 혼합 정밀도를 결합하고 모델 매개변수와 옵티마이저 상태를 모두 CPU로 오프로드하면 8GB VRAM 미만에서 학습할 수 있습니다. +단점은 더 많은 시스템 RAM(약 25GB)이 필요하다는 것입니다. 추가 구성 옵션은 [DeepSpeed 문서](https://huggingface.co/docs/accelerate/usage_guides/deepspeed)를 참조하세요. + +또한 기본 Adam 옵티마이저를 DeepSpeed의 최적화된 Adam 버전으로 변경해야 합니다. +이는 상당한 속도 향상을 위한 Adam인 [`deepspeed.ops.adam.DeepSpeedCPUAdam`](https://deepspeed.readthedocs.io/en/latest/optimizers.html#adam-cpu)입니다. +`DeepSpeedCPUAdam`을 활성화하려면 시스템의 CUDA toolchain 버전이 PyTorch와 함께 설치된 것과 동일해야 합니다. + +8비트 옵티마이저는 현재 DeepSpeed와 호환되지 않는 것 같습니다. + +다음 명령으로 학습을 시작합니다: + +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export INSTANCE_DIR="path_to_training_images" +export CLASS_DIR="path_to_class_images" +export OUTPUT_DIR="path_to_saved_model" + +accelerate launch train_dreambooth.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --class_data_dir=$CLASS_DIR \ + --output_dir=$OUTPUT_DIR \ + --with_prior_preservation --prior_loss_weight=1.0 \ + --instance_prompt="a photo of sks dog" \ + --class_prompt="a photo of dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --sample_batch_size=1 \ + --gradient_accumulation_steps=1 --gradient_checkpointing \ + --learning_rate=5e-6 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --num_class_images=200 \ + --max_train_steps=800 \ + --mixed_precision=fp16 +``` + +## 추론 + +모델을 학습한 후에는, 모델이 저장된 경로를 지정해 [`StableDiffusionPipeline`]로 추론을 수행할 수 있습니다. 프롬프트에 학습에 사용된 특수 `식별자`(이전 예시의 `sks`)가 포함되어 있는지 확인하세요. + +**`"accelerate>=0.16.0"`**이 설치되어 있는 경우 다음 코드를 사용하여 중간 체크포인트에서 추론을 실행할 수 있습니다: + +```python +from diffusers import StableDiffusionPipeline +import torch + +model_id = "path_to_saved_model" +pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") + +prompt = "A photo of sks dog in a bucket" +image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5).images[0] + +image.save("dog-bucket.png") +``` + +[저장된 학습 체크포인트](#inference-from-a-saved-checkpoint)에서도 추론을 실행할 수도 있습니다. \ No newline at end of file diff --git a/docs/source/ko/training/lora.mdx b/docs/source/ko/training/lora.mdx new file mode 100644 index 000000000000..9aebb0fa3109 --- /dev/null +++ b/docs/source/ko/training/lora.mdx @@ -0,0 +1,128 @@ + + +# Low-Rank Adaptation of Large Language Models (LoRA) + +[[open-in-colab]] + + + +현재 LoRA는 [`UNet2DConditionalModel`]의 어텐션 레이어에서만 지원됩니다. 
+ + + +[LoRA(Low-Rank Adaptation of Large Language Models)](https://arxiv.org/abs/2106.09685)는 메모리를 적게 사용하면서 대규모 모델의 학습을 가속화하는 학습 방법입니다. 이는 rank-decomposition weight 행렬 쌍(**업데이트 행렬**이라고 함)을 추가하고 새로 추가된 가중치**만** 학습합니다. 여기에는 몇 가지 장점이 있습니다. + +- 이전에 미리 학습된 가중치는 고정된 상태로 유지되므로 모델이 [치명적인 망각](https://www.pnas.org/doi/10.1073/pnas.1611835114) 경향이 없습니다. +- Rank-decomposition 행렬은 원래 모델보다 파라메터 수가 훨씬 적으므로 학습된 LoRA 가중치를 쉽게 끼워넣을 수 있습니다. +- LoRA 매트릭스는 일반적으로 원본 모델의 어텐션 레이어에 추가됩니다. 🧨 Diffusers는 [`~diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs`] 메서드를 제공하여 LoRA 가중치를 모델의 어텐션 레이어로 불러옵니다. `scale` 매개변수를 통해 모델이 새로운 학습 이미지에 맞게 조정되는 범위를 제어할 수 있습니다. +- 메모리 효율성이 향상되어 Tesla T4, RTX 3080 또는 RTX 2080 Ti와 같은 소비자용 GPU에서 파인튜닝을 실행할 수 있습니다! T4와 같은 GPU는 무료이며 Kaggle 또는 Google Colab 노트북에서 쉽게 액세스할 수 있습니다. + + + + +💡 LoRA는 어텐션 레이어에만 한정되지는 않습니다. 저자는 언어 모델의 어텐션 레이어를 수정하는 것이 매우 효율적으로 죻은 성능을 얻기에 충분하다는 것을 발견했습니다. 이것이 LoRA 가중치를 모델의 어텐션 레이어에 추가하는 것이 일반적인 이유입니다. LoRA 작동 방식에 대한 자세한 내용은 [Using LoRA for effective Stable Diffusion fine-tuning](https://huggingface.co/blog/lora) 블로그를 확인하세요! + + + +[cloneofsimo](https://github.com/cloneofsimo)는 인기 있는 [lora](https://github.com/cloneofsimo/lora) GitHub 리포지토리에서 Stable Diffusion을 위한 LoRA 학습을 최초로 시도했습니다. 🧨 Diffusers는 [text-to-image 생성](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image#training-with-lora) 및 [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth#training-with-low-rank-adaptation-of-large-language-models-lora)을 지원합니다. 이 가이드는 두 가지를 모두 수행하는 방법을 보여줍니다. + +모델을 저장하거나 커뮤니티와 공유하려면 Hugging Face 계정에 로그인하세요(아직 계정이 없는 경우 [생성](hf.co/join)하세요): + +```bash +huggingface-cli login +``` + +## Text-to-image + +수십억 개의 파라메터들이 있는 Stable Diffusion과 같은 모델을 파인튜닝하는 것은 느리고 어려울 수 있습니다. LoRA를 사용하면 diffusion 모델을 파인튜닝하는 것이 훨씬 쉽고 빠릅니다. 8비트 옵티마이저와 같은 트릭에 의존하지 않고도 11GB의 GPU RAM으로 하드웨어에서 실행할 수 있습니다. + + +### 학습 [[text-to-image 학습]] + +[Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋으로 [`stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)를 파인튜닝해 나만의 포켓몬을 생성해 보겠습니다. + +시작하려면 `MODEL_NAME` 및 `DATASET_NAME` 환경 변수가 설정되어 있는지 확인하십시오. `OUTPUT_DIR` 및 `HUB_MODEL_ID` 변수는 선택 사항이며 허브에서 모델을 저장할 위치를 지정합니다. + +```bash +export MODEL_NAME="runwayml/stable-diffusion-v1-5" +export OUTPUT_DIR="/sddata/finetune/lora/pokemon" +export HUB_MODEL_ID="pokemon-lora" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" +``` + +학습을 시작하기 전에 알아야 할 몇 가지 플래그가 있습니다. + +* `--push_to_hub`를 명시하면 학습된 LoRA 임베딩을 허브에 저장합니다. +* `--report_to=wandb`는 학습 결과를 가중치 및 편향 대시보드에 보고하고 기록합니다(예를 들어, 이 [보고서](https://wandb.ai/pcuenq/text2image-fine-tune/run/b4k1w0tn?workspace=user-pcuenq)를 참조하세요). +* `--learning_rate=1e-04`, 일반적으로 LoRA에서 사용하는 것보다 더 높은 학습률을 사용할 수 있습니다. + +이제 학습을 시작할 준비가 되었습니다 (전체 학습 스크립트는 [여기](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)에서 찾을 수 있습니다). 
+ +```bash +accelerate launch train_dreambooth_lora.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --instance_prompt="a photo of sks dog" \ + --resolution=512 \ + --train_batch_size=1 \ + --gradient_accumulation_steps=1 \ + --checkpointing_steps=100 \ + --learning_rate=1e-4 \ + --report_to="wandb" \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --max_train_steps=500 \ + --validation_prompt="A photo of sks dog in a bucket" \ + --validation_epochs=50 \ + --seed="0" \ + --push_to_hub +``` + +### 추론 [[dreambooth 추론]] + +이제 [`StableDiffusionPipeline`]에서 기본 모델을 불러와 추론을 위해 모델을 사용할 수 있습니다: + +```py +>>> import torch +>>> from diffusers import StableDiffusionPipeline + +>>> model_base = "runwayml/stable-diffusion-v1-5" + +>>> pipe = StableDiffusionPipeline.from_pretrained(model_base, torch_dtype=torch.float16) +``` + +*기본 모델의 가중치 위에* 파인튜닝된 DreamBooth 모델에서 LoRA 가중치를 로드한 다음, 더 빠른 추론을 위해 파이프라인을 GPU로 이동합니다. LoRA 가중치를 프리징된 사전 훈련된 모델 가중치와 병합할 때, 선택적으로 'scale' 매개변수로 어느 정도의 가중치를 병합할 지 조절할 수 있습니다: + + + +💡 `0`의 `scale` 값은 LoRA 가중치를 사용하지 않아 원래 모델의 가중치만 사용한 것과 같고, `1`의 `scale` 값은 파인튜닝된 LoRA 가중치만 사용함을 의미합니다. 0과 1 사이의 값들은 두 결과들 사이로 보간됩니다. + + + +```py +>>> pipe.unet.load_attn_procs(model_path) +>>> pipe.to("cuda") +# LoRA 파인튜닝된 모델의 가중치 절반과 기본 모델의 가중치 절반 사용 + +>>> image = pipe( +... "A picture of a sks dog in a bucket.", +... num_inference_steps=25, +... guidance_scale=7.5, +... cross_attention_kwargs={"scale": 0.5}, +... ).images[0] +# 완전히 파인튜닝된 LoRA 모델의 가중치 사용 + +>>> image = pipe("A picture of a sks dog in a bucket.", num_inference_steps=25, guidance_scale=7.5).images[0] +>>> image.save("bucket-dog.png") +``` \ No newline at end of file diff --git a/docs/source/ko/training/text2image.mdx b/docs/source/ko/training/text2image.mdx new file mode 100644 index 000000000000..069388603124 --- /dev/null +++ b/docs/source/ko/training/text2image.mdx @@ -0,0 +1,224 @@ + + + +# Text-to-image + + + +text-to-image 파인튜닝 스크립트는 experimental 상태입니다. 과적합하기 쉽고 치명적인 망각과 같은 문제에 부딪히기 쉽습니다. 자체 데이터셋에서 최상의 결과를 얻으려면 다양한 하이퍼파라미터를 탐색하는 것이 좋습니다. + + + +Stable Diffusion과 같은 text-to-image 모델은 텍스트 프롬프트에서 이미지를 생성합니다. 이 가이드는 PyTorch 및 Flax를 사용하여 자체 데이터셋에서 [`CompVis/stable-diffusion-v1-4`](https://huggingface.co/CompVis/stable-diffusion-v1-4) 모델로 파인튜닝하는 방법을 보여줍니다. 이 가이드에 사용된 text-to-image 파인튜닝을 위한 모든 학습 스크립트에 관심이 있는 경우 이 [리포지토리](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image)에서 자세히 찾을 수 있습니다. + +스크립트를 실행하기 전에, 라이브러리의 학습 dependency들을 설치해야 합니다: + +```bash +pip install git+https://github.com/huggingface/diffusers.git +pip install -U -r requirements.txt +``` + +그리고 [🤗Accelerate](https://github.com/huggingface/accelerate/) 환경을 초기화합니다: + +```bash +accelerate config +``` + +리포지토리를 이미 복제한 경우, 이 단계를 수행할 필요가 없습니다. 대신, 로컬 체크아웃 경로를 학습 스크립트에 명시할 수 있으며 거기에서 로드됩니다. + +### 하드웨어 요구 사항 + +`gradient_checkpointing` 및 `mixed_precision`을 사용하면 단일 24GB GPU에서 모델을 파인튜닝할 수 있습니다. 더 높은 `batch_size`와 더 빠른 훈련을 위해서는 GPU 메모리가 30GB 이상인 GPU를 사용하는 것이 좋습니다. TPU 또는 GPU에서 파인튜닝을 위해 JAX나 Flax를 사용할 수도 있습니다. 자세한 내용은 [아래](#flax-jax-finetuning)를 참조하세요. + +xFormers로 memory efficient attention을 활성화하여 메모리 사용량 훨씬 더 줄일 수 있습니다. [xFormers가 설치](./optimization/xformers)되어 있는지 확인하고 `--enable_xformers_memory_efficient_attention`를 학습 스크립트에 명시합니다. + +xFormers는 Flax에 사용할 수 없습니다. + +## Hub에 모델 업로드하기 + +학습 스크립트에 다음 인수를 추가하여 모델을 허브에 저장합니다: + +```bash + --push_to_hub +``` + + +## 체크포인트 저장 및 불러오기 + +학습 중 발생할 수 있는 일에 대비하여 정기적으로 체크포인트를 저장해 두는 것이 좋습니다. 체크포인트를 저장하려면 학습 스크립트에 다음 인수를 명시합니다. 
+
+```bash
+  --checkpointing_steps=500
+```
+
+500스텝마다 전체 학습 state가 `output_dir`의 하위 폴더에 저장됩니다. 체크포인트 폴더의 이름은 `checkpoint-` 뒤에 지금까지 학습된 step 수를 붙인 형태입니다. 예를 들어 `checkpoint-1500`은 1500 학습 step 후에 저장된 체크포인트입니다.
+
+학습을 재개하기 위해 체크포인트를 불러오려면 `--resume_from_checkpoint` 인수를 학습 스크립트에 명시하고 재개할 체크포인트를 지정하십시오. 예를 들어 다음 인수는 1500개의 학습 step 후에 저장된 체크포인트에서부터 훈련을 재개합니다.
+
+```bash
+  --resume_from_checkpoint="checkpoint-1500"
+```
+
+## 파인튜닝
+
+
+
+다음과 같이 [Pokémon BLIP 캡션](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions) 데이터셋에서 파인튜닝하기 위해 [PyTorch 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py)를 실행합니다:
+
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export dataset_name="lambdalabs/pokemon-blip-captions"
+
+accelerate launch train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$dataset_name \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir="sd-pokemon-model"
+```
+
+자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다.
+
+사용자 커스텀 loading logic을 사용하려면 스크립트를 수정하십시오. 도움이 되도록 코드의 적절한 위치에 포인터를 남겼습니다. 🤗 아래 예제 스크립트는 `TRAIN_DIR`의 로컬 데이터셋으로 파인튜닝하는 방법과 `OUTPUT_DIR`에서 모델을 저장할 위치를 보여줍니다:
+
+
+```bash
+export MODEL_NAME="CompVis/stable-diffusion-v1-4"
+export TRAIN_DIR="path_to_your_dataset"
+export OUTPUT_DIR="path_to_save_model"
+
+accelerate launch train_text_to_image.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --use_ema \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --gradient_accumulation_steps=4 \
+  --gradient_checkpointing \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --lr_scheduler="constant" --lr_warmup_steps=0 \
+  --output_dir=${OUTPUT_DIR}
+```
+
+
+
+[@duongna21](https://github.com/duongna21)의 기여로, Flax를 사용해 TPU 및 GPU에서 Stable Diffusion 모델을 더 빠르게 학습할 수 있습니다. 이는 TPU 하드웨어에서 매우 효율적이지만 GPU에서도 훌륭하게 작동합니다. Flax 학습 스크립트는 gradient checkpointing이나 gradient accumulation과 같은 기능을 아직 지원하지 않으므로 메모리가 30GB 이상인 GPU 또는 TPU v3가 필요합니다.
+
+스크립트를 실행하기 전에 요구 사항이 설치되어 있는지 확인하십시오:
+
+```bash
+pip install -U -r requirements_flax.txt
+```
+
+그러면 다음과 같이 [Flax 학습 스크립트](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_flax.py)를 실행할 수 있습니다:
+
+```bash
+export MODEL_NAME="runwayml/stable-diffusion-v1-5"
+export dataset_name="lambdalabs/pokemon-blip-captions"
+
+python train_text_to_image_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --dataset_name=$dataset_name \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --output_dir="sd-pokemon-model"
+```
+
+자체 데이터셋으로 파인튜닝하려면 🤗 [Datasets](https://huggingface.co/docs/datasets/index)에서 요구하는 형식에 따라 데이터셋을 준비하세요. [데이터셋을 허브에 업로드](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub)하거나 [파일들이 있는 로컬 폴더를 준비](https://huggingface.co/docs/datasets/image_dataset#imagefolder)할 수 있습니다.
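+
+아래는 🤗 Datasets의 [ImageFolder](https://huggingface.co/docs/datasets/image_dataset#imagefolder) 형식이 어떻게 로드되는지 감을 잡기 위한 최소 예시 스케치입니다. 폴더 구조와 `metadata.jsonl`의 컬럼 이름(`file_name`, `text`)은 설명을 위한 가정이며, 학습 스크립트의 `--caption_column` 기본값인 `text`에 맞춘 것입니다:
+
+```python
+from datasets import load_dataset
+
+# TRAIN_DIR 안에 이미지 파일들과, 각 이미지의 캡션을 담은 metadata.jsonl이 있다고 가정합니다.
+# metadata.jsonl의 한 줄 예시: {"file_name": "0001.png", "text": "a green pokemon with red eyes"}
+dataset = load_dataset("imagefolder", data_dir="path_to_your_dataset", split="train")
+
+print(dataset[0]["image"])  # PIL.Image 객체
+print(dataset[0]["text"])   # 해당 이미지의 캡션 문자열
+```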
+
+사용자 커스텀 loading logic을 사용하려면 스크립트를 수정하십시오. 도움이 되도록 코드의 적절한 위치에 포인터를 남겼습니다. 🤗 아래 예제 스크립트는 `TRAIN_DIR`의 로컬 데이터셋으로 파인튜닝하는 방법을 보여줍니다:
+
+```bash
+export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
+export TRAIN_DIR="path_to_your_dataset"
+
+python train_text_to_image_flax.py \
+  --pretrained_model_name_or_path=$MODEL_NAME \
+  --train_data_dir=$TRAIN_DIR \
+  --resolution=512 --center_crop --random_flip \
+  --train_batch_size=1 \
+  --mixed_precision="fp16" \
+  --max_train_steps=15000 \
+  --learning_rate=1e-05 \
+  --max_grad_norm=1 \
+  --output_dir="sd-pokemon-model"
+```
+
+
+
+## LoRA
+
+Text-to-image 모델 파인튜닝을 위해, 대규모 모델 학습을 가속화하기 위한 파인튜닝 기술인 LoRA(Low-Rank Adaptation of Large Language Models)를 사용할 수 있습니다. 자세한 내용은 [LoRA 학습](lora#text-to-image) 가이드를 참조하세요.
+
+## 추론
+
+허브의 모델 경로 또는 모델 이름을 [`StableDiffusionPipeline`]에 전달하여 추론을 위해 파인튜닝된 모델을 불러올 수 있습니다:
+
+
+
+```python
+import torch
+from diffusers import StableDiffusionPipeline
+
+model_path = "path_to_saved_model"
+pipe = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float16)
+pipe.to("cuda")
+
+image = pipe(prompt="yoda").images[0]
+image.save("yoda-pokemon.png")
+```
+
+
+```python
+import jax
+import numpy as np
+from flax.jax_utils import replicate
+from flax.training.common_utils import shard
+from diffusers import FlaxStableDiffusionPipeline
+
+model_path = "path_to_saved_model"
+pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
+
+prompt = "yoda pokemon"
+prng_seed = jax.random.PRNGKey(0)
+num_inference_steps = 50
+
+num_samples = jax.device_count()
+prompt = num_samples * [prompt]
+prompt_ids = pipeline.prepare_inputs(prompt)
+
+# shard inputs and rng
+params = replicate(params)
+prng_seed = jax.random.split(prng_seed, jax.device_count())
+prompt_ids = shard(prompt_ids)
+
+images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
+images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
+images[0].save("yoda-pokemon.png")
+```
+
+
\ No newline at end of file
From b8b5daaee30ecbecd7b901020008ffead443665d Mon Sep 17 00:00:00 2001
From: Ambrosiussen
Date: Mon, 22 May 2023 16:49:35 +0200
Subject: [PATCH 007/199] DataLoader respecting EXIF data in Training Images
 (#3465)

* DataLoader will now bake in any transforms or image manipulations contained in the EXIF

Images may have rotations stored in EXIF.
Training using such images will cause those transforms to be ignored while training and thus produce unexpected results * Fixed the Dataloading EXIF issue in main DreamBooth training as well * Run make style (black & isort) --- examples/dreambooth/train_dreambooth.py | 23 ++++++++++++-------- examples/dreambooth/train_dreambooth_lora.py | 23 ++++++++++++-------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index efcfb39ab4c4..53d9c269f3e7 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -27,19 +27,13 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint +from torch.utils.data import Dataset + +import diffusers import transformers from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed -from huggingface_hub import create_repo, model_info, upload_folder -from packaging import version -from PIL import Image -from torch.utils.data import Dataset -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig - -import diffusers from diffusers import ( AutoencoderKL, DDPMScheduler, @@ -50,6 +44,13 @@ from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +from huggingface_hub import create_repo, model_info, upload_folder +from packaging import version +from PIL import Image +from PIL.ImageOps import exif_transpose +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig if is_wandb_available(): @@ -607,6 +608,8 @@ def __len__(self): def __getitem__(self, index): example = {} instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + instance_image = exif_transpose(instance_image) + if not instance_image.mode == "RGB": instance_image = instance_image.convert("RGB") example["instance_images"] = self.image_transforms(instance_image) @@ -622,6 +625,8 @@ def __getitem__(self, index): if self.class_data_root: class_image = Image.open(self.class_images_path[index % self.num_class_images]) + class_image = exif_transpose(class_image) + if not class_image.mode == "RGB": class_image = class_image.convert("RGB") example["class_images"] = self.image_transforms(class_image) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index bfbf3603e8d0..659b0d3e1d88 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -27,19 +27,13 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint +from torch.utils.data import Dataset + +import diffusers import transformers from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed -from huggingface_hub import create_repo, upload_folder -from packaging import version -from PIL import Image -from torch.utils.data import Dataset -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig - -import diffusers from diffusers import ( AutoencoderKL, DDPMScheduler, @@ -59,6 +53,13 @@ from diffusers.optimization import get_scheduler from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, 
check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +from huggingface_hub import create_repo, upload_folder +from packaging import version +from PIL import Image +from PIL.ImageOps import exif_transpose +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig # Will error if the minimal version of diffusers is not installed. Remove at your own risks. @@ -508,6 +509,8 @@ def __len__(self): def __getitem__(self, index): example = {} instance_image = Image.open(self.instance_images_path[index % self.num_instance_images]) + instance_image = exif_transpose(instance_image) + if not instance_image.mode == "RGB": instance_image = instance_image.convert("RGB") example["instance_images"] = self.image_transforms(instance_image) @@ -523,6 +526,8 @@ def __getitem__(self, index): if self.class_data_root: class_image = Image.open(self.class_images_path[index % self.num_class_images]) + class_image = exif_transpose(class_image) + if not class_image.mode == "RGB": class_image = class_image.convert("RGB") example["class_images"] = self.image_transforms(class_image) From 2b56e8ca6810d40b820592a9673b538705141482 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 22 May 2023 16:49:46 +0200 Subject: [PATCH 008/199] make style --- examples/dreambooth/train_dreambooth.py | 20 ++++++++++---------- examples/dreambooth/train_dreambooth_lora.py | 20 ++++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 53d9c269f3e7..ad43ee7aeee2 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -27,13 +27,20 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint -from torch.utils.data import Dataset - -import diffusers import transformers from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed +from huggingface_hub import create_repo, model_info, upload_folder +from packaging import version +from PIL import Image +from PIL.ImageOps import exif_transpose +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +import diffusers from diffusers import ( AutoencoderKL, DDPMScheduler, @@ -44,13 +51,6 @@ from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available -from huggingface_hub import create_repo, model_info, upload_folder -from packaging import version -from PIL import Image -from PIL.ImageOps import exif_transpose -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig if is_wandb_available(): diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 659b0d3e1d88..e640542e36da 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -27,13 +27,20 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint -from torch.utils.data import Dataset - -import diffusers import transformers from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed +from 
huggingface_hub import create_repo, upload_folder +from packaging import version +from PIL import Image +from PIL.ImageOps import exif_transpose +from torch.utils.data import Dataset +from torchvision import transforms +from tqdm.auto import tqdm +from transformers import AutoTokenizer, PretrainedConfig + +import diffusers from diffusers import ( AutoencoderKL, DDPMScheduler, @@ -53,13 +60,6 @@ from diffusers.optimization import get_scheduler from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available -from huggingface_hub import create_repo, upload_folder -from packaging import version -from PIL import Image -from PIL.ImageOps import exif_transpose -from torchvision import transforms -from tqdm.auto import tqdm -from transformers import AutoTokenizer, PretrainedConfig # Will error if the minimal version of diffusers is not installed. Remove at your own risks. From f3d570c273561b7f92a1ab55e6c846bb73c19a29 Mon Sep 17 00:00:00 2001 From: Hari Krishna <37787894+hari10599@users.noreply.github.com> Date: Mon, 22 May 2023 20:41:08 +0530 Subject: [PATCH 009/199] feat: allow disk offload for diffuser models (#3285) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * allow disk offload for diffuser models * sort import * add max_memory argument * Changed sample[0] to images[0] (#3304) A pipeline object stores the results in `images` not in `sample`. Current code blocks don't work. * Typo in tutorial (#3295) * Torch compile graph fix (#3286) * fix more * Fix more * fix more * Apply suggestions from code review * fix * make style * make fix-copies * fix * make sure torch compile * Clean * fix test * Postprocessing refactor img2img (#3268) * refactor img2img VaeImageProcessor.postprocess * remove copy from for init, run_safety_checker, decode_latents Co-authored-by: Sayak Paul --------- Co-authored-by: yiyixuxu Co-authored-by: Sayak Paul * [Torch 2.0 compile] Fix more torch compile breaks (#3313) * Fix more torch compile breaks * add tests * Fix all * fix controlnet * fix more * Add Horace He as co-author. > > Co-authored-by: Horace He * Add Horace He as co-author. Co-authored-by: Horace He --------- Co-authored-by: Horace He * fix: scale_lr and sync example readme and docs. (#3299) * fix: scale_lr and sync example readme and docs. * fix doc link. 
* Update stable_diffusion.mdx (#3310) fixed import statement * Fix missing variable assign in DeepFloyd-IF-II (#3315) Fix missing variable assign lol * Correct doc build for patch releases (#3316) Update build_documentation.yml * Add Stable Diffusion RePaint to community pipelines (#3320) * Add Stable Diffsuion RePaint to community pipelines - Adds Stable Diffsuion RePaint to community pipelines - Add Readme enty for pipeline * Fix: Remove wrong import - Remove wrong import - Minor change in comments * Fix: Code formatting of stable_diffusion_repaint * Fix: ruff errors in stable_diffusion_repaint * Fix multistep dpmsolver for cosine schedule (suitable for deepfloyd-if) (#3314) * fix multistep dpmsolver for cosine schedule (deepfloy-if) * fix a typo * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * update all dpmsolver (singlestep, multistep, dpm, dpm++) for cosine noise schedule * add test, fix style --------- Co-authored-by: Patrick von Platen * [docs] Improve LoRA docs (#3311) * update docs * add to toctree * apply feedback * Added input pretubation (#3292) * Added input pretubation * Fixed spelling * Update write_own_pipeline.mdx (#3323) * update controlling generation doc with latest goodies. (#3321) * [Quality] Make style (#3341) * Fix config dpm (#3343) * Add the SDE variant of DPM-Solver and DPM-Solver++ (#3344) * add SDE variant of DPM-Solver and DPM-Solver++ * add test * fix typo * fix typo * Add upsample_size to AttnUpBlock2D, AttnDownBlock2D (#3275) The argument `upsample_size` needs to be added to these modules to allow compatibility with other blocks that require this argument. * Rename --only_save_embeds to --save_as_full_pipeline (#3206) * Set --only_save_embeds to False by default Due to how the option is named, it makes more sense to behave like this. * Refactor only_save_embeds to save_as_full_pipeline * [AudioLDM] Generalise conversion script (#3328) Co-authored-by: Patrick von Platen * Fix TypeError when using prompt_embeds and negative_prompt (#2982) * test: Added test case * fix: fixed type checking issue on _encode_prompt * fix: fixed copies consistency * fix: one copy was not sufficient * Fix pipeline class on README (#3345) Update README.md * Inpainting: typo in docs (#3331) Typo in docs Co-authored-by: Patrick von Platen * Add `use_Karras_sigmas` to LMSDiscreteScheduler (#3351) * add karras sigma to lms discrete scheduler * add test for lms_scheduler karras * reformat test lms * Batched load of textual inversions (#3277) * Batched load of textual inversions - Only call resize_token_embeddings once per batch as it is the most expensive operation - Allow pretrained_model_name_or_path and token to be an optional list - Remove Dict from type annotation pretrained_model_name_or_path as it was not supported in this function - Add comment that single files (e.g. 
.pt/.safetensors) are supported - Add comment for token parameter - Convert token override log message from warning to info * Update src/diffusers/loaders.py Check for duplicate tokens Co-authored-by: Patrick von Platen * Update condition for None tokens --------- Co-authored-by: Patrick von Platen * make fix-copies * [docs] Fix docstring (#3334) fix docstring Co-authored-by: Patrick von Platen * if dreambooth lora (#3360) * update IF stage I pipelines add fixed variance schedulers and lora loading * added kv lora attn processor * allow loading into alternative lora attn processor * make vae optional * throw away predicted variance * allow loading into added kv lora layer * allow load T5 * allow pre compute text embeddings * set new variance type in schedulers * fix copies * refactor all prompt embedding code class prompts are now included in pre-encoding code max tokenizer length is now configurable embedding attention mask is now configurable * fix for when variance type is not defined on scheduler * do not pre compute validation prompt if not present * add example test for if lora dreambooth * add check for train text encoder and pre compute text embeddings * Postprocessing refactor all others (#3337) * add text2img * fix-copies * add * add all other pipelines * add * add * add * add * add * make style * style + fix copies --------- Co-authored-by: yiyixuxu * [docs] Improve safetensors docstring (#3368) * clarify safetensor docstring * fix typo * apply feedback * add: a warning message when using xformers in a PT 2.0 env. (#3365) * add: a warning message when using xformers in a PT 2.0 env. * Apply suggestions from code review Co-authored-by: Patrick von Platen --------- Co-authored-by: Patrick von Platen * StableDiffusionInpaintingPipeline - resize image w.r.t height and width (#3322) * StableDiffusionInpaintingPipeline now resizes input images and masks w.r.t to passed input height and width. Default is already set to 512. This addresses the common tensor mismatch error. Also moved type check into relevant funciton to keep main pipeline body tidy. * Fixed StableDiffusionInpaintingPrepareMaskAndMaskedImageTests Due to previous commit these tests were failing as height and width need to be passed into the prepare_mask_and_masked_image function, I have updated the code and added a height/width variable per unit test as it seemed more appropriate than the current hard coded solution * Added a resolution test to StableDiffusionInpaintPipelineSlowTests this unit test simply gets the input and resizes it into some that would fail (e.g. would throw a tensor mismatch error/not a mult of 8). Then passes it through the pipeline and verifies it produces output with correct dims w.r.t the passed height and width --------- Co-authored-by: Patrick von Platen * make style * [docs] Adapt a model (#3326) * first draft * apply feedback * conv_in.weight thrown away * [docs] Load safetensors (#3333) * safetensors * apply feedback * apply feedback * Apply suggestions from code review --------- Co-authored-by: Patrick von Platen * make style * [Docs] Fix stable_diffusion.mdx typo (#3398) Fix typo in last code block. Correct "prommpts" to "prompt" * Support ControlNet v1.1 shuffle properly (#3340) * add inferring_controlnet_cond_batch * Revert "add inferring_controlnet_cond_batch" This reverts commit abe8d6311d4b7f5b9409ca709c7fabf80d06c1a9. 
* set guess_mode to True whenever global_pool_conditions is True Co-authored-by: Patrick von Platen * nit * add integration test --------- Co-authored-by: Patrick von Platen * [Tests] better determinism (#3374) * enable deterministic pytorch and cuda operations. * disable manual seeding. * make style && make quality for unet_2d tests. * enable determinism for the unet2dconditional model. * add CUBLAS_WORKSPACE_CONFIG for better reproducibility. * relax tolerance (very weird issue, though). * revert to torch manual_seed() where needed. * relax more tolerance. * better placement of the cuda variable and relax more tolerance. * enable determinism for 3d condition model. * relax tolerance. * add: determinism to alt_diffusion. * relax tolerance for alt diffusion. * dance diffusion. * dance diffusion is flaky. * test_dict_tuple_outputs_equivalent edit. * fix two more tests. * fix more ddim tests. * fix: argument. * change to diff in place of difference. * fix: test_save_load call. * test_save_load_float16 call. * fix: expected_max_diff * fix: paint by example. * relax tolerance. * add determinism to 1d unet model. * torch 2.0 regressions seem to be brutal * determinism to vae. * add reason to skipping. * up tolerance. * determinism to vq. * determinism to cuda. * determinism to the generic test pipeline file. * refactor general pipelines testing a bit. * determinism to alt diffusion i2i * up tolerance for alt diff i2i and audio diff * up tolerance. * determinism to audioldm * increase tolerance for audioldm lms. * increase tolerance for paint by paint. * increase tolerance for repaint. * determinism to cycle diffusion and sd 1. * relax tol for cycle diffusion 🚲 * relax tol for sd 1.0 * relax tol for controlnet. * determinism to img var. * relax tol for img variation. * tolerance to i2i sd * make style * determinism to inpaint. * relax tolerance for inpaiting. * determinism for inpainting legacy * relax tolerance. * determinism to instruct pix2pix * determinism to model editing. * model editing tolerance. * panorama determinism * determinism to pix2pix zero. * determinism to sag. * sd 2. determinism * sd. tolerance * disallow tf32 matmul. * relax tolerance is all you need. * make style and determinism to sd 2 depth * relax tolerance for depth. * tolerance to diffedit. * tolerance to sd 2 inpaint. * up tolerance. * determinism in upscaling. * tolerance in upscaler. * more tolerance relaxation. * determinism to v pred. * up tol for v_pred * unclip determinism * determinism to unclip img2img * determinism to text to video. * determinism to last set of tests * up tol. 
* vq cumsum doesn't have a deterministic kernel * relax tol * relax tol * [docs] Add transformers to install (#3388) add transformers to install * [deepspeed] partial ZeRO-3 support (#3076) * [deepspeed] partial ZeRO-3 support * cleanup * improve deepspeed fixes * Improve * make style --------- Co-authored-by: Patrick von Platen * Add omegaconf for tests (#3400) Add omegaconfg * Fix various bugs with LoRA Dreambooth and Dreambooth script (#3353) * Improve checkpointing lora * fix more * Improve doc string * Update src/diffusers/loaders.py * make stytle * Apply suggestions from code review * Update src/diffusers/loaders.py * Apply suggestions from code review * Apply suggestions from code review * better * Fix all * Fix multi-GPU dreambooth * Apply suggestions from code review Co-authored-by: Pedro Cuenca * Fix all * make style * make style --------- Co-authored-by: Pedro Cuenca * Fix docker file (#3402) * up * up * fix: deepseepd_plugin retrieval from accelerate state (#3410) * [Docs] Add `sigmoid` beta_scheduler to docstrings of relevant Schedulers (#3399) * Add `sigmoid` beta scheduler to `DDPMScheduler` docstring * Add `sigmoid` beta scheduler to `RePaintScheduler` docstring --------- Co-authored-by: Patrick von Platen * Don't install accelerate and transformers from source (#3415) * Don't install transformers and accelerate from source (#3414) * Improve fast tests (#3416) Update pr_tests.yml * attention refactor: the trilogy (#3387) * Replace `AttentionBlock` with `Attention` * use _from_deprecated_attn_block check re: @patrickvonplaten * [Docs] update the PT 2.0 optimization doc with latest findings (#3370) * add: benchmarking stats for A100 and V100. * Apply suggestions from code review Co-authored-by: Patrick von Platen * address patrick's comments. * add: rtx 4090 stats * ⚔ benchmark reports done * Apply suggestions from code review Co-authored-by: Pedro Cuenca * 3313 pr link. * add: plots. Co-authored-by: Pedro * fix formattimg * update number percent. --------- Co-authored-by: Patrick von Platen Co-authored-by: Pedro Cuenca * Fix style rendering (#3433) * Fix style rendering. 
* Fix typo * unCLIP scheduler do not use note (#3417) * Replace deprecated command with environment file (#3409) Co-authored-by: Patrick von Platen * fix warning message pipeline loading (#3446) * add stable diffusion tensorrt img2img pipeline (#3419) * add stable diffusion tensorrt img2img pipeline Signed-off-by: Asfiya Baig * update docstrings Signed-off-by: Asfiya Baig --------- Signed-off-by: Asfiya Baig * Refactor controlnet and add img2img and inpaint (#3386) * refactor controlnet and add img2img and inpaint * First draft to get pipelines to work * make style * Fix more * Fix more * More tests * Fix more * Make inpainting work * make style and more tests * Apply suggestions from code review * up * make style * Fix imports * Fix more * Fix more * Improve examples * add test * Make sure import is correctly deprecated * Make sure everything works in compile mode * make sure authorship is correctly attributed * [Scheduler] DPM-Solver (++) Inverse Scheduler (#3335) * Add DPM-Solver Multistep Inverse Scheduler * Add draft tests for DiffEdit * Add inverse sde-dpmsolver steps to tune image diversity from inverted latents * Fix tests --------- Co-authored-by: Patrick von Platen * [Docs] Fix incomplete docstring for resnet.py (#3438) Fix incomplete docstrings for resnet.py * fix tiled vae blend extent range (#3384) fix tiled vae bleand extent range * Small update to "Next steps" section (#3443) Small update to "Next steps" section: - PyTorch 2 is recommended. - Updated improvement figures. * Allow arbitrary aspect ratio in IFSuperResolutionPipeline (#3298) * Update pipeline_if_superresolution.py Allow arbitrary aspect ratio in IFSuperResolutionPipeline by using the input image shape * IFSuperResolutionPipeline: allow the user to override the height and width through the arguments * update IFSuperResolutionPipeline width/height doc string to match StableDiffusionInpaintPipeline conventions --------- Co-authored-by: Patrick von Platen * Adding 'strength' parameter to StableDiffusionInpaintingPipeline (#3424) * Added explanation of 'strength' parameter * Added get_timesteps function which relies on new strength parameter * Added `strength` parameter which defaults to 1. * Swapped ordering so `noise_timestep` can be calculated before masking the image this is required when you aren't applying 100% noise to the masked region, e.g. strength < 1. * Added strength to check_inputs, throws error if out of range * Changed `prepare_latents` to initialise latents w.r.t strength inspired from the stable diffusion img2img pipeline, init latents are initialised by converting the init image into a VAE latent and adding noise (based upon the strength parameter passed in), e.g. random when strength = 1, or the init image at strength = 0. 
* WIP: Added a unit test for the new strength parameter in the StableDiffusionInpaintingPipeline still need to add correct regression values * Created a is_strength_max to initialise from pure random noise * Updated unit tests w.r.t new strength parameter + fixed new strength unit test * renamed parameter to avoid confusion with variable of same name * Updated regression values for new strength test - now passes * removed 'copied from' comment as this method is now different and divergent from the cpy * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py Co-authored-by: Patrick von Platen * Ensure backwards compatibility for prepare_mask_and_masked_image created a return_image boolean and initialised to false * Ensure backwards compatibility for prepare_latents * Fixed copy check typo * Fixes w.r.t backward compibility changes * make style * keep function argument ordering same for backwards compatibility in callees with copied from statements * make fix-copies --------- Co-authored-by: Patrick von Platen Co-authored-by: William Berman * [WIP] Bugfix - Pipeline.from_pretrained is broken when the pipeline is partially downloaded (#3448) Added bugfix using f strings. * Fix gradient checkpointing bugs in freezing part of models (requires_grad=False) (#3404) * gradient checkpointing bug fix * bug fix; changes for reviews * reformat * reformat --------- Co-authored-by: Patrick von Platen * Make dreambooth lora more robust to orig unet (#3462) * Make dreambooth lora more robust to orig unet * up * Reduce peak VRAM by releasing large attention tensors (as soon as they're unnecessary) (#3463) Release large tensors in attention (as soon as they're no longer required). Reduces peak VRAM by nearly 2 GB for 1024x1024 (even after slicing), and the savings scale up with image size. * Add min snr to text2img lora training script (#3459) add min snr to text2img lora training script * Add inpaint lora scale support (#3460) * add inpaint lora scale support * add inpaint lora scale test --------- Co-authored-by: yueyang.hyy * [From ckpt] Fix from_ckpt (#3466) * Correct from_ckpt * make style * Update full dreambooth script to work with IF (#3425) * Add IF dreambooth docs (#3470) * parameterize pass single args through tuple (#3477) * attend and excite tests disable determinism on the class level (#3478) * dreambooth docs torch.compile note (#3471) * dreambooth docs torch.compile note * Update examples/dreambooth/README.md Co-authored-by: Sayak Paul * Update examples/dreambooth/README.md Co-authored-by: Pedro Cuenca --------- Co-authored-by: Sayak Paul Co-authored-by: Pedro Cuenca * add: if entry in the dreambooth training docs. (#3472) * [docs] Textual inversion inference (#3473) * add textual inversion inference to docs * add to toctree --------- Co-authored-by: Sayak Paul * [docs] Distributed inference (#3376) * distributed inference * move to inference section * apply feedback * update with split_between_processes * apply feedback * [{Up,Down}sample1d] explicit view kernel size as number elements in flattened indices (#3479) explicit view kernel size as number elements in flattened indices * mps & onnx tests rework (#3449) * Remove ONNX tests from PR. They are already a part of push_tests.yml. * Remove mps tests from PRs. They are already performed on push. * Fix workflow name for fast push tests. * Extract mps tests to a workflow. For better control/filtering. 
* Remove --extra-index-url from mps tests * Increase tolerance of mps test This test passes in my Mac (Ventura 13.3) but fails in the CI hardware (Ventura 13.2). I ran the local tests following the same steps that exist in the CI workflow. * Temporarily run mps tests on pr So we can test. * Revert "Temporarily run mps tests on pr" Tests passed, go back to running on push. --------- Signed-off-by: Asfiya Baig Co-authored-by: Ilia Larchenko <41329713+IliaLarchenko@users.noreply.github.com> Co-authored-by: Patrick von Platen Co-authored-by: YiYi Xu Co-authored-by: yiyixuxu Co-authored-by: Sayak Paul Co-authored-by: Horace He Co-authored-by: Umar <55330742+mu94-csl@users.noreply.github.com> Co-authored-by: Mylo <36931363+gitmylo@users.noreply.github.com> Co-authored-by: Markus Pobitzer Co-authored-by: Cheng Lu Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Isamu Isozaki Co-authored-by: Cesar Aybar Co-authored-by: Will Rice Co-authored-by: Adrià Arrufat <1671644+arrufat@users.noreply.github.com> Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Co-authored-by: At-sushi Co-authored-by: Lucca Zenóbio Co-authored-by: Lysandre Debut Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Co-authored-by: pdoane Co-authored-by: Will Berman Co-authored-by: yiyixuxu Co-authored-by: Rupert Menneer <71332436+rupertmenneer@users.noreply.github.com> Co-authored-by: sudowind Co-authored-by: Takuma Mori Co-authored-by: Stas Bekman Co-authored-by: Pedro Cuenca Co-authored-by: Laureηt Co-authored-by: Jongwoo Han Co-authored-by: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Co-authored-by: clarencechen Co-authored-by: Laureηt Co-authored-by: superlabs-dev <133080491+superlabs-dev@users.noreply.github.com> Co-authored-by: Dev Aggarwal Co-authored-by: Vimarsh Chaturvedi Co-authored-by: 7eu7d7 <31194890+7eu7d7@users.noreply.github.com> Co-authored-by: cmdr2 Co-authored-by: wfng92 <43742196+wfng92@users.noreply.github.com> Co-authored-by: Glaceon-Hyy Co-authored-by: yueyang.hyy --- src/diffusers/models/modeling_utils.py | 25 ++++++++++++++++++++++- src/diffusers/pipelines/pipeline_utils.py | 21 +++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index e7cfcd71062f..c9fabf93253b 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -398,6 +398,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For more information about each option see [designing a device map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier to maximum memory. Will default to the maximum memory available for each + GPU and the available CPU RAM if unset. + offload_folder (`str` or `os.PathLike`, *optional*): + If the `device_map` contains any value `"disk"`, the folder where we will offload weights. + offload_state_dict (`bool`, *optional*): + If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU + RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to + `True` when there is some disk offload. 
low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): Speed up model loading by not initializing the weights and only loading the pre-trained weights. This also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the @@ -439,6 +448,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P torch_dtype = kwargs.pop("torch_dtype", None) subfolder = kwargs.pop("subfolder", None) device_map = kwargs.pop("device_map", None) + max_memory = kwargs.pop("max_memory", None) + offload_folder = kwargs.pop("offload_folder", None) + offload_state_dict = kwargs.pop("offload_state_dict", False) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) variant = kwargs.pop("variant", None) use_safetensors = kwargs.pop("use_safetensors", None) @@ -510,6 +522,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P revision=revision, subfolder=subfolder, device_map=device_map, + max_memory=max_memory, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, user_agent=user_agent, **kwargs, ) @@ -614,7 +629,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P else: # else let accelerate handle loading and dispatching. # Load weights and dispatch according to the device_map # by default the device_map is None and the weights are loaded on the CPU - accelerate.load_checkpoint_and_dispatch(model, model_file, device_map, dtype=torch_dtype) + accelerate.load_checkpoint_and_dispatch( + model, + model_file, + device_map, + max_memory=max_memory, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + ) loading_info = { "missing_keys": [], diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index fa71a181f521..aed1139a2a16 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -354,6 +354,9 @@ def load_sub_model( provider: Any, sess_options: Any, device_map: Optional[Union[Dict[str, torch.device], str]], + max_memory: Optional[Dict[Union[int, str], Union[int, str]]], + offload_folder: Optional[Union[str, os.PathLike]], + offload_state_dict: bool, model_variants: Dict[str, str], name: str, from_flax: bool, @@ -416,6 +419,9 @@ def load_sub_model( # This makes sure that the weights won't be initialized which significantly speeds up loading. if is_diffusers_model or is_transformers_model: loading_kwargs["device_map"] = device_map + loading_kwargs["max_memory"] = max_memory + loading_kwargs["offload_folder"] = offload_folder + loading_kwargs["offload_state_dict"] = offload_state_dict loading_kwargs["variant"] = model_variants.pop(name, None) if from_flax: loading_kwargs["from_flax"] = True @@ -808,6 +814,15 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For more information about each option see [designing a device map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier to maximum memory. Will default to the maximum memory available for each + GPU and the available CPU RAM if unset. + offload_folder (`str` or `os.PathLike`, *optional*): + If the `device_map` contains any value `"disk"`, the folder where we will offload weights. 
+ offload_state_dict (`bool`, *optional*): + If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU + RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to + `True` when there is some disk offload. low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): Speed up model loading by not initializing the weights and only loading the pre-trained weights. This also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the @@ -873,6 +888,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P provider = kwargs.pop("provider", None) sess_options = kwargs.pop("sess_options", None) device_map = kwargs.pop("device_map", None) + max_memory = kwargs.pop("max_memory", None) + offload_folder = kwargs.pop("offload_folder", None) + offload_state_dict = kwargs.pop("offload_state_dict", False) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) variant = kwargs.pop("variant", None) use_safetensors = kwargs.pop("use_safetensors", None if is_safetensors_available() else False) @@ -1046,6 +1064,9 @@ def load_module(name, value): provider=provider, sess_options=sess_options, device_map=device_map, + max_memory=max_memory, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, model_variants=model_variants, name=name, from_flax=from_flax, From c4359d63e32119081be877ec0affa61df5e6630b Mon Sep 17 00:00:00 2001 From: takuoko Date: Tue, 23 May 2023 00:21:54 +0900 Subject: [PATCH 010/199] [Community] reference only control (#3435) * add reference only control * add reference only control * add reference only control * fix lint * fix lint * reference adain * bugfix EulerAncestralDiscreteScheduler * fix style fidelity rule * fix default output size * del unused line * fix deterministic --- examples/community/README.md | 43 + .../community/stable_diffusion_reference.py | 774 ++++++++++++++++++ 2 files changed, 817 insertions(+) create mode 100644 examples/community/stable_diffusion_reference.py diff --git a/examples/community/README.md b/examples/community/README.md index 47b129ce9e7e..974f77fd1011 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1320,3 +1320,46 @@ prompt = "photorealistic new zealand hills" image = pipe(prompt, image=input_image, strength=0.75,).images[0] image.save('tensorrt_img2img_new_zealand_hills.png') ``` + +### Stable Diffusion Reference + +This pipeline uses the Reference only Control. Refer to the [sd-webui-controlnet discussion](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236). 
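+
+The example below constructs a `StableDiffusionReferencePipeline` directly. As a rough sketch (assuming this file is available as a community pipeline on the `main` branch), the same pipeline can also be fetched by name through the `custom_pipeline` argument of `DiffusionPipeline`, so you don't have to import the class yourself:
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+
+# Loads examples/community/stable_diffusion_reference.py by name; the returned pipeline
+# exposes the same call arguments (ref_image, reference_attn, reference_adain, ...).
+pipe = DiffusionPipeline.from_pretrained(
+    "runwayml/stable-diffusion-v1-5",
+    custom_pipeline="stable_diffusion_reference",
+    safety_checker=None,
+    torch_dtype=torch.float16,
+).to("cuda:0")
+```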
+ + +```py +import torch +from diffusers import UniPCMultistepScheduler +from diffusers.utils import load_image + +input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png") + +pipe = StableDiffusionReferencePipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + safety_checker=None, + torch_dtype=torch.float16 + ).to('cuda:0') + +pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + +result_img = pipe(ref_image=input_image, + prompt="1girl", + num_inference_steps=20, + reference_attn=True, + reference_adain=True).images[0] +``` + +Reference Image + +![reference_image](https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png) + +Output Image of `reference_attn=True` and `reference_adain=False` + +![output_image](https://github.com/huggingface/diffusers/assets/24734142/813b5c6a-6d89-46ba-b7a4-2624e240eea5) + +Output Image of `reference_attn=False` and `reference_adain=True` + +![output_image](https://github.com/huggingface/diffusers/assets/24734142/ffc90339-9ef0-4c4d-a544-135c3e5644da) + +Output Image of `reference_attn=True` and `reference_adain=True` + +![output_image](https://github.com/huggingface/diffusers/assets/24734142/3c5255d6-867d-4d35-b202-8dfd30cc6827) diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py new file mode 100644 index 000000000000..5e8051cdcdb2 --- /dev/null +++ b/examples/community/stable_diffusion_reference.py @@ -0,0 +1,774 @@ +# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch + +from diffusers import StableDiffusionPipeline +from diffusers.models.attention import BasicTransformerBlock +from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.utils import PIL_INTERPOLATION, logging, randn_tensor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import UniPCMultistepScheduler + >>> from diffusers.utils import load_image + + >>> input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png") + + >>> pipe = StableDiffusionReferencePipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + safety_checker=None, + torch_dtype=torch.float16 + ).to('cuda:0') + + >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe_controlnet.scheduler.config) + + >>> result_img = pipe(ref_image=input_image, + prompt="1girl", + num_inference_steps=20, + reference_attn=True, + reference_adain=True).images[0] + + >>> result_img.show() + ``` +""" + + +def torch_dfs(model: torch.nn.Module): + result = [model] + for child in model.children(): + result += torch_dfs(child) + return result + + +class StableDiffusionReferencePipeline(StableDiffusionPipeline): + def _default_height_width(self, height, width, image): + # NOTE: It is possible that a list of images have different + # dimensions for each image, so just checking the first image + # is not _exactly_ correct, but it is simple. 
+ while isinstance(image, list): + image = image[0] + + if height is None: + if isinstance(image, PIL.Image.Image): + height = image.height + elif isinstance(image, torch.Tensor): + height = image.shape[2] + + height = (height // 8) * 8 # round down to nearest multiple of 8 + + if width is None: + if isinstance(image, PIL.Image.Image): + width = image.width + elif isinstance(image, torch.Tensor): + width = image.shape[3] + + width = (width // 8) * 8 # round down to nearest multiple of 8 + + return height, width + + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + if not isinstance(image, torch.Tensor): + if isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + images = [] + + for image_ in image: + image_ = image_.convert("RGB") + image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) + image_ = np.array(image_) + image_ = image_[None, :] + images.append(image_) + + image = images + + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = (image - 0.5) / 0.5 + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance): + refimage = refimage.to(device=device, dtype=dtype) + + # encode the mask image into latents space so we can concatenate it to the latents + if isinstance(generator, list): + ref_image_latents = [ + self.vae.encode(refimage[i : i + 1]).latent_dist.sample(generator=generator[i]) + for i in range(batch_size) + ] + ref_image_latents = torch.cat(ref_image_latents, dim=0) + else: + ref_image_latents = self.vae.encode(refimage).latent_dist.sample(generator=generator) + ref_image_latents = self.vae.config.scaling_factor * ref_image_latents + + # duplicate mask and ref_image_latents for each generation per prompt, using mps friendly method + if ref_image_latents.shape[0] < batch_size: + if not batch_size % ref_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {ref_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." 
+ ) + ref_image_latents = ref_image_latents.repeat(batch_size // ref_image_latents.shape[0], 1, 1, 1) + + ref_image_latents = torch.cat([ref_image_latents] * 2) if do_classifier_free_guidance else ref_image_latents + + # aligning device to prevent device errors when concating it with the latent model input + ref_image_latents = ref_image_latents.to(device=device, dtype=dtype) + return ref_image_latents + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + ref_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + attention_auto_machine_weight: float = 1.0, + gn_auto_machine_weight: float = 1.0, + style_fidelity: float = 0.5, + reference_attn: bool = True, + reference_adain: bool = True, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + ref_image (`torch.FloatTensor`, `PIL.Image.Image`): + The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If + the type is specified as `Torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can + also be accepted as an image. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. 
Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + attention_auto_machine_weight (`float`): + Weight of using reference query for self attention's context. + If attention_auto_machine_weight=1.0, use reference query for all self attention's context. + gn_auto_machine_weight (`float`): + Weight of using reference adain. If gn_auto_machine_weight=2.0, use all reference adain plugins. + style_fidelity (`float`): + style fidelity of ref_uncond_xt. If style_fidelity=1.0, control more important, + elif style_fidelity=0.0, prompt more important, else balanced. + reference_attn (`bool`): + Whether to use reference query for self attention's context. + reference_adain (`bool`): + Whether to use reference adain. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + assert reference_attn or reference_adain, "`reference_attn` or `reference_adain` must be True." + + # 0. 
Default height and width to unet + height, width = self._default_height_width(height, width, ref_image) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Preprocess reference image + ref_image = self.prepare_image( + image=ref_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=prompt_embeds.dtype, + ) + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 7. Prepare reference latent variables + ref_image_latents = self.prepare_ref_latents( + ref_image, + batch_size * num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + do_classifier_free_guidance, + ) + + # 8. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 9. Modify self attention and group norm + MODE = "write" + uc_mask = ( + torch.Tensor([1] * batch_size * num_images_per_prompt + [0] * batch_size * num_images_per_prompt) + .type_as(ref_image_latents) + .bool() + ) + + def hacked_basic_transformer_inner_forward( + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + attention_mask=None, + cross_attention_kwargs=None, + class_labels=None, + ): + if self.use_ada_layer_norm: + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.use_ada_layer_norm_zero: + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + else: + norm_hidden_states = self.norm1(hidden_states) + + # 1. 
Self-Attention + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if self.only_cross_attention: + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + else: + if MODE == "write": + self.bank.append(norm_hidden_states.detach().clone()) + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + if MODE == "read": + if attention_auto_machine_weight > self.attn_weight: + attn_output_uc = self.attn1( + norm_hidden_states, + encoder_hidden_states=torch.cat([norm_hidden_states] + self.bank, dim=1), + # attention_mask=attention_mask, + **cross_attention_kwargs, + ) + attn_output_c = attn_output_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + attn_output_c[uc_mask] = self.attn1( + norm_hidden_states[uc_mask], + encoder_hidden_states=norm_hidden_states[uc_mask], + **cross_attention_kwargs, + ) + attn_output = style_fidelity * attn_output_c + (1.0 - style_fidelity) * attn_output_uc + self.bank.clear() + else: + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + if self.use_ada_layer_norm_zero: + attn_output = gate_msa.unsqueeze(1) * attn_output + hidden_states = attn_output + hidden_states + + if self.attn2 is not None: + norm_hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) + + # 2. Cross-Attention + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + hidden_states = attn_output + hidden_states + + # 3. 
Feed-forward + norm_hidden_states = self.norm3(hidden_states) + + if self.use_ada_layer_norm_zero: + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + ff_output = self.ff(norm_hidden_states) + + if self.use_ada_layer_norm_zero: + ff_output = gate_mlp.unsqueeze(1) * ff_output + + hidden_states = ff_output + hidden_states + + return hidden_states + + def hacked_mid_forward(self, *args, **kwargs): + eps = 1e-6 + x = self.original_forward(*args, **kwargs) + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank) / float(len(self.mean_bank)) + var_acc = sum(self.var_bank) / float(len(self.var_bank)) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + x_uc = (((x - mean) / std) * std_acc) + mean_acc + x_c = x_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + x_c[uc_mask] = x[uc_mask] + x = style_fidelity * x_c + (1.0 - style_fidelity) * x_uc + self.mean_bank = [] + self.var_bank = [] + return x + + def hack_CrossAttnDownBlock2D_forward( + self, + hidden_states, + temb=None, + encoder_hidden_states=None, + attention_mask=None, + cross_attention_kwargs=None, + ): + eps = 1e-6 + + # TODO(Patrick, William) - attention mask is not used + output_states = () + + for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)): + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i])) + var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i])) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc + hidden_states_c = hidden_states_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + hidden_states_c[uc_mask] = hidden_states[uc_mask] + hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc + + output_states = output_states + (hidden_states,) + + if MODE == "read": + self.mean_bank = [] + self.var_bank = [] + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + def hacked_DownBlock2D_forward(self, hidden_states, temb=None): + eps = 1e-6 + + output_states = () + + for i, resnet in enumerate(self.resnets): + hidden_states = resnet(hidden_states, temb) + + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), 
keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i])) + var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i])) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc + hidden_states_c = hidden_states_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + hidden_states_c[uc_mask] = hidden_states[uc_mask] + hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc + + output_states = output_states + (hidden_states,) + + if MODE == "read": + self.mean_bank = [] + self.var_bank = [] + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + def hacked_CrossAttnUpBlock2D_forward( + self, + hidden_states, + res_hidden_states_tuple, + temb=None, + encoder_hidden_states=None, + cross_attention_kwargs=None, + upsample_size=None, + attention_mask=None, + ): + eps = 1e-6 + # TODO(Patrick, William) - attention mask is not used + for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i])) + var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i])) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc + hidden_states_c = hidden_states_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + hidden_states_c[uc_mask] = hidden_states[uc_mask] + hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc + + if MODE == "read": + self.mean_bank = [] + self.var_bank = [] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): + eps = 1e-6 + for i, resnet in enumerate(self.resnets): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + hidden_states = 
resnet(hidden_states, temb) + + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i])) + var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i])) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc + hidden_states_c = hidden_states_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + hidden_states_c[uc_mask] = hidden_states[uc_mask] + hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc + + if MODE == "read": + self.mean_bank = [] + self.var_bank = [] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + if reference_attn: + attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, BasicTransformerBlock)] + attn_modules = sorted(attn_modules, key=lambda x: -x.norm1.normalized_shape[0]) + + for i, module in enumerate(attn_modules): + module._original_inner_forward = module.forward + module.forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock) + module.bank = [] + module.attn_weight = float(i) / float(len(attn_modules)) + + if reference_adain: + gn_modules = [self.unet.mid_block] + self.unet.mid_block.gn_weight = 0 + + down_blocks = self.unet.down_blocks + for w, module in enumerate(down_blocks): + module.gn_weight = 1.0 - float(w) / float(len(down_blocks)) + gn_modules.append(module) + + up_blocks = self.unet.up_blocks + for w, module in enumerate(up_blocks): + module.gn_weight = float(w) / float(len(up_blocks)) + gn_modules.append(module) + + for i, module in enumerate(gn_modules): + if getattr(module, "original_forward", None) is None: + module.original_forward = module.forward + if i == 0: + # mid_block + module.forward = hacked_mid_forward.__get__(module, torch.nn.Module) + elif isinstance(module, CrossAttnDownBlock2D): + module.forward = hack_CrossAttnDownBlock2D_forward.__get__(module, CrossAttnDownBlock2D) + elif isinstance(module, DownBlock2D): + module.forward = hacked_DownBlock2D_forward.__get__(module, DownBlock2D) + elif isinstance(module, CrossAttnUpBlock2D): + module.forward = hacked_CrossAttnUpBlock2D_forward.__get__(module, CrossAttnUpBlock2D) + elif isinstance(module, UpBlock2D): + module.forward = hacked_UpBlock2D_forward.__get__(module, UpBlock2D) + module.mean_bank = [] + module.var_bank = [] + module.gn_weight *= 2 + + # 10. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # ref only part + noise = randn_tensor( + ref_image_latents.shape, generator=generator, device=device, dtype=ref_image_latents.dtype + ) + ref_xt = self.scheduler.add_noise( + ref_image_latents, + noise, + t.reshape( + 1, + ), + ) + ref_xt = self.scheduler.scale_model_input(ref_xt, t) + + MODE = "write" + self.unet( + ref_xt, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + ) + + # predict the noise residual + MODE = "read" + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From 64bf5d33b7ef1b1deac256bed7bd99b55020c4e0 Mon Sep 17 00:00:00 2001 From: Birch-san Date: Mon, 22 May 2023 17:27:15 +0100 Subject: [PATCH 011/199] Support for cross-attention bias / mask (#2634) * Cross-attention masks prefer qualified symbol, fix accidental Optional prefer qualified symbol in AttentionProcessor prefer qualified symbol in embeddings.py qualified symbol in transformed_2d qualify FloatTensor in unet_2d_blocks move new transformer_2d params attention_mask, encoder_attention_mask to the end of the section which is assumed (e.g. by functions such as checkpoint()) to have a stable positional param interface. regard return_dict as a special-case which is assumed to be injected separately from positional params (e.g. by create_custom_forward()). move new encoder_attention_mask param to end of CrossAttn block interfaces and Unet2DCondition interface, to maintain positional param interface. 
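[Editor's note] The point above about keeping a stable positional parameter interface is why attention_mask and encoder_attention_mask are appended at the end of each block's signature rather than inserted in the middle: the gradient-checkpointing path wraps each block and forwards its arguments positionally. A minimal sketch of that constraint, assuming only that create_custom_forward behaves as in the hunks further down; the block, tensors, and names like TinyBlock are illustrative placeholders, not part of the patch.

import torch

# mirrors the helper used inside the checkpointed branches of the diff
def create_custom_forward(module, return_dict=None):
    def custom_forward(*inputs):
        if return_dict is not None:
            return module(*inputs, return_dict=return_dict)
        return module(*inputs)
    return custom_forward

# hypothetical block whose forward gained two trailing keyword parameters
class TinyBlock(torch.nn.Module):
    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, encoder_attention_mask=None):
        # stand-in computation so the example runs; a real block would attend here
        return hidden_states

block = TinyBlock()
hidden = torch.randn(2, 4, 8, requires_grad=True)
context = torch.randn(2, 3, 8)
# checkpoint() forwards arguments *positionally*: they land on parameters in
# declaration order, so new parameters must trail the old ones to keep existing
# positional call sites valid.
out = torch.utils.checkpoint.checkpoint(create_custom_forward(block), hidden, context, None, None)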
regenerate modeling_text_unet.py remove unused import unet_2d_condition encoder_attention_mask docs Co-authored-by: Pedro Cuenca versatile_diffusion/modeling_text_unet.py encoder_attention_mask docs Co-authored-by: Pedro Cuenca transformer_2d encoder_attention_mask docs Co-authored-by: Pedro Cuenca unet_2d_blocks.py: add parameter name comments Co-authored-by: Pedro Cuenca revert description. bool-to-bias treatment happens in unet_2d_condition only. comment parameter names fix copies, style * encoder_attention_mask for SimpleCrossAttnDownBlock2D, SimpleCrossAttnUpBlock2D * encoder_attention_mask for UNetMidBlock2DSimpleCrossAttn * support attention_mask, encoder_attention_mask in KCrossAttnDownBlock2D, KCrossAttnUpBlock2D, KAttentionBlock. fix binding of attention_mask, cross_attention_kwargs params in KCrossAttnDownBlock2D, KCrossAttnUpBlock2D checkpoint invocations. * fix mistake made during merge conflict resolution * regenerate versatile_diffusion * pass time embedding into checkpointed attention invocation * always assume encoder_attention_mask is a mask (i.e. not a bias). * style, fix-copies * add tests for cross-attention masks * add test for padding of attention mask * explain mask's query_tokens dim. fix explanation about broadcasting over channels; we actually broadcast over query tokens * support both masks and biases in Transformer2DModel#forward. document behaviour * fix-copies * delete attention_mask docs on the basis I never tested self-attention masking myself. not comfortable explaining it, since I don't actually understand how a self-attn mask can work in its current form: the key length will be different in every ResBlock (we don't downsample the mask when we downsample the image). * review feedback: the standard Unet blocks shouldn't pass temb to attn (only to resnet). remove from KCrossAttnDownBlock2D,KCrossAttnUpBlock2D#forward. * remove encoder_attention_mask param from SimpleCrossAttn{Up,Down}Block2D,UNetMidBlock2DSimpleCrossAttn, and mask-choice in those blocks' #forward, on the basis that they only do one type of attention, so the consumer can pass whichever type of attention_mask is appropriate. * put attention mask padding back to how it was (since the SD use-case it enabled wasn't important, and it breaks the original unclip use-case). disable the test which was added. * fix-copies * style * fix-copies * put encoder_attention_mask param back into Simple block forward interfaces, to ensure consistency of forward interface. * restore passing of emb to KAttentionBlock#forward, on the basis that removal caused test failures. restore also the passing of emb to checkpointed calls to KAttentionBlock#forward. * make simple unet2d blocks use encoder_attention_mask, but only when attention_mask is None. this should fix UnCLIP compatibility. 
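[Editor's note] Two bullets above (always treating encoder_attention_mask as a mask, and supporting both masks and biases in Transformer2DModel#forward) rely on the same conversion that appears later in this patch: a 0/1 "keep" mask is turned into an additive bias before it reaches the attention scores. A small sketch of that conversion, reusing the diff's -10000.0 convention and unsqueeze(1) broadcasting trick; the tensors and shapes are placeholders.

import torch

batch, query_tokens, key_tokens = 2, 7, 5
# (batch, key_tokens) mask over encoder_hidden_states: 1 = keep, 0 = discard
encoder_attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                                       [1, 1, 1, 1, 0]])
# convert to a bias (keep -> 0.0, discard -> -10000.0) and add a singleton
# query_tokens dimension so it broadcasts over the attention scores
bias = (1 - encoder_attention_mask.to(torch.float32)) * -10000.0
bias = bias.unsqueeze(1)                      # (batch, 1, key_tokens)
scores = torch.randn(batch, query_tokens, key_tokens)
probs = (scores + bias).softmax(dim=-1)       # discarded keys end up with ~0 weight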
* fix copies --- src/diffusers/models/attention.py | 18 +- src/diffusers/models/attention_processor.py | 33 +- src/diffusers/models/embeddings.py | 2 +- src/diffusers/models/transformer_2d.py | 47 ++- src/diffusers/models/unet_2d_blocks.py | 315 +++++++++++------- src/diffusers/models/unet_2d_condition.py | 26 +- .../versatile_diffusion/modeling_text_unet.py | 167 ++++++---- tests/models/test_models_unet_2d_condition.py | 71 ++++ 8 files changed, 473 insertions(+), 206 deletions(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 0b313b83d360..a7a9a472d9e9 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Any, Dict, Optional import torch import torch.nn.functional as F @@ -120,13 +120,13 @@ def __init__( def forward( self, - hidden_states, - attention_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - timestep=None, - cross_attention_kwargs=None, - class_labels=None, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, ): # Notice that normalization is always applied before the real computation in the following blocks. # 1. Self-Attention @@ -155,8 +155,6 @@ def forward( norm_hidden_states = ( self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) ) - # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly - # prepare attention mask here attn_output = self.attn2( norm_hidden_states, diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 86997632cac1..d0e2e7bd2dac 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -380,7 +380,13 @@ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, if attention_mask is None: return attention_mask - if attention_mask.shape[-1] != target_length: + current_length: int = attention_mask.shape[-1] + if current_length > target_length: + # we *could* trim the mask with: + # attention_mask = attention_mask[:,:target_length] + # but this is weird enough that it's more likely to be a mistake than a shortcut + raise ValueError(f"mask's length ({current_length}) exceeds the sequence length ({target_length}).") + elif current_length < target_length: if attention_mask.device.type == "mps": # HACK: MPS: Does not support padding by greater than dimension of input tensor. # Instead, we can manually construct the padding tensor. 
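[Editor's note] The hunk above changes prepare_attention_mask so that a mask longer than the key sequence is an explicit error, while a shorter one is padded with the "keep" value. A behavioral sketch of that logic (pad_or_reject is a hypothetical helper, not the library function); note that the merged code keeps the pad-by-target_length quirk flagged in the TODO just below, whereas this sketch pads by the missing amount purely to keep the shapes exact.

import torch
import torch.nn.functional as F

def pad_or_reject(attention_mask, target_length):
    current_length = attention_mask.shape[-1]
    if current_length > target_length:
        raise ValueError(f"mask's length ({current_length}) exceeds the sequence length ({target_length}).")
    if current_length < target_length:
        # bias-format mask: 0.0 means "keep", so padded key positions stay attended
        attention_mask = F.pad(attention_mask, (0, target_length - current_length), value=0.0)
    return attention_mask

mask = torch.zeros(2, 1, 5)        # bias-format mask over 5 known key tokens
padded = pad_or_reject(mask, 8)    # -> shape (2, 1, 8)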
@@ -388,6 +394,10 @@ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device) attention_mask = torch.cat([attention_mask, padding], dim=2) else: + # TODO: for pipelines such as stable-diffusion, padding cross-attn mask: + # we want to instead pad by (0, remaining_length), where remaining_length is: + # remaining_length: int = target_length - current_length + # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) if out_dim == 3: @@ -820,7 +830,13 @@ class XFormersAttnProcessor: def __init__(self, attention_op: Optional[Callable] = None): self.attention_op = attention_op - def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ): residual = hidden_states input_ndim = hidden_states.ndim @@ -829,11 +845,20 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a batch_size, channel, height, width = hidden_states.shape hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) - batch_size, sequence_length, _ = ( + batch_size, key_tokens, _ = ( hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape ) - attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size) + if attention_mask is not None: + # expand our mask's singleton query_tokens dimension: + # [batch*heads, 1, key_tokens] -> + # [batch*heads, query_tokens, key_tokens] + # so that it can be added as a bias onto the attention scores that xformers computes: + # [batch*heads, query_tokens, key_tokens] + # we do this explicitly because xformers doesn't broadcast the singleton dimension for us. + _, query_tokens, _ = hidden_states.shape + attention_mask = attention_mask.expand(-1, query_tokens, -1) if attn.group_norm is not None: hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index fa88bce305e6..fb803039b268 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -352,7 +352,7 @@ def token_drop(self, labels, force_drop_ids=None): labels = torch.where(drop_ids, self.num_classes, labels) return labels - def forward(self, labels, force_drop_ids=None): + def forward(self, labels: torch.LongTensor, force_drop_ids=None): use_dropout = self.dropout_prob > 0 if (self.training and use_dropout) or (force_drop_ids is not None): labels = self.token_drop(labels, force_drop_ids) diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index fde1014bd2e7..ec4cb371845f 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from dataclasses import dataclass -from typing import Optional +from typing import Any, Dict, Optional import torch import torch.nn.functional as F @@ -213,11 +213,13 @@ def __init__( def forward( self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - class_labels=None, - cross_attention_kwargs=None, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + timestep: Optional[torch.LongTensor] = None, + class_labels: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, return_dict: bool = True, ): """ @@ -228,11 +230,17 @@ def forward( encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. - timestep ( `torch.long`, *optional*): + timestep ( `torch.LongTensor`, *optional*): Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*): Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels conditioning. + encoder_attention_mask ( `torch.Tensor`, *optional* ). + Cross-attention mask, applied to encoder_hidden_states. Two formats supported: + Mask `(batch, sequence_length)` True = keep, False = discard. Bias `(batch, 1, sequence_length)` 0 + = keep, -10000 = discard. + If ndim == 2: will be interpreted as a mask, then converted into a bias consistent with the format + above. This bias will be added to the cross-attention scores. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. @@ -241,6 +249,29 @@ def forward( [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. """ + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. + # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. + # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias. + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None and attention_mask.ndim == 2: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2: + encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + # 1. 
Input if self.is_input_continuous: batch, _, height, width = hidden_states.shape @@ -264,7 +295,9 @@ def forward( for block in self.transformer_blocks: hidden_states = block( hidden_states, + attention_mask=attention_mask, encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, timestep=timestep, cross_attention_kwargs=cross_attention_kwargs, class_labels=class_labels, diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index 75d9eb3e03df..6f8e3d0f5500 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Optional +from typing import Any, Dict, Optional, Tuple import numpy as np import torch @@ -558,14 +558,22 @@ def __init__( self.resnets = nn.ModuleList(resnets) def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None - ): + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] hidden_states = resnet(hidden_states, temb) @@ -659,16 +667,34 @@ def __init__( self.resnets = nn.ModuleList(resnets) def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + + if attention_mask is None: + # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask. + mask = None if encoder_hidden_states is None else encoder_attention_mask + else: + # when attention_mask is defined: we don't even check for encoder_attention_mask. + # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks. + # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask. 
+ # then we can simplify this whole if/else block to: + # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask + mask = attention_mask + hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): # attn hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, + attention_mask=mask, **cross_attention_kwargs, ) @@ -850,9 +876,14 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): - # TODO(Patrick, William) - attention mask is not used output_states = () for resnet, attn in zip(self.resnets, self.attentions): @@ -867,33 +898,32 @@ def custom_forward(*inputs): return custom_forward - if is_torch_version(">=", "1.11.0"): - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb, use_reentrant=False - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - use_reentrant=False, - )[0] - else: - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - )[0] + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + None, # timestep + None, # class_labels + cross_attention_kwargs, + attention_mask, + encoder_attention_mask, + **ckpt_kwargs, + )[0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] @@ -1501,11 +1531,28 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): output_states = () cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if attention_mask is None: + # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask. 
+ mask = None if encoder_hidden_states is None else encoder_attention_mask + else: + # when attention_mask is defined: we don't even check for encoder_attention_mask. + # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks. + # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask. + # then we can simplify this whole if/else block to: + # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask + mask = attention_mask + for resnet, attn in zip(self.resnets, self.attentions): if self.training and self.gradient_checkpointing: @@ -1523,6 +1570,7 @@ def custom_forward(*inputs): create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, + mask, cross_attention_kwargs, )[0] else: @@ -1531,7 +1579,7 @@ def custom_forward(*inputs): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, + attention_mask=mask, **cross_attention_kwargs, ) @@ -1690,7 +1738,13 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): output_states = () @@ -1706,29 +1760,23 @@ def custom_forward(*inputs): return custom_forward - if is_torch_version(">=", "1.11.0"): - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb, use_reentrant=False - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - attention_mask, - cross_attention_kwargs, - use_reentrant=False, - ) - else: - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - attention_mask, - cross_attention_kwargs, - ) + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + temb, + attention_mask, + cross_attention_kwargs, + encoder_attention_mask, + **ckpt_kwargs, + ) else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( @@ -1737,6 +1785,7 @@ def custom_forward(*inputs): emb=temb, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, ) if self.downsamplers is None: @@ -1916,15 +1965,15 @@ def __init__( def forward( self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = 
None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): - # TODO(Patrick, William) - attention mask is not used for resnet, attn in zip(self.resnets, self.attentions): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] @@ -1942,33 +1991,32 @@ def custom_forward(*inputs): return custom_forward - if is_torch_version(">=", "1.11.0"): - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb, use_reentrant=False - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - use_reentrant=False, - )[0] - else: - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - )[0] + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + None, # timestep + None, # class_labels + cross_attention_kwargs, + attention_mask, + encoder_attention_mask, + **ckpt_kwargs, + )[0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] @@ -2594,15 +2642,28 @@ def __init__( def forward( self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - upsample_size=None, - attention_mask=None, - cross_attention_kwargs=None, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + + if attention_mask is None: + # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask. + mask = None if encoder_hidden_states is None else encoder_attention_mask + else: + # when attention_mask is defined: we don't even check for encoder_attention_mask. + # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks. + # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask. 
+ # then we can simplify this whole if/else block to: + # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask + mask = attention_mask + for resnet, attn in zip(self.resnets, self.attentions): # resnet # pop res hidden states @@ -2626,6 +2687,7 @@ def custom_forward(*inputs): create_custom_forward(attn, return_dict=False), hidden_states, encoder_hidden_states, + mask, cross_attention_kwargs, )[0] else: @@ -2634,7 +2696,7 @@ def custom_forward(*inputs): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, + attention_mask=mask, **cross_attention_kwargs, ) @@ -2811,13 +2873,14 @@ def __init__( def forward( self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): res_hidden_states_tuple = res_hidden_states_tuple[-1] if res_hidden_states_tuple is not None: @@ -2835,29 +2898,23 @@ def custom_forward(*inputs): return custom_forward - if is_torch_version(">=", "1.11.0"): - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb, use_reentrant=False - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - attention_mask, - cross_attention_kwargs, - use_reentrant=False, - )[0] - else: - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - attention_mask, - cross_attention_kwargs, - )[0] + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + temb, + attention_mask, + cross_attention_kwargs, + encoder_attention_mask, + **ckpt_kwargs, + )[0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( @@ -2866,6 +2923,7 @@ def custom_forward(*inputs): emb=temb, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, ) if self.upsamplers is not None: @@ -2944,11 +3002,14 @@ def _to_4d(self, hidden_states, height, weight): def forward( self, - hidden_states, - encoder_hidden_states=None, - emb=None, - attention_mask=None, - cross_attention_kwargs=None, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + # TODO: mark emb as non-optional (self.norm2 requires it). + # requires assessing impact of change to positional param interface. 
+ emb: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} @@ -2962,6 +3023,7 @@ def forward( attn_output = self.attn1( norm_hidden_states, encoder_hidden_states=None, + attention_mask=attention_mask, **cross_attention_kwargs, ) attn_output = self._to_4d(attn_output, height, weight) @@ -2976,6 +3038,7 @@ def forward( attn_output = self.attn2( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask if encoder_hidden_states is None else encoder_attention_mask, **cross_attention_kwargs, ) attn_output = self._to_4d(attn_output, height, weight) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 2a4c9fd72c1b..76a40ffa1ec5 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -618,6 +618,7 @@ def forward( cross_attention_kwargs: Optional[Dict[str, Any]] = None, down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, mid_block_additional_residual: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[UNet2DConditionOutput, Tuple]: r""" @@ -625,6 +626,10 @@ def forward( sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states + encoder_attention_mask (`torch.Tensor`): + (batch, sequence_length) cross-attention mask, applied to encoder_hidden_states. True = keep, False = + discard. Mask will be converted into a bias, which adds large negative values to attention scores + corresponding to "discard" tokens. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. cross_attention_kwargs (`dict`, *optional*): @@ -651,11 +656,27 @@ def forward( logger.info("Forward upsample size to force interpolation output size.") forward_upsample_size = True - # prepare attention_mask + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 attention_mask = attention_mask.unsqueeze(1) + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + # 0. 
center input if necessary if self.config.center_input_sample: sample = 2 * sample - 1.0 @@ -727,6 +748,7 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, ) else: sample, res_samples = downsample_block(hidden_states=sample, temb=emb) @@ -752,6 +774,7 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, ) if mid_block_additional_residual is not None: @@ -778,6 +801,7 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, upsample_size=upsample_size, attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, ) else: sample = upsample_block( diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 7aaa0e49e1da..29cde43337d2 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -721,6 +721,7 @@ def forward( cross_attention_kwargs: Optional[Dict[str, Any]] = None, down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, mid_block_additional_residual: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[UNet2DConditionOutput, Tuple]: r""" @@ -728,6 +729,10 @@ def forward( sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states + encoder_attention_mask (`torch.Tensor`): + (batch, sequence_length) cross-attention mask, applied to encoder_hidden_states. True = keep, False = + discard. Mask will be converted into a bias, which adds large negative values to attention scores + corresponding to "discard" tokens. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. cross_attention_kwargs (`dict`, *optional*): @@ -754,11 +759,27 @@ def forward( logger.info("Forward upsample size to force interpolation output size.") forward_upsample_size = True - # prepare attention_mask + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 attention_mask = attention_mask.unsqueeze(1) + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + # 0. 
center input if necessary if self.config.center_input_sample: sample = 2 * sample - 1.0 @@ -830,6 +851,7 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, ) else: sample, res_samples = downsample_block(hidden_states=sample, temb=emb) @@ -855,6 +877,7 @@ def forward( encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, ) if mid_block_additional_residual is not None: @@ -881,6 +904,7 @@ def forward( cross_attention_kwargs=cross_attention_kwargs, upsample_size=upsample_size, attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, ) else: sample = upsample_block( @@ -1188,9 +1212,14 @@ def __init__( self.gradient_checkpointing = False def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): - # TODO(Patrick, William) - attention mask is not used output_states = () for resnet, attn in zip(self.resnets, self.attentions): @@ -1205,33 +1234,32 @@ def custom_forward(*inputs): return custom_forward - if is_torch_version(">=", "1.11.0"): - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb, use_reentrant=False - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - use_reentrant=False, - )[0] - else: - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - )[0] + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + None, # timestep + None, # class_labels + cross_attention_kwargs, + attention_mask, + encoder_attention_mask, + **ckpt_kwargs, + )[0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] @@ -1414,15 +1442,15 @@ def __init__( def forward( self, - hidden_states, - res_hidden_states_tuple, - temb=None, - encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = 
None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): - # TODO(Patrick, William) - attention mask is not used for resnet, attn in zip(self.resnets, self.attentions): # pop res hidden states res_hidden_states = res_hidden_states_tuple[-1] @@ -1440,33 +1468,32 @@ def custom_forward(*inputs): return custom_forward - if is_torch_version(">=", "1.11.0"): - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb, use_reentrant=False - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - use_reentrant=False, - )[0] - else: - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(resnet), hidden_states, temb - ) - hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(attn, return_dict=False), - hidden_states, - encoder_hidden_states, - cross_attention_kwargs, - )[0] + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(attn, return_dict=False), + hidden_states, + encoder_hidden_states, + None, # timestep + None, # class_labels + cross_attention_kwargs, + attention_mask, + encoder_attention_mask, + **ckpt_kwargs, + )[0] else: hidden_states = resnet(hidden_states, temb) hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] @@ -1564,14 +1591,22 @@ def __init__( self.resnets = nn.ModuleList(resnets) def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None - ): + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] hidden_states = resnet(hidden_states, temb) @@ -1666,16 +1701,34 @@ def __init__( self.resnets = nn.ModuleList(resnets) def forward( - self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None, cross_attention_kwargs=None + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + + if attention_mask is None: + # if encoder_hidden_states is defined: we are doing cross-attn, so we should use cross-attn mask. 
+ mask = None if encoder_hidden_states is None else encoder_attention_mask + else: + # when attention_mask is defined: we don't even check for encoder_attention_mask. + # this is to maintain compatibility with UnCLIP, which uses 'attention_mask' param for cross-attn masks. + # TODO: UnCLIP should express cross-attn mask via encoder_attention_mask param instead of via attention_mask. + # then we can simplify this whole if/else block to: + # mask = attention_mask if encoder_hidden_states is None else encoder_attention_mask + mask = attention_mask + hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): # attn hidden_states = attn( hidden_states, encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, + attention_mask=mask, **cross_attention_kwargs, ) diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 43a487a32b43..8a3d9dd16fd5 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -20,6 +20,7 @@ import torch from parameterized import parameterized +from pytest import mark from diffusers import UNet2DConditionModel from diffusers.models.attention_processor import CustomDiffusionAttnProcessor, LoRAAttnProcessor @@ -418,6 +419,76 @@ def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_ma assert processor.is_run assert processor.number == 123 + @parameterized.expand( + [ + # fmt: off + [torch.bool], + [torch.long], + [torch.float], + # fmt: on + ] + ) + def test_model_xattn_mask(self, mask_dtype): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**{**init_dict, "attention_head_dim": (8, 16)}) + model.to(torch_device) + model.eval() + + cond = inputs_dict["encoder_hidden_states"] + with torch.no_grad(): + full_cond_out = model(**inputs_dict).sample + assert full_cond_out is not None + + keepall_mask = torch.ones(*cond.shape[:-1], device=cond.device, dtype=mask_dtype) + full_cond_keepallmask_out = model(**{**inputs_dict, "encoder_attention_mask": keepall_mask}).sample + assert full_cond_keepallmask_out.allclose( + full_cond_out + ), "a 'keep all' mask should give the same result as no mask" + + trunc_cond = cond[:, :-1, :] + trunc_cond_out = model(**{**inputs_dict, "encoder_hidden_states": trunc_cond}).sample + assert not trunc_cond_out.allclose( + full_cond_out + ), "discarding the last token from our cond should change the result" + + batch, tokens, _ = cond.shape + mask_last = (torch.arange(tokens) < tokens - 1).expand(batch, -1).to(cond.device, mask_dtype) + masked_cond_out = model(**{**inputs_dict, "encoder_attention_mask": mask_last}).sample + assert masked_cond_out.allclose( + trunc_cond_out + ), "masking the last token from our cond should be equivalent to truncating that token out of the condition" + + # see diffusers.models.attention_processor::Attention#prepare_attention_mask + # note: we may not need to fix mask padding to work for stable-diffusion cross-attn masks. + # since the use-case (somebody passes in a too-short cross-attn mask) is pretty esoteric. + # maybe it's fine that this only works for the unclip use-case. + @mark.skip( + reason="we currently pad mask by target_length tokens (what unclip needs), whereas stable-diffusion's cross-attn needs to instead pad by remaining_length." 
+ ) + def test_model_xattn_padding(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**{**init_dict, "attention_head_dim": (8, 16)}) + model.to(torch_device) + model.eval() + + cond = inputs_dict["encoder_hidden_states"] + with torch.no_grad(): + full_cond_out = model(**inputs_dict).sample + assert full_cond_out is not None + + batch, tokens, _ = cond.shape + keeplast_mask = (torch.arange(tokens) == tokens - 1).expand(batch, -1).to(cond.device, torch.bool) + keeplast_out = model(**{**inputs_dict, "encoder_attention_mask": keeplast_mask}).sample + assert not keeplast_out.allclose(full_cond_out), "a 'keep last token' mask should change the result" + + trunc_mask = torch.zeros(batch, tokens - 1, device=cond.device, dtype=torch.bool) + trunc_mask_out = model(**{**inputs_dict, "encoder_attention_mask": trunc_mask}).sample + assert trunc_mask_out.allclose( + keeplast_out + ), "a mask with fewer tokens than condition, will be padded with 'keep' tokens. a 'discard-all' mask missing the final token is thus equivalent to a 'keep last' mask." + def test_lora_processors(self): # enable deterministic behavior for gradient checkpointing init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() From 67cd46015455a08f7dcf60d70a0609a2a020d0b3 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Mon, 22 May 2023 15:19:56 -0700 Subject: [PATCH 012/199] do not scale the initial global step by gradient accumulation steps when loading from checkpoint (#3506) --- examples/controlnet/train_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index 9754c25b81e9..b6eb98db711b 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -979,7 +979,7 @@ def load_model_hook(models, input_dir): accelerator.load_state(os.path.join(args.output_dir, path)) global_step = int(path.split("-")[1]) - initial_global_step = global_step * args.gradient_accumulation_steps + initial_global_step = global_step first_epoch = global_step // num_update_steps_per_epoch else: initial_global_step = 0 From 2f997f30ab660472561f5e1b5232d4f116315b1b Mon Sep 17 00:00:00 2001 From: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Date: Tue, 23 May 2023 11:25:15 +0800 Subject: [PATCH 013/199] Fix bug in panorama pipeline when using dpmsolver scheduler (#3499) fix panorama pipeline with dpmsolver scheduler --- .../pipeline_stable_diffusion_panorama.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 22c22b56c7ee..223f8a236efa 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -612,6 +612,7 @@ def __call__( # 6. Define panorama grid and initialize views for synthesis. views = self.get_views(height, width) + blocks_model_outputs = [None] * len(views) count = torch.zeros_like(latents) value = torch.zeros_like(latents) @@ -632,7 +633,7 @@ def __call__( # denoised (latent) crops are then averaged to produce the final latent # for the current timestep via MultiDiffusion. Please see Sec. 
4.1 in the # MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113 - for h_start, h_end, w_start, w_end in views: + for j, (h_start, h_end, w_start, w_end) in enumerate(views): # get the latents corresponding to the current view coordinates latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] @@ -656,9 +657,21 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents_view_denoised = self.scheduler.step( - noise_pred, t, latents_for_view, **extra_step_kwargs - ).prev_sample + if hasattr(self.scheduler, "model_outputs"): + # rematch model_outputs in each block + if i >= 1: + self.scheduler.model_outputs = blocks_model_outputs[j] + latents_view_denoised = self.scheduler.step( + noise_pred, t, latents_for_view, **extra_step_kwargs + ).prev_sample + # collect model_outputs + blocks_model_outputs[j] = [ + output if output is not None else None for output in self.scheduler.model_outputs + ] + else: + latents_view_denoised = self.scheduler.step( + noise_pred, t, latents_for_view, **extra_step_kwargs + ).prev_sample value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised count[:, :, h_start:h_end, w_start:w_end] += 1 From edc65051937f4a71a68ac3da31b2f27a7e422114 Mon Sep 17 00:00:00 2001 From: yingjieh Date: Tue, 23 May 2023 16:55:14 +0800 Subject: [PATCH 014/199] [Community Pipelines]Accelerate inference of stable diffusion by IPEX on CPU (#3105) * add stable_diffusion_ipex community pipeline * Update readme.md * reformat * reformat * Update examples/community/README.md Co-authored-by: Pedro Cuenca * Update examples/community/README.md Co-authored-by: Pedro Cuenca * Update examples/community/README.md Co-authored-by: Pedro Cuenca * Update examples/community/README.md Co-authored-by: Pedro Cuenca * Apply suggestions from code review Co-authored-by: Pedro Cuenca * Update README.md * Update README.md * Apply suggestions from code review Co-authored-by: Pedro Cuenca * style --------- Co-authored-by: Pedro Cuenca --- examples/community/README.md | 100 +++ examples/community/stable_diffusion_ipex.py | 848 ++++++++++++++++++++ 2 files changed, 948 insertions(+) create mode 100644 examples/community/stable_diffusion_ipex.py diff --git a/examples/community/README.md b/examples/community/README.md index 974f77fd1011..7cb53cf6c564 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -35,6 +35,7 @@ If a community doesn't work as expected, please open an issue and ping the autho | EDICT Image Editing Pipeline | Diffusion pipeline for text-guided image editing | [EDICT Image Editing Pipeline](#edict-image-editing-pipeline) | - | [Joqsan Azocar](https://github.com/Joqsan) | | Stable Diffusion RePaint | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.0986) for inpainting. 
| [Stable Diffusion RePaint](#stable-diffusion-repaint ) | - | [Markus Pobitzer](https://github.com/Markus-Pobitzer) |
| TensorRT Stable Diffusion Image to Image Pipeline | Accelerates the Stable Diffusion Image2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Image to Image Pipeline](#tensorrt-image2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) |
+| Stable Diffusion IPEX Pipeline | Accelerate Stable Diffusion inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion on IPEX](#stable-diffusion-on-ipex) | - | [Yingjie Han](https://github.com/yingjie-han/) |

To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
```py
@@ -1363,3 +1364,102 @@ Output Image of `reference_attn=False` and `reference_adain=True`
Output Image of `reference_attn=True` and `reference_adain=True`

![output_image](https://github.com/huggingface/diffusers/assets/24734142/3c5255d6-867d-4d35-b202-8dfd30cc6827)
+
+### Stable Diffusion on IPEX
+
+This diffusion pipeline aims to accelerate the inference of Stable Diffusion on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).
+
+To use this pipeline, you need to:
+1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch)
+
+**Note:** For each PyTorch release, there is a corresponding release of IPEX. Here is the mapping relationship. It is recommended to install PyTorch/IPEX 2.0 to get the best performance.
+
+|PyTorch Version|IPEX Version|
+|--|--|
+|[v2.0.\*](https://github.com/pytorch/pytorch/tree/v2.0.1 "v2.0.1")|[v2.0.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v2.0.100+cpu)|
+|[v1.13.\*](https://github.com/pytorch/pytorch/tree/v1.13.0 "v1.13.0")|[v1.13.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v1.13.100+cpu)|
+
+You can simply use pip to install the latest version of IPEX.
+```bash
+python -m pip install intel_extension_for_pytorch
+```
+**Note:** To install a specific version, run the following command:
+```
+python -m pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu
+```
+
+2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.
+
+**Note:** The generated image height/width passed to `prepare_for_ipex()` should be the same as the height/width used at pipeline inference time.
+```python
+pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex")
+# For Float32
+pipe.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512)  # value of image height/width should be consistent with the pipeline inference
+# For BFloat16
+pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512)  # value of image height/width should be consistent with the pipeline inference
+```
+
+Then you can use the ipex pipeline in a similar way to the default stable diffusion pipeline.
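One practical gotcha (not part of the original example): the IPEX wheel has to match the installed PyTorch build (see the version table above), otherwise importing `intel_extension_for_pytorch` can fail. A quick, optional sanity check looks like this:

```python
# Optional sanity check: PyTorch and IPEX should share the same major.minor version,
# e.g. both 2.0.x or both 1.13.x (see the mapping table above).
import torch
import intel_extension_for_pytorch as ipex

print("torch:", torch.__version__)
print("ipex :", ipex.__version__)
```

The generation calls themselves are unchanged from the stock pipeline: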
+```python +# For Float32 +image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()' +# For BFloat16 +with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16): + image = pipe(prompt, num_inference_steps=20, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()' +``` + +The following code compares the performance of the original stable diffusion pipeline with the ipex-optimized pipeline. + +```python +import torch +import intel_extension_for_pytorch as ipex +from diffusers import StableDiffusionPipeline +import time + +prompt = "sailing ship in storm by Rembrandt" +model_id = "runwayml/stable-diffusion-v1-5" +# Helper function for time evaluation +def elapsed_time(pipeline, nb_pass=3, num_inference_steps=20): + # warmup + for _ in range(2): + images = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images + #time evaluation + start = time.time() + for _ in range(nb_pass): + pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512) + end = time.time() + return (end - start) / nb_pass + +############## bf16 inference performance ############### + +# 1. IPEX Pipeline initialization +pipe = DiffusionPipeline.from_pretrained(model_id, custom_pipeline="stable_diffusion_ipex") +pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) + +# 2. Original Pipeline initialization +pipe2 = StableDiffusionPipeline.from_pretrained(model_id) + +# 3. Compare performance between Original Pipeline and IPEX Pipeline +with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16): + latency = elapsed_time(pipe) + print("Latency of StableDiffusionIPEXPipeline--bf16", latency) + latency = elapsed_time(pipe2) + print("Latency of StableDiffusionPipeline--bf16",latency) + +############## fp32 inference performance ############### + +# 1. IPEX Pipeline initialization +pipe3 = DiffusionPipeline.from_pretrained(model_id, custom_pipeline="stable_diffusion_ipex") +pipe3.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512) + +# 2. Original Pipeline initialization +pipe4 = StableDiffusionPipeline.from_pretrained(model_id) + +# 3. Compare performance between Original Pipeline and IPEX Pipeline +latency = elapsed_time(pipe3) +print("Latency of StableDiffusionIPEXPipeline--fp32", latency) +latency = elapsed_time(pipe4) +print("Latency of StableDiffusionPipeline--fp32",latency) + +``` + diff --git a/examples/community/stable_diffusion_ipex.py b/examples/community/stable_diffusion_ipex.py new file mode 100644 index 000000000000..9abe16d56f10 --- /dev/null +++ b/examples/community/stable_diffusion_ipex.py @@ -0,0 +1,848 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import intel_extension_for_pytorch as ipex +import torch +from packaging import version +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from diffusers.configuration_utils import FrozenDict +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline + + >>> pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", custom_pipeline="stable_diffusion_ipex") + + >>> # For Float32 + >>> pipe.prepare_for_ipex(prompt, dtype=torch.float32, height=512, width=512) #value of image height/width should be consistent with the pipeline inference + >>> # For BFloat16 + >>> pipe.prepare_for_ipex(prompt, dtype=torch.bfloat16, height=512, width=512) #value of image height/width should be consistent with the pipeline inference + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> # For Float32 + >>> image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()' + >>> # For BFloat16 + >>> with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16): + >>> image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512).images[0] #value of image height/width should be consistent with 'prepare_for_ipex()' + ``` +""" + + +class StableDiffusionIPEXPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion on IPEX. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. 
+ Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True: + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`." + " `clip_sample` should be set to False in the configuration file. Please make sure to update the" + " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in" + " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very" + " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file" + ) + deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(scheduler.config) + new_config["clip_sample"] = False + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse("0.9.0.dev0") + is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + def get_input_example(self, prompt, height=None, width=None, guidance_scale=7.5, num_images_per_prompt=1): + prompt_embeds = None + negative_prompt_embeds = None + negative_prompt = None + callback_steps = 1 + generator = None + latents = None + + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + + device = "cpu" + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 5. 
Prepare latent variables + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + self.unet.in_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + dummy = torch.ones(1, dtype=torch.int32) + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, dummy) + + unet_input_example = (latent_model_input, dummy, prompt_embeds) + vae_decoder_input_example = latents + + return unet_input_example, vae_decoder_input_example + + def prepare_for_ipex(self, promt, dtype=torch.float32, height=None, width=None, guidance_scale=7.5): + self.unet = self.unet.to(memory_format=torch.channels_last) + self.vae.decoder = self.vae.decoder.to(memory_format=torch.channels_last) + self.text_encoder = self.text_encoder.to(memory_format=torch.channels_last) + if self.safety_checker is not None: + self.safety_checker = self.safety_checker.to(memory_format=torch.channels_last) + + unet_input_example, vae_decoder_input_example = self.get_input_example(promt, height, width, guidance_scale) + + # optimize with ipex + if dtype == torch.bfloat16: + self.unet = ipex.optimize( + self.unet.eval(), dtype=torch.bfloat16, inplace=True, sample_input=unet_input_example + ) + self.vae.decoder = ipex.optimize(self.vae.decoder.eval(), dtype=torch.bfloat16, inplace=True) + self.text_encoder = ipex.optimize(self.text_encoder.eval(), dtype=torch.bfloat16, inplace=True) + if self.safety_checker is not None: + self.safety_checker = ipex.optimize(self.safety_checker.eval(), dtype=torch.bfloat16, inplace=True) + elif dtype == torch.float32: + self.unet = ipex.optimize( + self.unet.eval(), + dtype=torch.float32, + inplace=True, + sample_input=unet_input_example, + level="O1", + weights_prepack=True, + auto_kernel_selection=False, + ) + self.vae.decoder = ipex.optimize( + self.vae.decoder.eval(), + dtype=torch.float32, + inplace=True, + level="O1", + weights_prepack=True, + auto_kernel_selection=False, + ) + self.text_encoder = ipex.optimize( + self.text_encoder.eval(), + dtype=torch.float32, + inplace=True, + level="O1", + weights_prepack=True, + auto_kernel_selection=False, + ) + if self.safety_checker is not None: + self.safety_checker = ipex.optimize( + self.safety_checker.eval(), + dtype=torch.float32, + inplace=True, + level="O1", + weights_prepack=True, + auto_kernel_selection=False, + ) + else: + raise ValueError(" The value of 'dtype' should be 'torch.bfloat16' or 'torch.float32' !") + + # trace unet model to get better performance on IPEX + with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad(): + unet_trace_model = torch.jit.trace(self.unet, unet_input_example, check_trace=False, strict=False) + unet_trace_model = torch.jit.freeze(unet_trace_model) + self.unet.forward = unet_trace_model.forward + + # trace vae.decoder model to get better performance on IPEX + with torch.cpu.amp.autocast(enabled=dtype == torch.bfloat16), torch.no_grad(): + ave_decoder_trace_model = torch.jit.trace( + self.vae.decoder, vae_decoder_input_example, check_trace=False, strict=False + ) + ave_decoder_trace_model = torch.jit.freeze(ave_decoder_trace_model) + self.vae.decoder.forward = ave_decoder_trace_model.forward + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. 
+ """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. + + When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in + several steps. This is useful to save a large amount of memory and to allow the processing of larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. 
After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. 
+ """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is not None: + safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + else: + has_nsfw_concept = None + return image, has_nsfw_concept + + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents).sample + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." 
+ ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttnProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). 
+ + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds)["sample"] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + image = latents + has_nsfw_concept = None + elif output_type == "pil": + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. 
Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # 10. Convert to PIL + image = self.numpy_to_pil(image) + else: + # 8. Post-processing + image = self.decode_latents(latents) + + # 9. Run safety checker + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) From b134f6a8b6b3d75af45a0b918b4006d2a06e0f91 Mon Sep 17 00:00:00 2001 From: takuoko Date: Tue, 23 May 2023 21:20:34 +0900 Subject: [PATCH 015/199] [Community] ControlNet Reference (#3508) add controlnet reference and bugfix Co-authored-by: Patrick von Platen --- examples/community/README.md | 51 +- .../stable_diffusion_controlnet_reference.py | 822 ++++++++++++++++++ .../community/stable_diffusion_reference.py | 51 +- 3 files changed, 900 insertions(+), 24 deletions(-) create mode 100644 examples/community/stable_diffusion_controlnet_reference.py diff --git a/examples/community/README.md b/examples/community/README.md index 7cb53cf6c564..0211287d4ebb 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1324,7 +1324,7 @@ image.save('tensorrt_img2img_new_zealand_hills.png') ### Stable Diffusion Reference -This pipeline uses the Reference only Control. Refer to the [sd-webui-controlnet discussion](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236). +This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280). ```py @@ -1365,6 +1365,54 @@ Output Image of `reference_attn=True` and `reference_adain=True` ![output_image](https://github.com/huggingface/diffusers/assets/24734142/3c5255d6-867d-4d35-b202-8dfd30cc6827) +### Stable Diffusion ControlNet Reference + +This pipeline uses the Reference Control with ControlNet. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280). 
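A few assumptions about the example below: it relies on `cv2`, which is provided by the `opencv-python` package, and it uses `ControlNetModel` and `StableDiffusionControlNetReferencePipeline` without importing them, so both need to be available in the session (`ControlNetModel` can be imported from `diffusers`; the reference pipeline class comes from this community file). If OpenCV is missing, install it first:

```
pip install opencv-python
```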
+ + +```py +import cv2 +import torch +import numpy as np +from PIL import Image +from diffusers import UniPCMultistepScheduler +from diffusers.utils import load_image + +input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png") + +# get canny image +image = cv2.Canny(np.array(input_image), 100, 200) +image = image[:, :, None] +image = np.concatenate([image, image, image], axis=2) +canny_image = Image.fromarray(image) + +controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) +pipe = StableDiffusionControlNetReferencePipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", + controlnet=controlnet, + safety_checker=None, + torch_dtype=torch.float16 + ).to('cuda:0') + +pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + +result_img = pipe(ref_image=input_image, + prompt="1girl", + image=canny_image, + num_inference_steps=20, + reference_attn=True, + reference_adain=True).images[0] +``` + +Reference Image + +![reference_image](https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png) + +Output Image + +![output_image](https://github.com/huggingface/diffusers/assets/24734142/7b9a5830-f173-4b92-b0cf-73d0e9c01d60) + + ### Stable Diffusion on IPEX This diffusion pipeline aims to accelarate the inference of Stable-Diffusion on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch). @@ -1462,4 +1510,3 @@ latency = elapsed_time(pipe4) print("Latency of StableDiffusionPipeline--fp32",latency) ``` - diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py new file mode 100644 index 000000000000..606fe09c68fc --- /dev/null +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -0,0 +1,822 @@ +# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 and https://github.com/Mikubill/sd-webui-controlnet/discussions/1280 +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import PIL.Image +import torch + +from diffusers import StableDiffusionControlNetPipeline +from diffusers.models import ControlNetModel +from diffusers.models.attention import BasicTransformerBlock +from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D +from diffusers.pipelines.controlnet.multicontrolnet import MultiControlNetModel +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.utils import is_compiled_module, logging, randn_tensor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import cv2 + >>> import torch + >>> import numpy as np + >>> from PIL import Image + >>> from diffusers import UniPCMultistepScheduler + >>> from diffusers.utils import load_image + + >>> input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png") + + >>> # get canny image + >>> image = cv2.Canny(np.array(input_image), 100, 200) + >>> image = image[:, :, None] + >>> image = np.concatenate([image, image, image], axis=2) + >>> canny_image = Image.fromarray(image) + + >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) + >>> pipe = 
StableDiffusionControlNetReferencePipeline.from_pretrained(
+              "runwayml/stable-diffusion-v1-5",
+              controlnet=controlnet,
+              safety_checker=None,
+              torch_dtype=torch.float16
+              ).to('cuda:0')
+
+        >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+
+        >>> result_img = pipe(ref_image=input_image,
+              prompt="1girl",
+              image=canny_image,
+              num_inference_steps=20,
+              reference_attn=True,
+              reference_adain=True).images[0]
+
+        >>> result_img.show()
+    ```
+"""
+
+
+def torch_dfs(model: torch.nn.Module):
+    result = [model]
+    for child in model.children():
+        result += torch_dfs(child)
+    return result
+
+
+class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeline):
+    def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
+        refimage = refimage.to(device=device, dtype=dtype)
+
+        # encode the reference image into the latent space
+        if isinstance(generator, list):
+            ref_image_latents = [
+                self.vae.encode(refimage[i : i + 1]).latent_dist.sample(generator=generator[i])
+                for i in range(batch_size)
+            ]
+            ref_image_latents = torch.cat(ref_image_latents, dim=0)
+        else:
+            ref_image_latents = self.vae.encode(refimage).latent_dist.sample(generator=generator)
+        ref_image_latents = self.vae.config.scaling_factor * ref_image_latents
+
+        # duplicate ref_image_latents for each generation per prompt, using mps friendly method
+        if ref_image_latents.shape[0] < batch_size:
+            if not batch_size % ref_image_latents.shape[0] == 0:
+                raise ValueError(
+                    "The passed images and the required batch size don't match. Images are supposed to be duplicated"
+                    f" to a total batch size of {batch_size}, but {ref_image_latents.shape[0]} images were passed."
+                    " Make sure the number of images that you pass is divisible by the total requested batch size."
+ ) + ref_image_latents = ref_image_latents.repeat(batch_size // ref_image_latents.shape[0], 1, 1, 1) + + ref_image_latents = torch.cat([ref_image_latents] * 2) if do_classifier_free_guidance else ref_image_latents + + # aligning device to prevent device errors when concating it with the latent model input + ref_image_latents = ref_image_latents.to(device=device, dtype=dtype) + return ref_image_latents + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + controlnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + attention_auto_machine_weight: float = 1.0, + gn_auto_machine_weight: float = 1.0, + style_fidelity: float = 0.5, + reference_attn: bool = True, + reference_adain: bool = True, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, + `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): + The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If + the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can + also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If + height and/or width are passed, `image` is resized according to them. If multiple ControlNets are + specified in init, images must be passed as a list such that each element of the list can be correctly + batched for input to a single controlnet. + ref_image (`torch.FloatTensor`, `PIL.Image.Image`): + The Reference Control input condition. Reference Control uses this input condition to generate guidance to Unet. If + the type is specified as `Torch.FloatTensor`, it is passed to Reference Control as is. `PIL.Image.Image` can + also be accepted as an image. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+            controlnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the controlnet are multiplied by `controlnet_conditioning_scale` before they are added
+                to the residual in the original unet. If multiple ControlNets are specified in init, you can set the
+                corresponding scale as a list.
+ guess_mode (`bool`, *optional*, defaults to `False`): + In this mode, the ControlNet encoder will try best to recognize the content of the input image even if + you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. + attention_auto_machine_weight (`float`): + Weight of using reference query for self attention's context. + If attention_auto_machine_weight=1.0, use reference query for all self attention's context. + gn_auto_machine_weight (`float`): + Weight of using reference adain. If gn_auto_machine_weight=2.0, use all reference adain plugins. + style_fidelity (`float`): + style fidelity of ref_uncond_xt. If style_fidelity=1.0, control more important, + elif style_fidelity=0.0, prompt more important, else balanced. + reference_attn (`bool`): + Whether to use reference query for self attention's context. + reference_adain (`bool`): + Whether to use reference adain. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height, width = self._default_height_width(height, width, image) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + image, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + controlnet_conditioning_scale, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): + controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) + + global_pool_conditions = ( + controlnet.config.global_pool_conditions + if isinstance(controlnet, ControlNetModel) + else controlnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. 
Prepare image + if isinstance(controlnet, ControlNetModel): + image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, + ) + elif isinstance(controlnet, MultiControlNetModel): + images = [] + + for image_ in image: + image_ = self.prepare_image( + image=image_, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=controlnet.dtype, + do_classifier_free_guidance=do_classifier_free_guidance, + guess_mode=guess_mode, + ) + + images.append(image_) + + image = images + else: + assert False + + # 5. Preprocess reference image + ref_image = self.prepare_image( + image=ref_image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=prompt_embeds.dtype, + ) + + # 6. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 7. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 8. Prepare reference latent variables + ref_image_latents = self.prepare_ref_latents( + ref_image, + batch_size * num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + do_classifier_free_guidance, + ) + + # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 9. Modify self attention and group norm + MODE = "write" + uc_mask = ( + torch.Tensor([1] * batch_size * num_images_per_prompt + [0] * batch_size * num_images_per_prompt) + .type_as(ref_image_latents) + .bool() + ) + + def hacked_basic_transformer_inner_forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + ): + if self.use_ada_layer_norm: + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.use_ada_layer_norm_zero: + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + else: + norm_hidden_states = self.norm1(hidden_states) + + # 1. 
Self-Attention + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + if self.only_cross_attention: + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + else: + if MODE == "write": + self.bank.append(norm_hidden_states.detach().clone()) + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + if MODE == "read": + if attention_auto_machine_weight > self.attn_weight: + attn_output_uc = self.attn1( + norm_hidden_states, + encoder_hidden_states=torch.cat([norm_hidden_states] + self.bank, dim=1), + # attention_mask=attention_mask, + **cross_attention_kwargs, + ) + attn_output_c = attn_output_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + attn_output_c[uc_mask] = self.attn1( + norm_hidden_states[uc_mask], + encoder_hidden_states=norm_hidden_states[uc_mask], + **cross_attention_kwargs, + ) + attn_output = style_fidelity * attn_output_c + (1.0 - style_fidelity) * attn_output_uc + self.bank.clear() + else: + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + if self.use_ada_layer_norm_zero: + attn_output = gate_msa.unsqueeze(1) * attn_output + hidden_states = attn_output + hidden_states + + if self.attn2 is not None: + norm_hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) + + # 2. Cross-Attention + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + hidden_states = attn_output + hidden_states + + # 3. 
Feed-forward + norm_hidden_states = self.norm3(hidden_states) + + if self.use_ada_layer_norm_zero: + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + ff_output = self.ff(norm_hidden_states) + + if self.use_ada_layer_norm_zero: + ff_output = gate_mlp.unsqueeze(1) * ff_output + + hidden_states = ff_output + hidden_states + + return hidden_states + + def hacked_mid_forward(self, *args, **kwargs): + eps = 1e-6 + x = self.original_forward(*args, **kwargs) + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(x, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank) / float(len(self.mean_bank)) + var_acc = sum(self.var_bank) / float(len(self.var_bank)) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + x_uc = (((x - mean) / std) * std_acc) + mean_acc + x_c = x_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + x_c[uc_mask] = x[uc_mask] + x = style_fidelity * x_c + (1.0 - style_fidelity) * x_uc + self.mean_bank = [] + self.var_bank = [] + return x + + def hack_CrossAttnDownBlock2D_forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ): + eps = 1e-6 + + # TODO(Patrick, William) - attention mask is not used + output_states = () + + for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)): + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i])) + var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i])) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc + hidden_states_c = hidden_states_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + hidden_states_c[uc_mask] = hidden_states[uc_mask] + hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc + + output_states = output_states + (hidden_states,) + + if MODE == "read": + self.mean_bank = [] + self.var_bank = [] + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + def hacked_DownBlock2D_forward(self, 
hidden_states, temb=None): + eps = 1e-6 + + output_states = () + + for i, resnet in enumerate(self.resnets): + hidden_states = resnet(hidden_states, temb) + + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i])) + var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i])) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc + hidden_states_c = hidden_states_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + hidden_states_c[uc_mask] = hidden_states[uc_mask] + hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc + + output_states = output_states + (hidden_states,) + + if MODE == "read": + self.mean_bank = [] + self.var_bank = [] + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states) + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + def hacked_CrossAttnUpBlock2D_forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + ): + eps = 1e-6 + # TODO(Patrick, William) - attention mask is not used + for i, (resnet, attn) in enumerate(zip(self.resnets, self.attentions)): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + hidden_states = resnet(hidden_states, temb) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i])) + var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i])) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc + hidden_states_c = hidden_states_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + hidden_states_c[uc_mask] = hidden_states[uc_mask] + hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc + + if MODE == 
"read": + self.mean_bank = [] + self.var_bank = [] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): + eps = 1e-6 + for i, resnet in enumerate(self.resnets): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + hidden_states = resnet(hidden_states, temb) + + if MODE == "write": + if gn_auto_machine_weight >= self.gn_weight: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + self.mean_bank.append(mean) + self.var_bank.append(var) + if MODE == "read": + if len(self.mean_bank) > 0 and len(self.var_bank) > 0: + var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) + std = torch.maximum(var, torch.zeros_like(var) + eps) ** 0.5 + mean_acc = sum(self.mean_bank[i]) / float(len(self.mean_bank[i])) + var_acc = sum(self.var_bank[i]) / float(len(self.var_bank[i])) + std_acc = torch.maximum(var_acc, torch.zeros_like(var_acc) + eps) ** 0.5 + hidden_states_uc = (((hidden_states - mean) / std) * std_acc) + mean_acc + hidden_states_c = hidden_states_uc.clone() + if do_classifier_free_guidance and style_fidelity > 0: + hidden_states_c[uc_mask] = hidden_states[uc_mask] + hidden_states = style_fidelity * hidden_states_c + (1.0 - style_fidelity) * hidden_states_uc + + if MODE == "read": + self.mean_bank = [] + self.var_bank = [] + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size) + + return hidden_states + + if reference_attn: + attn_modules = [module for module in torch_dfs(self.unet) if isinstance(module, BasicTransformerBlock)] + attn_modules = sorted(attn_modules, key=lambda x: -x.norm1.normalized_shape[0]) + + for i, module in enumerate(attn_modules): + module._original_inner_forward = module.forward + module.forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock) + module.bank = [] + module.attn_weight = float(i) / float(len(attn_modules)) + + if reference_adain: + gn_modules = [self.unet.mid_block] + self.unet.mid_block.gn_weight = 0 + + down_blocks = self.unet.down_blocks + for w, module in enumerate(down_blocks): + module.gn_weight = 1.0 - float(w) / float(len(down_blocks)) + gn_modules.append(module) + + up_blocks = self.unet.up_blocks + for w, module in enumerate(up_blocks): + module.gn_weight = float(w) / float(len(up_blocks)) + gn_modules.append(module) + + for i, module in enumerate(gn_modules): + if getattr(module, "original_forward", None) is None: + module.original_forward = module.forward + if i == 0: + # mid_block + module.forward = hacked_mid_forward.__get__(module, torch.nn.Module) + elif isinstance(module, CrossAttnDownBlock2D): + module.forward = hack_CrossAttnDownBlock2D_forward.__get__(module, CrossAttnDownBlock2D) + elif isinstance(module, DownBlock2D): + module.forward = hacked_DownBlock2D_forward.__get__(module, DownBlock2D) + elif isinstance(module, CrossAttnUpBlock2D): + module.forward = hacked_CrossAttnUpBlock2D_forward.__get__(module, CrossAttnUpBlock2D) + elif isinstance(module, UpBlock2D): + module.forward = hacked_UpBlock2D_forward.__get__(module, UpBlock2D) + module.mean_bank = [] + module.var_bank = [] + module.gn_weight *= 2 + + # 11. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # controlnet(s) inference + if guess_mode and do_classifier_free_guidance: + # Infer ControlNet only for the conditional batch. + controlnet_latent_model_input = latents + controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + controlnet_latent_model_input = latent_model_input + controlnet_prompt_embeds = prompt_embeds + + down_block_res_samples, mid_block_res_sample = self.controlnet( + controlnet_latent_model_input, + t, + encoder_hidden_states=controlnet_prompt_embeds, + controlnet_cond=image, + conditioning_scale=controlnet_conditioning_scale, + guess_mode=guess_mode, + return_dict=False, + ) + + if guess_mode and do_classifier_free_guidance: + # Infered ControlNet only for the conditional batch. + # To apply the output of ControlNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. + down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + + # ref only part + noise = randn_tensor( + ref_image_latents.shape, generator=generator, device=device, dtype=ref_image_latents.dtype + ) + ref_xt = self.scheduler.add_noise( + ref_image_latents, + noise, + t.reshape( + 1, + ), + ) + ref_xt = self.scheduler.scale_model_input(ref_xt, t) + + MODE = "write" + self.unet( + ref_xt, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + ) + + # predict the noise residual + MODE = "read" + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + down_block_additional_residuals=down_block_res_samples, + mid_block_additional_residual=mid_block_res_sample, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # If we do sequential model offloading, let's offload unet and controlnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.controlnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw 
in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py index 5e8051cdcdb2..22e0b40f60a3 100644 --- a/examples/community/stable_diffusion_reference.py +++ b/examples/community/stable_diffusion_reference.py @@ -1,5 +1,5 @@ -# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 -from typing import Any, Callable, Dict, List, Optional, Union +# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 and https://github.com/Mikubill/sd-webui-controlnet/discussions/1280 +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import PIL.Image @@ -162,7 +162,7 @@ def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do def __call__( self, prompt: Union[str, List[str]] = None, - ref_image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -356,12 +356,13 @@ def __call__( def hacked_basic_transformer_inner_forward( self, - hidden_states, - encoder_hidden_states=None, - timestep=None, - attention_mask=None, - cross_attention_kwargs=None, - class_labels=None, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, ): if self.use_ada_layer_norm: norm_hidden_states = self.norm1(hidden_states, timestep) @@ -427,7 +428,7 @@ def hacked_basic_transformer_inner_forward( attn_output = self.attn2( norm_hidden_states, encoder_hidden_states=encoder_hidden_states, - attention_mask=attention_mask, + attention_mask=encoder_attention_mask, **cross_attention_kwargs, ) hidden_states = attn_output + hidden_states @@ -473,11 +474,12 @@ def hacked_mid_forward(self, *args, **kwargs): def hack_CrossAttnDownBlock2D_forward( self, - hidden_states, - temb=None, - encoder_hidden_states=None, - attention_mask=None, - cross_attention_kwargs=None, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): eps = 1e-6 @@ -490,6 +492,8 @@ def hack_CrossAttnDownBlock2D_forward( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] if MODE == "write": @@ -566,13 +570,14 @@ def hacked_DownBlock2D_forward(self, hidden_states, temb=None): def hacked_CrossAttnUpBlock2D_forward( self, - hidden_states, - res_hidden_states_tuple, - temb=None, - 
encoder_hidden_states=None, - cross_attention_kwargs=None, - upsample_size=None, - attention_mask=None, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, ): eps = 1e-6 # TODO(Patrick, William) - attention mask is not used @@ -586,6 +591,8 @@ def hacked_CrossAttnUpBlock2D_forward( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, return_dict=False, )[0] From d4197bf4d72f04d4927ff1e7be2f8ee46efebe47 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 23 May 2023 14:20:55 +0200 Subject: [PATCH 016/199] Allow custom pipeline loading (#3504) --- src/diffusers/pipelines/pipeline_utils.py | 10 ++++++--- tests/pipelines/test_pipelines.py | 27 +++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index aed1139a2a16..2f56f650ea33 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -491,15 +491,19 @@ def register_modules(self, **kwargs): library = module.__module__.split(".")[0] # check if the module is a pipeline module - pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None + module_path_items = module.__module__.split(".") + pipeline_dir = module_path_items[-2] if len(module_path_items) > 2 else None + path = module.__module__.split(".") is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir) # if library is not in LOADABLE_CLASSES, then it is a custom module. # Or if it's a pipeline module, then the module is inside the pipeline # folder so we set the library to module name. 
- if library not in LOADABLE_CLASSES or is_pipeline_module: + if is_pipeline_module: library = pipeline_dir + elif library not in LOADABLE_CLASSES: + library = module.__module__ # retrieve class_name class_name = module.__class__.__name__ @@ -1039,7 +1043,7 @@ def load_module(name, value): # 6.2 Define all importable classes is_pipeline_module = hasattr(pipelines, library_name) - importable_classes = ALL_IMPORTABLE_CLASSES if is_pipeline_module else LOADABLE_CLASSES[library_name] + importable_classes = ALL_IMPORTABLE_CLASSES loaded_sub_model = None # 6.3 Use passed sub model or load class_name from library_name diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index a9abb0b4fb62..6ec9ff0346a6 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -35,6 +35,7 @@ from diffusers import ( AutoencoderKL, + ConfigMixin, DDIMPipeline, DDIMScheduler, DDPMPipeline, @@ -44,6 +45,7 @@ EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, LMSDiscreteScheduler, + ModelMixin, PNDMScheduler, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipelineLegacy, @@ -77,6 +79,17 @@ enable_full_determinism() +class CustomEncoder(ModelMixin, ConfigMixin): + def __init__(self): + super().__init__() + + +class CustomPipeline(DiffusionPipeline): + def __init__(self, encoder: CustomEncoder, scheduler: DDIMScheduler): + super().__init__() + self.register_modules(encoder=encoder, scheduler=scheduler) + + class DownloadTests(unittest.TestCase): def test_one_request_upon_cached(self): # TODO: For some reason this test fails on MPS where no HEAD call is made. @@ -695,6 +708,20 @@ def test_local_custom_pipeline_file(self): # compare to https://github.com/huggingface/diffusers/blob/main/tests/fixtures/custom_pipeline/pipeline.py#L102 assert output_str == "This is a local test" + def test_custom_model_and_pipeline(self): + pipe = CustomPipeline( + encoder=CustomEncoder(), + scheduler=DDIMScheduler(), + ) + + with tempfile.TemporaryDirectory() as tmpdirname: + pipe.save_pretrained(tmpdirname) + + pipe_new = CustomPipeline.from_pretrained(tmpdirname) + pipe_new.save_pretrained(tmpdirname) + + assert dict(pipe_new.config) == dict(pipe.config) + @slow @require_torch_gpu def test_download_from_git(self): From 9e2734a710fcc73a9790ec4b15a3cb4fb229cf55 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 23 May 2023 15:22:43 +0200 Subject: [PATCH 017/199] Make sure Diffusers works even if Hub is down (#3447) * Make sure Diffusers works even if Hub is down * Make sure hub down is well tested --- src/diffusers/pipelines/pipeline_utils.py | 17 ++++++++++++----- tests/pipelines/test_pipelines.py | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 2f56f650ea33..d5fa22548a15 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -30,6 +30,7 @@ import torch from huggingface_hub import hf_hub_download, model_info, snapshot_download from packaging import version +from requests.exceptions import HTTPError from tqdm.auto import tqdm import diffusers @@ -1228,6 +1229,17 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: allow_patterns = None ignore_patterns = None + if not local_files_only: + try: + info = model_info( + pretrained_model_name, + use_auth_token=use_auth_token, + revision=revision, + ) + except HTTPError as e: + logger.warn(f"Couldn't connect to the Hub: 
{e}.\nWill try to load from local cache.") + local_files_only = True + if not local_files_only: config_file = hf_hub_download( pretrained_model_name, @@ -1239,11 +1251,6 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: resume_download=resume_download, use_auth_token=use_auth_token, ) - info = model_info( - pretrained_model_name, - use_auth_token=use_auth_token, - revision=revision, - ) config_dict = cls._dict_from_json_file(config_file) diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 6ec9ff0346a6..d05785a31315 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -353,7 +353,7 @@ def test_cached_files_are_used_when_no_internet(self): with mock.patch("requests.request", return_value=response_mock): # Download this model to make sure it's in the cache. pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, local_files_only=True + "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None ) comps = {k: v for k, v in pipe.components.items() if hasattr(v, "parameters")} From 84ce50f08e8a99e91e838fe96d1993789b03511e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 23 May 2023 17:53:34 +0200 Subject: [PATCH 018/199] Improve README (#3524) Update README.md --- README.md | 130 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 82 insertions(+), 48 deletions(-) diff --git a/README.md b/README.md index 76d7df79c813..17c883519b39 100644 --- a/README.md +++ b/README.md @@ -99,55 +99,11 @@ Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to l | **Documentation** | **What can I learn?** | |---------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Tutorial | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model. | -| Loading | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. | -| Pipelines for inference | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. | -| Optimization | Guides for how to optimize your diffusion model to run faster and consume less memory. | +| [Tutorial](https://huggingface.co/docs/diffusers/tutorials/tutorial_overview) | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model. | +| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading_overview) | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers. | +| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview) | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library. 
| +| [Optimization](https://huggingface.co/docs/diffusers/optimization/opt_overview) | Guides for how to optimize your diffusion model to run faster and consume less memory. | | [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques. | - -## Supported pipelines - -| Pipeline | Paper | Tasks | -|---|---|:---:| -| [alt_diffusion](./api/pipelines/alt_diffusion) | [**AltDiffusion**](https://arxiv.org/abs/2211.06679) | Image-to-Image Text-Guided Generation | -| [audio_diffusion](./api/pipelines/audio_diffusion) | [**Audio Diffusion**](https://github.com/teticio/audio-diffusion.git) | Unconditional Audio Generation | -| [controlnet](./api/pipelines/stable_diffusion/controlnet) | [**ControlNet with Stable Diffusion**](https://arxiv.org/abs/2302.05543) | Image-to-Image Text-Guided Generation | -| [cycle_diffusion](./api/pipelines/cycle_diffusion) | [**Cycle Diffusion**](https://arxiv.org/abs/2210.05559) | Image-to-Image Text-Guided Generation | -| [dance_diffusion](./api/pipelines/dance_diffusion) | [**Dance Diffusion**](https://github.com/williamberman/diffusers.git) | Unconditional Audio Generation | -| [ddpm](./api/pipelines/ddpm) | [**Denoising Diffusion Probabilistic Models**](https://arxiv.org/abs/2006.11239) | Unconditional Image Generation | -| [ddim](./api/pipelines/ddim) | [**Denoising Diffusion Implicit Models**](https://arxiv.org/abs/2010.02502) | Unconditional Image Generation | -| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | -| [latent_diffusion](./api/pipelines/latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | -| [latent_diffusion_uncond](./api/pipelines/latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | -| [paint_by_example](./api/pipelines/paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting | -| [pndm](./api/pipelines/pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | -| [score_sde_ve](./api/pipelines/score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | -| [score_sde_vp](./api/pipelines/score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | -| [semantic_stable_diffusion](./api/pipelines/semantic_stable_diffusion) | [**Semantic Guidance**](https://arxiv.org/abs/2301.12247) | Text-Guided Generation | -| [stable_diffusion_text2img](./api/pipelines/stable_diffusion/text2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-to-Image Generation | -| [stable_diffusion_img2img](./api/pipelines/stable_diffusion/img2img) | [**Stable Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Image-to-Image Text-Guided Generation | -| [stable_diffusion_inpaint](./api/pipelines/stable_diffusion/inpaint) | [**Stable 
Diffusion**](https://stability.ai/blog/stable-diffusion-public-release) | Text-Guided Image Inpainting | -| [stable_diffusion_panorama](./api/pipelines/stable_diffusion/panorama) | [**MultiDiffusion**](https://multidiffusion.github.io/) | Text-to-Panorama Generation | -| [stable_diffusion_pix2pix](./api/pipelines/stable_diffusion/pix2pix) | [**InstructPix2Pix**](https://github.com/timothybrooks/instruct-pix2pix) | Text-Guided Image Editing| -| [stable_diffusion_pix2pix_zero](./api/pipelines/stable_diffusion/pix2pix_zero) | [**Zero-shot Image-to-Image Translation**](https://pix2pixzero.github.io/) | Text-Guided Image Editing | -| [stable_diffusion_attend_and_excite](./api/pipelines/stable_diffusion/attend_and_excite) | [**Attend and Excite for Stable Diffusion**](https://attendandexcite.github.io/Attend-and-Excite/) | Text-to-Image Generation | -| [stable_diffusion_self_attention_guidance](./api/pipelines/stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://ku-cvlab.github.io/Self-Attention-Guidance) | Text-to-Image Generation | -| [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation | -| [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image | -| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | -| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | -| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Depth-Conditional Stable Diffusion**](https://github.com/Stability-AI/stablediffusion#depth-conditional-stable-diffusion) | Depth-to-Image Generation | -| [stable_diffusion_2](./api/pipelines/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image | -| [stable_diffusion_safe](./api/pipelines/stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | -| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation | -| [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation | -| [stochastic_karras_ve](./api/pipelines/stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | -| [unclip](./api/pipelines/unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation | -| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | -| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | -| [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion 
Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | -| [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | - ## Contribution We ❤️ contributions from the open-source community! @@ -160,6 +116,84 @@ You can look out for [issues](https://github.com/huggingface/diffusers/issues) y Also, say 👋 in our public Discord channel Join us on Discord. We discuss the hottest trends about diffusion models, help each other with contributions, personal projects or just hang out ☕. + +## Popular Tasks & Pipelines + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Task | Pipeline | 🤗 Hub |
+|---|---|:---:|
+| Unconditional Image Generation | DDPM | google/ddpm-ema-church-256 |
+| Text-to-Image | Stable Diffusion Text-to-Image | runwayml/stable-diffusion-v1-5 |
+| Text-to-Image | unclip | kakaobrain/karlo-v1-alpha |
+| Text-to-Image | if | DeepFloyd/IF-I-XL-v1.0 |
+| Text-guided Image-to-Image | Controlnet | lllyasviel/sd-controlnet-canny |
+| Text-guided Image-to-Image | Instruct Pix2Pix | timbrooks/instruct-pix2pix |
+| Text-guided Image-to-Image | Stable Diffusion Image-to-Image | runwayml/stable-diffusion-v1-5 |
+| Text-guided Image Inpainting | Stable Diffusion Inpaint | runwayml/stable-diffusion-inpainting |
+| Image Variation | Stable Diffusion Image Variation | lambdalabs/sd-image-variations-diffusers |
+| Super Resolution | Stable Diffusion Upscale | stabilityai/stable-diffusion-x4-upscaler |
+| Super Resolution | Stable Diffusion Latent Upscale | stabilityai/sd-x2-latent-upscaler |
+ +## ❤️ Popular repos building on 🧨 Diffusers + +- https://github.com/microsoft/TaskMatrix +- https://github.com/invoke-ai/InvokeAI +- https://github.com/apple/ml-stable-diffusion +- https://github.com/Sanster/lama-cleaner +- https://github.com/IDEA-Research/Grounded-Segment-Anything +- https://github.com/ashawkey/stable-dreamfusion +- https://github.com/deep-floyd/IF +- https://github.com/bentoml/BentoML +- https://github.com/bmaltais/kohya_ss + ## Credits This library concretizes previous work by many different authors and would not have been possible without their great research and implementations. We'd like to thank, in particular, the following implementations which have helped us in our development and without which the API could not have been as polished today: From b402604de4c2ea4f4bb689201d848b0e73513430 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 23 May 2023 18:28:39 +0200 Subject: [PATCH 019/199] Update README.md (#3525) --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 17c883519b39..709abaff8e65 100644 --- a/README.md +++ b/README.md @@ -59,8 +59,9 @@ Generating outputs is super easy with 🤗 Diffusers. To generate an image from ```python from diffusers import DiffusionPipeline +import torch -pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") +pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) pipeline.to("cuda") pipeline("An image of a squirrel in Picasso style").images[0] ``` @@ -182,7 +183,7 @@ just hang out ☕. -## ❤️ Popular repos building on 🧨 Diffusers +## Popular using 🧨 Diffusers - https://github.com/microsoft/TaskMatrix - https://github.com/invoke-ai/InvokeAI @@ -193,6 +194,9 @@ just hang out ☕. - https://github.com/deep-floyd/IF - https://github.com/bentoml/BentoML - https://github.com/bmaltais/kohya_ss +- +3000 other amazing GitHub repositories 💪 + +Thank you for using us ❤️ ## Credits From abab61d49ea2aad144f70fb30700d07942d30872 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 23 May 2023 17:29:18 +0100 Subject: [PATCH 020/199] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 709abaff8e65..cb6e29ee1406 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ just hang out ☕. -## Popular using 🧨 Diffusers +## Popular libraries using 🧨 Diffusers - https://github.com/microsoft/TaskMatrix - https://github.com/invoke-ai/InvokeAI From bde2cb5d9b335aa87ff989445cf2e2e9607ad400 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Tue, 23 May 2023 19:24:17 +0200 Subject: [PATCH 021/199] Run `torch.compile` tests in separate subprocesses (#3503) * Run ControlNet compile test in a separate subprocess `torch.compile()` spawns several subprocesses and the GPU memory used was not reclaimed after the test ran. This approach was taken from `transformers`. * Style * Prepare a couple more compile tests to run in subprocess. * Use require_torch_2 decorator. * Test inpaint_compile in subprocess. * Run img2img compile test in subprocess. * Run stable diffusion compile test in subprocess. * style * Temporarily trigger on pr to test. * Revert "Temporarily trigger on pr to test." This reverts commit 82d76868ddf9cc634a9f14b2b0aef1d5433cd750. 
--- src/diffusers/utils/testing_utils.py | 45 ++++++++++ tests/models/test_modeling_common.py | 44 ++++++--- tests/pipelines/controlnet/test_controlnet.py | 90 +++++++++++-------- .../stable_diffusion/test_stable_diffusion.py | 86 ++++++++++-------- .../test_stable_diffusion_img2img.py | 70 ++++++++++----- .../test_stable_diffusion_inpaint.py | 73 ++++++++++----- tests/pipelines/test_pipelines.py | 73 +++++++++------ 7 files changed, 318 insertions(+), 163 deletions(-) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 93d0ef5b7b5f..7d5e6bcacecd 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -1,5 +1,6 @@ import inspect import logging +import multiprocessing import os import random import re @@ -477,6 +478,50 @@ def summary_failures_short(tr): config.option.tbstyle = orig_tbstyle +# Taken from: https://github.com/huggingface/transformers/blob/3658488ff77ff8d45101293e749263acf437f4d5/src/transformers/testing_utils.py#L1787 +def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None): + """ + To run a test in a subprocess. In particular, this can avoid (GPU) memory issue. + + Args: + test_case (`unittest.TestCase`): + The test that will run `target_func`. + target_func (`Callable`): + The function implementing the actual testing logic. + inputs (`dict`, *optional*, defaults to `None`): + The inputs that will be passed to `target_func` through an (input) queue. + timeout (`int`, *optional*, defaults to `None`): + The timeout (in seconds) that will be passed to the input and output queues. If not specified, the env. + variable `PYTEST_TIMEOUT` will be checked. If still `None`, its value will be set to `600`. + """ + if timeout is None: + timeout = int(os.environ.get("PYTEST_TIMEOUT", 600)) + + start_methohd = "spawn" + ctx = multiprocessing.get_context(start_methohd) + + input_queue = ctx.Queue(1) + output_queue = ctx.JoinableQueue(1) + + # We can't send `unittest.TestCase` to the child, otherwise we get issues regarding pickle. + input_queue.put(inputs, timeout=timeout) + + process = ctx.Process(target=target_func, args=(input_queue, output_queue, timeout)) + process.start() + # Kill the child process if we can't get outputs from it in time: otherwise, the hanging subprocess prevents + # the test to exit properly. 
+ try: + results = output_queue.get(timeout=timeout) + output_queue.task_done() + except Exception as e: + process.terminate() + test_case.fail(e) + process.join(timeout=timeout) + + if results["error"] is not None: + test_case.fail(f'{results["error"]}') + + class CaptureLogger: """ Args: diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index b2c5f2d79d4f..adc18e003a56 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -15,6 +15,7 @@ import inspect import tempfile +import traceback import unittest import unittest.mock as mock from typing import Dict, List, Tuple @@ -27,7 +28,31 @@ from diffusers.models import UNet2DConditionModel from diffusers.training_utils import EMAModel from diffusers.utils import logging, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_gpu +from diffusers.utils.testing_utils import CaptureLogger, require_torch_2, run_test_in_subprocess + + +# Will be run via run_test_in_subprocess +def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout): + error = None + try: + init_dict, model_class = in_queue.get(timeout=timeout) + + model = model_class(**init_dict) + model.to(torch_device) + model = torch.compile(model) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + new_model = model_class.from_pretrained(tmpdirname) + new_model.to(torch_device) + + assert new_model.__class__ == model_class + except Exception: + error = f"{traceback.format_exc()}" + + results = {"error": error} + out_queue.put(results, timeout=timeout) + out_queue.join() class ModelUtilsTest(unittest.TestCase): @@ -235,20 +260,11 @@ def test_from_save_pretrained_variant(self): max_diff = (image - new_image).abs().sum().item() self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") - @require_torch_gpu + @require_torch_2 def test_from_save_pretrained_dynamo(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model = torch.compile(model) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - new_model.to(torch_device) - - assert new_model.__class__ == self.model_class + init_dict, _ = self.prepare_init_args_and_inputs_for_common() + inputs = [init_dict, self.model_class] + run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=inputs) def test_from_save_pretrained_dtype(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 97b5e20f3c14..ee6f8fce2508 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -15,11 +15,11 @@ import gc import tempfile +import traceback import unittest import numpy as np import torch -from packaging import version from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import ( @@ -32,7 +32,12 @@ from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet import MultiControlNetModel from diffusers.utils import load_image, load_numpy, randn_tensor, slow, torch_device from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from 
diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_2, + require_torch_gpu, + run_test_in_subprocess, +) from ..pipeline_params import ( TEXT_TO_IMAGE_BATCH_PARAMS, @@ -44,6 +49,51 @@ enable_full_determinism() +# Will be run via run_test_in_subprocess +def _test_stable_diffusion_compile(in_queue, out_queue, timeout): + error = None + try: + _ = in_queue.get(timeout=timeout) + + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe.to("cuda") + pipe.set_progress_bar_config(disable=None) + + pipe.unet.to(memory_format=torch.channels_last) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + pipe.controlnet.to(memory_format=torch.channels_last) + pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "bird" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ) + + output = pipe(prompt, image, generator=generator, output_type="np") + image = output.images[0] + + assert image.shape == (768, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy" + ) + + assert np.abs(expected_image - image).max() < 1.0 + + except Exception: + error = f"{traceback.format_exc()}" + + results = {"error": error} + out_queue.put(results, timeout=timeout) + out_queue.join() + + class ControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionControlNetPipeline params = TEXT_TO_IMAGE_PARAMS @@ -594,41 +644,9 @@ def test_canny_guess_mode(self): expected_slice = np.array([0.2724, 0.2846, 0.2724, 0.3843, 0.3682, 0.2736, 0.4675, 0.3862, 0.2887]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + @require_torch_2 def test_stable_diffusion_compile(self): - if version.parse(torch.__version__) < version.parse("2.0"): - print(f"Test `test_stable_diffusion_ddim` is skipped because {torch.__version__} is < 2.0") - return - - controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") - - pipe = StableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet - ) - pipe.to("cuda") - pipe.set_progress_bar_config(disable=None) - - pipe.unet.to(memory_format=torch.channels_last) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - pipe.controlnet.to(memory_format=torch.channels_last) - pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) - - generator = torch.Generator(device="cpu").manual_seed(0) - prompt = "bird" - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" - ) - - output = pipe(prompt, image, generator=generator, output_type="np") - image = output.images[0] - - assert image.shape == (768, 512, 3) - - expected_image = load_numpy( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny_out_full.npy" - ) - - assert np.abs(expected_image - image).max() < 1.0 + run_test_in_subprocess(test_case=self, 
target_func=_test_stable_diffusion_compile, inputs=None) def test_v11_shuffle_global_pool_conditions(self): controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11e_sd15_shuffle") diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index aec4436710b9..6140bf771e65 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -15,19 +15,14 @@ import gc -import os -import signal -import subprocess -import sys import tempfile import time +import traceback import unittest import numpy as np -import pytest import torch from huggingface_hub import hf_hub_download -from packaging import version from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import ( @@ -44,25 +39,52 @@ ) from diffusers.models.attention_processor import AttnProcessor from diffusers.utils import load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import CaptureLogger, enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + CaptureLogger, + enable_full_determinism, + require_torch_2, + require_torch_gpu, + run_test_in_subprocess, +) from ...models.test_models_unet_2d_condition import create_lora_layers from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin -@pytest.fixture(autouse=True) -def process_fixture(): - # This will be run before each test - command = [sys.executable, os.path.abspath(__file__)] - process = subprocess.Popen(command) - enable_full_determinism() - yield process - # This will be run after each test +enable_full_determinism() + + +# Will be run via run_test_in_subprocess +def _test_stable_diffusion_compile(in_queue, out_queue, timeout): + error = None try: - os.kill(process.pid, signal.SIGTERM) # or signal.SIGKILL - except ProcessLookupError: - pass + inputs = in_queue.get(timeout=timeout) + torch_device = inputs.pop("torch_device") + seed = inputs.pop("seed") + inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed) + + sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) + sd_pipe = sd_pipe.to(torch_device) + + sd_pipe.unet.to(memory_format=torch.channels_last) + sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True) + + sd_pipe.set_progress_bar_config(disable=None) + + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239]) + assert np.abs(image_slice - expected_slice).max() < 5e-3 + except Exception: + error = f"{traceback.format_exc()}" + + results = {"error": error} + out_queue.put(results, timeout=timeout) + out_queue.join() class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): @@ -927,27 +949,15 @@ def test_stable_diffusion_textual_inversion(self): max_diff = np.abs(expected_image - image).max() assert max_diff < 8e-1 + @require_torch_2 def test_stable_diffusion_compile(self): - if version.parse(torch.__version__) < version.parse("2.0"): - print(f"Test `test_stable_diffusion_ddim` is skipped because 
{torch.__version__} is < 2.0") - return - - sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - sd_pipe.scheduler = DDIMScheduler.from_config(sd_pipe.scheduler.config) - sd_pipe = sd_pipe.to(torch_device) - - sd_pipe.unet.to(memory_format=torch.channels_last) - sd_pipe.unet = torch.compile(sd_pipe.unet, mode="reduce-overhead", fullgraph=True) - - sd_pipe.set_progress_bar_config(disable=None) - - inputs = self.get_inputs(torch_device) - image = sd_pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.38019, 0.28647, 0.27321, 0.40377, 0.38290, 0.35446, 0.39218, 0.38165, 0.42239]) - assert np.abs(image_slice - expected_slice).max() < 5e-3 + seed = 0 + inputs = self.get_inputs(torch_device, seed=seed) + # Can't pickle a Generator object + del inputs["generator"] + inputs["torch_device"] = torch_device + inputs["seed"] = seed + run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=inputs) @slow diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 8ab252b9be80..33305d5980be 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -15,11 +15,11 @@ import gc import random +import traceback import unittest import numpy as np import torch -from packaging import version from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import ( @@ -34,7 +34,13 @@ ) from diffusers.image_processor import VaeImageProcessor from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_2, + require_torch_gpu, + run_test_in_subprocess, + skip_mps, +) from ..pipeline_params import ( IMAGE_TO_IMAGE_IMAGE_PARAMS, @@ -47,6 +53,38 @@ enable_full_determinism() +# Will be run via run_test_in_subprocess +def _test_img2img_compile(in_queue, out_queue, timeout): + error = None + try: + inputs = in_queue.get(timeout=timeout) + torch_device = inputs.pop("torch_device") + seed = inputs.pop("seed") + inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed) + + pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + pipe.unet.to(memory_format=torch.channels_last) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 768, 3) + expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781]) + + assert np.abs(expected_slice - image_slice).max() < 1e-3 + except Exception: + error = f"{traceback.format_exc()}" + + results = {"error": error} + out_queue.put(results, timeout=timeout) + out_queue.join() + + class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} @@ -464,27 
+502,15 @@ def test_img2img_safety_checker_works(self): assert out.nsfw_content_detected[0], f"Safety checker should work for prompt: {inputs['prompt']}" assert np.abs(out.images[0]).sum() < 1e-5 # should be all zeros + @require_torch_2 def test_img2img_compile(self): - if version.parse(torch.__version__) < version.parse("2.0"): - print(f"Test `test_stable_diffusion_ddim` is skipped because {torch.__version__} is < 2.0") - return - - pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", safety_checker=None) - pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - pipe.unet.to(memory_format=torch.channels_last) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, -3:, -3:, -1].flatten() - - assert image.shape == (1, 512, 768, 3) - expected_slice = np.array([0.0593, 0.0607, 0.0851, 0.0582, 0.0636, 0.0721, 0.0751, 0.0981, 0.0781]) - - assert np.abs(expected_slice - image_slice).max() < 1e-3 + seed = 0 + inputs = self.get_inputs(torch_device, seed=seed) + # Can't pickle a Generator object + del inputs["generator"] + inputs["torch_device"] = torch_device + inputs["seed"] = seed + run_test_in_subprocess(test_case=self, target_func=_test_img2img_compile, inputs=inputs) @nightly diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 44de277ead07..eb1c097dfba0 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -15,11 +15,11 @@ import gc import random +import traceback import unittest import numpy as np import torch -from packaging import version from PIL import Image from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer @@ -33,7 +33,12 @@ ) from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device -from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_2, + require_torch_gpu, + run_test_in_subprocess, +) from ...models.test_models_unet_2d_condition import create_lora_layers from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS @@ -43,6 +48,40 @@ enable_full_determinism() +# Will be run via run_test_in_subprocess +def _test_inpaint_compile(in_queue, out_queue, timeout): + error = None + try: + inputs = in_queue.get(timeout=timeout) + torch_device = inputs.pop("torch_device") + seed = inputs.pop("seed") + inputs["generator"] = torch.Generator(device=torch_device).manual_seed(seed) + + pipe = StableDiffusionInpaintPipeline.from_pretrained( + "runwayml/stable-diffusion-inpainting", safety_checker=None + ) + pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + pipe.unet.to(memory_format=torch.channels_last) + pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + + image = pipe(**inputs).images + image_slice = image[0, 253:256, 253:256, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = 
np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272]) + + assert np.abs(expected_slice - image_slice).max() < 3e-3 + except Exception: + error = f"{traceback.format_exc()}" + + results = {"error": error} + out_queue.put(results, timeout=timeout) + out_queue.join() + + class StableDiffusionInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS @@ -315,29 +354,15 @@ def test_stable_diffusion_inpaint_with_sequential_cpu_offloading(self): # make sure that less than 2.2 GB is allocated assert mem_bytes < 2.2 * 10**9 + @require_torch_2 def test_inpaint_compile(self): - if version.parse(torch.__version__) < version.parse("2.0"): - print(f"Test `test_stable_diffusion_ddim` is skipped because {torch.__version__} is < 2.0") - return - - pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", safety_checker=None - ) - pipe.scheduler = PNDMScheduler.from_config(pipe.scheduler.config) - pipe.to(torch_device) - pipe.set_progress_bar_config(disable=None) - - pipe.unet.to(memory_format=torch.channels_last) - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - inputs = self.get_inputs(torch_device) - image = pipe(**inputs).images - image_slice = image[0, 253:256, 253:256, -1].flatten() - - assert image.shape == (1, 512, 512, 3) - expected_slice = np.array([0.0425, 0.0273, 0.0344, 0.1694, 0.1727, 0.1812, 0.3256, 0.3311, 0.3272]) - - assert np.abs(expected_slice - image_slice).max() < 3e-3 + seed = 0 + inputs = self.get_inputs(torch_device, seed=seed) + # Can't pickle a Generator object + del inputs["generator"] + inputs["torch_device"] = torch_device + inputs["seed"] = seed + run_test_in_subprocess(test_case=self, target_func=_test_inpaint_compile, inputs=inputs) def test_stable_diffusion_inpaint_pil_input_resolution_test(self): pipe = StableDiffusionInpaintPipeline.from_pretrained( diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index d05785a31315..8eaee0915a4f 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -20,6 +20,7 @@ import shutil import sys import tempfile +import traceback import unittest import unittest.mock as mock @@ -73,12 +74,54 @@ require_compel, require_flax, require_torch_gpu, + run_test_in_subprocess, ) enable_full_determinism() +# Will be run via run_test_in_subprocess +def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout): + error = None + try: + # 1. 
Load models + model = UNet2DModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=3, + out_channels=3, + down_block_types=("DownBlock2D", "AttnDownBlock2D"), + up_block_types=("AttnUpBlock2D", "UpBlock2D"), + ) + model = torch.compile(model) + scheduler = DDPMScheduler(num_train_timesteps=10) + + ddpm = DDPMPipeline(model, scheduler) + ddpm.to(torch_device) + ddpm.set_progress_bar_config(disable=None) + + with tempfile.TemporaryDirectory() as tmpdirname: + ddpm.save_pretrained(tmpdirname) + new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) + new_ddpm.to(torch_device) + + generator = torch.Generator(device=torch_device).manual_seed(0) + image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images + + generator = torch.Generator(device=torch_device).manual_seed(0) + new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images + + assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" + except Exception: + error = f"{traceback.format_exc()}" + + results = {"error": error} + out_queue.put(results, timeout=timeout) + out_queue.join() + + class CustomEncoder(ModelMixin, ConfigMixin): def __init__(self): super().__init__() @@ -1342,35 +1385,7 @@ def test_from_save_pretrained(self): @require_torch_2 def test_from_save_pretrained_dynamo(self): - # 1. Load models - model = UNet2DModel( - block_out_channels=(32, 64), - layers_per_block=2, - sample_size=32, - in_channels=3, - out_channels=3, - down_block_types=("DownBlock2D", "AttnDownBlock2D"), - up_block_types=("AttnUpBlock2D", "UpBlock2D"), - ) - model = torch.compile(model) - scheduler = DDPMScheduler(num_train_timesteps=10) - - ddpm = DDPMPipeline(model, scheduler) - ddpm.to(torch_device) - ddpm.set_progress_bar_config(disable=None) - - with tempfile.TemporaryDirectory() as tmpdirname: - ddpm.save_pretrained(tmpdirname) - new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) - new_ddpm.to(torch_device) - - generator = torch.Generator(device=torch_device).manual_seed(0) - image = ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - generator = torch.Generator(device=torch_device).manual_seed(0) - new_image = new_ddpm(generator=generator, num_inference_steps=5, output_type="numpy").images - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" + run_test_in_subprocess(test_case=self, target_func=_test_from_save_pretrained_dynamo, inputs=None) def test_from_pretrained_hub(self): model_path = "google/ddpm-cifar10-32" From c13dbd5c3a53017d27de35ad77b8d57f04c8ec7c Mon Sep 17 00:00:00 2001 From: Will Berman Date: Tue, 23 May 2023 13:11:53 -0700 Subject: [PATCH 022/199] fix attention mask pad check (#3531) --- src/diffusers/models/attention_processor.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index d0e2e7bd2dac..13c7afc8e922 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -381,12 +381,7 @@ def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, return attention_mask current_length: int = attention_mask.shape[-1] - if current_length > target_length: - # we *could* trim the mask with: - # attention_mask = attention_mask[:,:target_length] - # but this is weird enough that it's more likely to be a mistake than a shortcut - raise ValueError(f"mask's length 
({current_length}) exceeds the sequence length ({target_length}).") - elif current_length < target_length: + if current_length != target_length: if attention_mask.device.type == "mps": # HACK: MPS: Does not support padding by greater than dimension of input tensor. # Instead, we can manually construct the padding tensor. From db56f8a4f5b433ec600d0acf69026e8de375f3a4 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 24 May 2023 03:17:41 -0700 Subject: [PATCH 023/199] explicit broadcasts for assignments (#3535) --- src/diffusers/models/resnet.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index debe120e8ead..92bc89c80099 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -433,7 +433,8 @@ def forward(self, x): x = F.pad(x, (self.pad,) * 4, self.pad_mode) weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]]) indices = torch.arange(x.shape[1], device=x.device) - weight[indices, indices] = self.kernel.to(weight) + kernel = self.kernel.to(weight)[None, :].expand(x.shape[1], -1, -1) + weight[indices, indices] = kernel return F.conv2d(x, weight, stride=2) @@ -449,7 +450,8 @@ def forward(self, x): x = F.pad(x, ((self.pad + 1) // 2,) * 4, self.pad_mode) weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]]) indices = torch.arange(x.shape[1], device=x.device) - weight[indices, indices] = self.kernel.to(weight) + kernel = self.kernel.to(weight)[None, :].expand(x.shape[1], -1, -1) + weight[indices, indices] = kernel return F.conv_transpose2d(x, weight, stride=2, padding=self.pad * 2 + 1) From 8e69708b0d4f2784676cbfd9bfefa487d9f1ebb3 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 24 May 2023 16:16:28 +0530 Subject: [PATCH 024/199] [Examples/DreamBooth] refactor save_model_card utility in dreambooth examples (#3543) refactor save_model_card utility in dreambooth examples. 
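Concretely, the piece being refactored is the tag selection for the generated model card, which now depends on the pipeline class. A standalone sketch of that logic (the `model_card_tags` helper is illustrative, not a function added by this patch):

```python
from diffusers import DiffusionPipeline, StableDiffusionPipeline


def model_card_tags(pipeline: DiffusionPipeline) -> list:
    # Stable Diffusion checkpoints keep the existing tags; other pipelines
    # (e.g. DeepFloyd IF) get "if"-style tags instead.
    if isinstance(pipeline, StableDiffusionPipeline):
        return ["stable-diffusion", "stable-diffusion-diffusers", "text-to-image", "diffusers", "dreambooth"]
    return ["if", "if-diffusers", "text-to-image", "diffusers", "dreambooth"]
```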
--- examples/dreambooth/train_dreambooth.py | 16 +++++++++++++--- examples/dreambooth/train_dreambooth_lora.py | 17 +++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index ad43ee7aeee2..158d03185a54 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -46,6 +46,7 @@ DDPMScheduler, DiffusionPipeline, DPMSolverMultistepScheduler, + StableDiffusionPipeline, UNet2DConditionModel, ) from diffusers.optimization import get_scheduler @@ -62,7 +63,15 @@ logger = get_logger(__name__) -def save_model_card(repo_id: str, images=None, base_model=str, train_text_encoder=False, prompt=str, repo_folder=None): +def save_model_card( + repo_id: str, + images=None, + base_model=str, + train_text_encoder=False, + prompt=str, + repo_folder=None, + pipeline: DiffusionPipeline = None, +): img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) @@ -74,8 +83,8 @@ def save_model_card(repo_id: str, images=None, base_model=str, train_text_encode base_model: {base_model} instance_prompt: {prompt} tags: -- stable-diffusion -- stable-diffusion-diffusers +- {'stable-diffusion' if isinstance(pipeline, StableDiffusionPipeline) else 'if'} +- {'stable-diffusion-diffusers' if isinstance(pipeline, StableDiffusionPipeline) else 'if-diffusers'} - text-to-image - diffusers - dreambooth @@ -1297,6 +1306,7 @@ def compute_text_embeddings(prompt): train_text_encoder=args.train_text_encoder, prompt=args.instance_prompt, repo_folder=args.output_dir, + pipeline=pipeline, ) upload_folder( repo_id=repo_id, diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index e640542e36da..4ff759dcd6d4 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -68,7 +68,15 @@ logger = get_logger(__name__) -def save_model_card(repo_id: str, images=None, base_model=str, train_text_encoder=False, prompt=str, repo_folder=None): +def save_model_card( + repo_id: str, + images=None, + base_model=str, + train_text_encoder=False, + prompt=str, + repo_folder=None, + pipeline: DiffusionPipeline = None, +): img_str = "" for i, image in enumerate(images): image.save(os.path.join(repo_folder, f"image_{i}.png")) @@ -80,8 +88,8 @@ def save_model_card(repo_id: str, images=None, base_model=str, train_text_encode base_model: {base_model} instance_prompt: {prompt} tags: -- stable-diffusion -- stable-diffusion-diffusers +- {'stable-diffusion' if isinstance(pipeline, StableDiffusionPipeline) else 'if'} +- {'stable-diffusion-diffusers' if isinstance(pipeline, StableDiffusionPipeline) else 'if-diffusers'} - text-to-image - diffusers - lora @@ -844,7 +852,7 @@ def main(args): hidden_size=module.out_features, cross_attention_dim=None ) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) - temp_pipeline = StableDiffusionPipeline.from_pretrained( + temp_pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, text_encoder=text_encoder ) temp_pipeline._modify_text_encoder(text_lora_attn_procs) @@ -1332,6 +1340,7 @@ def compute_text_embeddings(prompt): train_text_encoder=args.train_text_encoder, prompt=args.instance_prompt, repo_folder=args.output_dir, + pipeline=pipeline, ) upload_folder( repo_id=repo_id, From a94977b8b32b94ccd00d2f8f812aadb46764baba Mon Sep 17 00:00:00 2001 From: Isotr0py <41363108+Isotr0py@users.noreply.github.com> 
Date: Wed, 24 May 2023 20:28:08 +0800 Subject: [PATCH 025/199] Fix panorama to support all schedulers (#3546) * refactor blocks init * refactor blocks loop * remove unused function and warnings * fix scheduler update location * reformat code * reformat code again * fix PNDM test case * reformat pndm test case --- .../pipeline_stable_diffusion_panorama.py | 33 ++++++++----------- .../test_stable_diffusion_panorama.py | 15 ++++++--- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 223f8a236efa..66706c806a81 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import inspect import warnings from typing import Any, Callable, Dict, List, Optional, Union @@ -21,7 +22,7 @@ from ...image_processor import VaeImageProcessor from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel -from ...schedulers import DDIMScheduler, PNDMScheduler +from ...schedulers import DDIMScheduler from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring from ..pipeline_utils import DiffusionPipeline from . import StableDiffusionPipelineOutput @@ -96,9 +97,6 @@ def __init__( ): super().__init__() - if isinstance(scheduler, PNDMScheduler): - logger.error("PNDMScheduler for this pipeline is currently not supported.") - if safety_checker is None and requires_safety_checker: logger.warning( f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" @@ -612,7 +610,7 @@ def __call__( # 6. Define panorama grid and initialize views for synthesis. 
views = self.get_views(height, width) - blocks_model_outputs = [None] * len(views) + views_scheduler_status = [copy.deepcopy(self.scheduler.__dict__)] * len(views) count = torch.zeros_like(latents) value = torch.zeros_like(latents) @@ -637,6 +635,9 @@ def __call__( # get the latents corresponding to the current view coordinates latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] + # rematch block's scheduler status + self.scheduler.__dict__.update(views_scheduler_status[j]) + # expand the latents if we are doing classifier free guidance latent_model_input = ( torch.cat([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view @@ -657,21 +658,13 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - if hasattr(self.scheduler, "model_outputs"): - # rematch model_outputs in each block - if i >= 1: - self.scheduler.model_outputs = blocks_model_outputs[j] - latents_view_denoised = self.scheduler.step( - noise_pred, t, latents_for_view, **extra_step_kwargs - ).prev_sample - # collect model_outputs - blocks_model_outputs[j] = [ - output if output is not None else None for output in self.scheduler.model_outputs - ] - else: - latents_view_denoised = self.scheduler.step( - noise_pred, t, latents_for_view, **extra_step_kwargs - ).prev_sample + latents_view_denoised = self.scheduler.step( + noise_pred, t, latents_for_view, **extra_step_kwargs + ).prev_sample + + # save views scheduler status after sample + views_scheduler_status[j] = copy.deepcopy(self.scheduler.__dict__) + value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised count[:, :, h_start:h_end, w_start:w_end] += 1 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 02a15b2a29dc..021065416838 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -174,15 +174,22 @@ def test_stable_diffusion_panorama_euler(self): def test_stable_diffusion_panorama_pndm(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() - components["scheduler"] = PNDMScheduler() + components["scheduler"] = PNDMScheduler( + beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True + ) sd_pipe = StableDiffusionPanoramaPipeline(**components) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - # the pipeline does not expect pndm so test if it raises error. 
- with self.assertRaises(ValueError): - _ = sd_pipe(**inputs).images + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array([0.6391, 0.6291, 0.4861, 0.5134, 0.5552, 0.4578, 0.5032, 0.5023, 0.4539]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 @slow From f19f1287358beb31a71bc1bf0ef680a2c6155964 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 25 May 2023 12:11:20 +0200 Subject: [PATCH 026/199] Add open parti prompts to docs (#3549) * Add open parti prompts * More changes --- docs/source/en/conceptual/evaluation.mdx | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docs/source/en/conceptual/evaluation.mdx b/docs/source/en/conceptual/evaluation.mdx index 2721adea0c16..6e5c14acad4e 100644 --- a/docs/source/en/conceptual/evaluation.mdx +++ b/docs/source/en/conceptual/evaluation.mdx @@ -37,7 +37,8 @@ We cover Diffusion models with the following pipelines: ## Qualitative Evaluation -Qualitative evaluation typically involves human assessment of generated images. Quality is measured across aspects such as compositionality, image-text alignment, and spatial relations. Common prompts provide a degree of uniformity for subjective metrics. DrawBench and PartiPrompts are prompt datasets used for qualitative benchmarking. DrawBench and PartiPrompts were introduced by [Imagen](https://imagen.research.google/) and [Parti](https://parti.research.google/) respectively. +Qualitative evaluation typically involves human assessment of generated images. Quality is measured across aspects such as compositionality, image-text alignment, and spatial relations. Common prompts provide a degree of uniformity for subjective metrics. +DrawBench and PartiPrompts are prompt datasets used for qualitative benchmarking. DrawBench and PartiPrompts were introduced by [Imagen](https://imagen.research.google/) and [Parti](https://parti.research.google/) respectively. From the [official Parti website](https://parti.research.google/): @@ -51,7 +52,13 @@ PartiPrompts has the following columns: - Category of the prompt (such as “Abstract”, “World Knowledge”, etc.) - Challenge reflecting the difficulty (such as “Basic”, “Complex”, “Writing & Symbols”, etc.) -These benchmarks allow for side-by-side human evaluation of different image generation models. Let’s see how we can use `diffusers` on a couple of PartiPrompts. +These benchmarks allow for side-by-side human evaluation of different image generation models. + +For this, the 🧨 Diffusers team has built **Open Parti Prompts**, which is a community-driven qualitative benchmark based on Parti Prompts to compare state-of-the-art open-source diffusion models: +- [Open Parti Prompts Game](https://huggingface.co/spaces/OpenGenAI/open-parti-prompts): For 10 parti prompts, 4 generated images are shown and the user selects the image that suits the prompt best. +- [Open Parti Prompts Leaderboard](https://huggingface.co/spaces/OpenGenAI/parti-prompts-leaderboard): The leaderboard comparing the currently best open-sourced diffusion models to each other. + +To manually compare images, let’s see how we can use `diffusers` on a couple of PartiPrompts. Below we show some prompts sampled across different challenges: Basic, Complex, Linguistic Structures, Imagination, and Writing & Symbols. Here we are using PartiPrompts as a [dataset](https://huggingface.co/datasets/nateraw/parti-prompts). 
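For reference, pulling the PartiPrompts dataset mentioned above into a script is roughly the following sketch (assuming the `datasets` library and the column names listed on the `nateraw/parti-prompts` dataset card):

```python
from datasets import load_dataset

# PartiPrompts, as referenced in the evaluation docs above.
parti_prompts = load_dataset("nateraw/parti-prompts", split="train")

# Sample a few prompts across categories and challenge levels.
for row in parti_prompts.shuffle(seed=0).select(range(5)):
    print(row["Prompt"], "|", row["Category"], "|", row["Challenge"])
```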
From 03b7a84cbee11fa1cff98e5275050f284da168df Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 25 May 2023 11:28:34 -1000 Subject: [PATCH 027/199] Add Kandinsky 2.1 (#3308) add kandinsky2.1 --------- Co-authored-by: yiyixuxu Co-authored-by: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Co-authored-by: ayushmangal Co-authored-by: Patrick von Platen Co-authored-by: Sayak Paul --- docs/source/en/_toctree.yml | 2 + docs/source/en/api/pipelines/kandinsky.mdx | 306 ++++ scripts/convert_kandinsky_to_diffusers.py | 1400 +++++++++++++++++ src/diffusers/__init__.py | 4 + src/diffusers/models/attention_processor.py | 45 +- src/diffusers/models/embeddings.py | 45 + src/diffusers/models/resnet.py | 13 +- src/diffusers/models/unet_2d_blocks.py | 30 +- src/diffusers/models/unet_2d_condition.py | 78 +- src/diffusers/models/vae.py | 37 +- src/diffusers/models/vq_model.py | 6 +- src/diffusers/pipelines/__init__.py | 6 + src/diffusers/pipelines/kandinsky/__init__.py | 19 + .../pipelines/kandinsky/pipeline_kandinsky.py | 463 ++++++ .../kandinsky/pipeline_kandinsky_img2img.py | 547 +++++++ .../kandinsky/pipeline_kandinsky_inpaint.py | 672 ++++++++ .../kandinsky/pipeline_kandinsky_prior.py | 563 +++++++ .../pipelines/kandinsky/text_encoder.py | 27 + .../versatile_diffusion/modeling_text_unet.py | 80 +- .../dummy_torch_and_transformers_objects.py | 60 + tests/pipelines/kandinsky/__init__.py | 0 tests/pipelines/kandinsky/test_kandinsky.py | 282 ++++ .../kandinsky/test_kandinsky_img2img.py | 303 ++++ .../kandinsky/test_kandinsky_inpaint.py | 313 ++++ .../kandinsky/test_kandinsky_prior.py | 236 +++ tests/pipelines/test_pipelines_common.py | 2 +- 26 files changed, 5497 insertions(+), 42 deletions(-) create mode 100644 docs/source/en/api/pipelines/kandinsky.mdx create mode 100644 scripts/convert_kandinsky_to_diffusers.py create mode 100644 src/diffusers/pipelines/kandinsky/__init__.py create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py create mode 100644 src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py create mode 100644 src/diffusers/pipelines/kandinsky/text_encoder.py create mode 100644 tests/pipelines/kandinsky/__init__.py create mode 100644 tests/pipelines/kandinsky/test_kandinsky.py create mode 100644 tests/pipelines/kandinsky/test_kandinsky_img2img.py create mode 100644 tests/pipelines/kandinsky/test_kandinsky_inpaint.py create mode 100644 tests/pipelines/kandinsky/test_kandinsky_prior.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index aa2d907da4bd..368ea30a2690 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -166,6 +166,8 @@ title: DiT - local: api/pipelines/if title: IF + - local: api/pipelines/kandinsky + title: Kandinsky - local: api/pipelines/latent_diffusion title: Latent Diffusion - local: api/pipelines/paint_by_example diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx new file mode 100644 index 000000000000..b5b4f0f06400 --- /dev/null +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -0,0 +1,306 @@ + + +# Kandinsky + +## Overview + +Kandinsky 2.1 inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas. 
+ +It uses [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for encoding images and text, and a diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach enhances the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. + +The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene555), [Anton Razzhigaev](https://github.com/razzant), [Aleksandr Nikolich](https://github.com/AlexWortega), [Igor Pavlov](https://github.com/boomb0om), [Andrey Kuznetsov](https://github.com/kuznetsoffandrey) and [Denis Dimitrov](https://github.com/denndimitrov) and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) + +## Available Pipelines: + +| Pipeline | Tasks | Colab +|---|---|:---:| +| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | - | +| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | - | +| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | - | + +## Usage example + +In the following, we will walk you through some cool examples of using the Kandinsky pipelines to create some visually aesthetic artwork. + +### Text-to-Image Generation + +For text-to-image generation, we need to use both [`KandinskyPriorPipeline`] and [`KandinskyPipeline`]. The first step is to encode text prompts with CLIP and then diffuse the CLIP text embeddings to CLIP image embeddings, as first proposed in [DALL-E 2](https://cdn.openai.com/papers/dall-e-2.pdf). Let's throw a fun prompt at Kandinsky to see what it comes up with :) + +```python +prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" +negative_prompt = "low quality, bad quality" +``` + +We will pass both the `prompt` and `negative_prompt` to our prior diffusion pipeline. In contrast to other diffusion pipelines, such as Stable Diffusion, the `prompt` and `negative_prompt` shall be passed separately so that we can retrieve a CLIP image embedding for each prompt input. You can use `guidance_scale`, and `num_inference_steps` arguments to guide this process, just like how you would normally do with all other pipelines in diffusers. + +```python +from diffusers import KandinskyPriorPipeline +import torch + +# create prior +pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 +) +pipe_prior.to("cuda") + +generator = torch.Generator(device="cuda").manual_seed(12) +image_emb = pipe_prior( + prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt +).images + +zero_image_emb = pipe_prior( + negative_prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt +).images +``` + +Once we create the image embedding, we can use [`KandinskyPipeline`] to generate images. 
+ +```python +from PIL import Image +from diffusers import KandinskyPipeline + + +def image_grid(imgs, rows, cols): + assert len(imgs) == rows * cols + + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + grid_w, grid_h = grid.size + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + +# create diffuser pipeline +pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) +pipe.to("cuda") + +images = pipe( + prompt, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + num_images_per_prompt=2, + height=768, + width=768, + num_inference_steps=100, + guidance_scale=4.0, + generator=generator, +).images +``` + +One cheeseburger monster coming up! Enjoy! + +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/cheeseburger.png) + +The Kandinsky model works extremely well with creative prompts. Here is some of the amazing art that can be created using the exact same process but with different prompts. + +```python +prompt = "bird eye view shot of a full body woman with cyan light orange magenta makeup, digital art, long braided hair her face separated by makeup in the style of yin Yang surrealism, symmetrical face, real image, contrasting tone, pastel gradient background" +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/hair.png) + +```python +prompt = "A car exploding into colorful dust" +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/dusts.png) + +```python +prompt = "editorial photography of an organic, almost liquid smoke style armchair" +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/smokechair.png) + +```python +prompt = "birds eye view of a quilted paper style alien planet landscape, vibrant colours, Cinematic lighting" +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/alienplanet.png) + + +### Text Guided Image-to-Image Generation + +The same Kandinsky model weights can be used for text-guided image-to-image translation. In this case, just make sure to load the weights using the [`KandinskyImg2ImgPipeline`] pipeline. + +**Note**: You can also directly move the weights of the text-to-image pipelines to the image-to-image pipelines +without loading them twice by making use of the [`~DiffusionPipeline.components`] function as explained [here](#converting-between-different-pipelines). + +Let's download an image. 
+ +```python +from PIL import Image +import requests +from io import BytesIO + +# download image +url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" +response = requests.get(url) +original_image = Image.open(BytesIO(response.content)).convert("RGB") +original_image = original_image.resize((768, 512)) +``` + +![img](https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg) + +```python +import torch +from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline + +# create prior +pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 +) +pipe_prior.to("cuda") + +# create img2img pipeline +pipe = KandinskyImg2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) +pipe.to("cuda") + +prompt = "A fantasy landscape, Cinematic lighting" +negative_prompt = "low quality, bad quality" + +generator = torch.Generator(device="cuda").manual_seed(30) +image_emb = pipe_prior( + prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt +).images + +zero_image_emb = pipe_prior( + negative_prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt +).images + +out = pipe( + prompt, + image=original_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + height=768, + width=768, + num_inference_steps=500, + strength=0.3, +) + +out.images[0].save("fantasy_land.png") +``` + +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/img2img_fantasyland.png) + + +### Text Guided Inpainting Generation + +You can use [`KandinskyInpaintPipeline`] to edit images. In this example, we will add a hat to the portrait of a cat. + +```python +from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline +from diffusers.utils import load_image +import torch +import numpy as np + +pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 +) +pipe_prior.to("cuda") + +prompt = "a hat" +image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + +pipe = KandinskyInpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16) +pipe.to("cuda") + +init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" +) + +mask = np.ones((768, 768), dtype=np.float32) +# Let's mask out an area above the cat's head +mask[:250, 250:-250] = 0 + +out = pipe( + prompt, + image=init_image, + mask_image=mask, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + height=768, + width=768, + num_inference_steps=150, +) + +image = out.images[0] +image.save("cat_with_hat.png") +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/inpaint_cat_hat.png) + +### Interpolate + +The [`KandinskyPriorPipeline`] also comes with a cool utility function that will allow you to interpolate the latent space of different images and texts super easily. Here is an example of how you can create an Impressionist-style portrait for your pet based on "The Starry Night". 
+ +Note that you can interpolate between texts and images - in the below example, we passed a text prompt "a cat" and two images to the `interplate` function, along with a `weights` variable containing the corresponding weights for each condition we interplate. + +```python +from diffusers import KandinskyPriorPipeline, KandinskyPipeline +from diffusers.utils import load_image +import PIL + +import torch +from torchvision import transforms + +pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 +) +pipe_prior.to("cuda") + +img1 = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" +) + +img2 = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/starry_night.jpeg" +) + +# add all the conditions we want to interpolate, can be either text or image +images_texts = ["a cat", img1, img2] +# specify the weights for each condition in images_texts +weights = [0.3, 0.3, 0.4] +image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) + +pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) +pipe.to("cuda") + +image = pipe( + "", image_embeds=image_emb, negative_image_embeds=zero_image_emb, height=768, width=768, num_inference_steps=150 +).images[0] + +image.save("starry_cat.png") +``` +![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/starry_cat.png) + + +## KandinskyPriorPipeline + +[[autodoc]] KandinskyPriorPipeline + - all + - __call__ + - interpolate + +## KandinskyPipeline + +[[autodoc]] KandinskyPipeline + - all + - __call__ + +## KandinskyInpaintPipeline + +[[autodoc]] KandinskyInpaintPipeline + - all + - __call__ + +## KandinskyImg2ImgPipeline + +[[autodoc]] KandinskyImg2ImgPipeline + - all + - __call__ + diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py new file mode 100644 index 000000000000..de9879f7f03b --- /dev/null +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -0,0 +1,1400 @@ +import argparse +import os +import tempfile + +import torch +from accelerate import load_checkpoint_and_dispatch + +from diffusers import UNet2DConditionModel +from diffusers.models.prior_transformer import PriorTransformer +from diffusers.models.vq_model import VQModel +from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel + + +""" +Example - From the diffusers root directory: + +Download weights: +```sh +$ wget https://huggingface.co/ai-forever/Kandinsky_2.1/blob/main/prior_fp16.ckpt +``` + +Convert the model: +```sh +python scripts/convert_kandinsky_to_diffusers.py \ + --prior_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/prior_fp16.ckpt \ + --clip_stat_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/ViT-L-14_stats.th \ + --text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/decoder_fp16.ckpt \ + --inpaint_text2img_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/inpainting_fp16.ckpt \ + --movq_checkpoint_path /home/yiyi_huggingface_co/Kandinsky-2/checkpoints_Kandinsky_2.1/movq_final.ckpt \ + --dump_path /home/yiyi_huggingface_co/dump \ + --debug decoder +``` +""" + + +# prior + +PRIOR_ORIGINAL_PREFIX = "model" + +# Uses default arguments +PRIOR_CONFIG = {} + + +def 
prior_model_from_original_config(): + model = PriorTransformer(**PRIOR_CONFIG) + + return model + + +def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint, clip_stats_checkpoint): + diffusers_checkpoint = {} + + # .time_embed.0 -> .time_embedding.linear_1 + diffusers_checkpoint.update( + { + "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.weight"], + "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.0.bias"], + } + ) + + # .clip_img_proj -> .proj_in + diffusers_checkpoint.update( + { + "proj_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.weight"], + "proj_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_img_proj.bias"], + } + ) + + # .text_emb_proj -> .embedding_proj + diffusers_checkpoint.update( + { + "embedding_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.weight"], + "embedding_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_emb_proj.bias"], + } + ) + + # .text_enc_proj -> .encoder_hidden_states_proj + diffusers_checkpoint.update( + { + "encoder_hidden_states_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.weight"], + "encoder_hidden_states_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.text_enc_proj.bias"], + } + ) + + # .positional_embedding -> .positional_embedding + diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.positional_embedding"]}) + + # .prd_emb -> .prd_embedding + diffusers_checkpoint.update({"prd_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.prd_emb"]}) + + # .time_embed.2 -> .time_embedding.linear_2 + diffusers_checkpoint.update( + { + "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.weight"], + "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.2.bias"], + } + ) + + # .resblocks. -> .transformer_blocks. 
+ for idx in range(len(model.transformer_blocks)): + diffusers_transformer_prefix = f"transformer_blocks.{idx}" + original_transformer_prefix = f"{PRIOR_ORIGINAL_PREFIX}.transformer.resblocks.{idx}" + + # .attn -> .attn1 + diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1" + original_attention_prefix = f"{original_transformer_prefix}.attn" + diffusers_checkpoint.update( + prior_attention_to_diffusers( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + original_attention_prefix=original_attention_prefix, + attention_head_dim=model.attention_head_dim, + ) + ) + + # .mlp -> .ff + diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff" + original_ff_prefix = f"{original_transformer_prefix}.mlp" + diffusers_checkpoint.update( + prior_ff_to_diffusers( + checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix + ) + ) + + # .ln_1 -> .norm1 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[ + f"{original_transformer_prefix}.ln_1.weight" + ], + f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"], + } + ) + + # .ln_2 -> .norm3 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[ + f"{original_transformer_prefix}.ln_2.weight" + ], + f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"], + } + ) + + # .final_ln -> .norm_out + diffusers_checkpoint.update( + { + "norm_out.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.weight"], + "norm_out.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.final_ln.bias"], + } + ) + + # .out_proj -> .proj_to_clip_embeddings + diffusers_checkpoint.update( + { + "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.weight"], + "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.out_proj.bias"], + } + ) + + # clip stats + clip_mean, clip_std = clip_stats_checkpoint + clip_mean = clip_mean[None, :] + clip_std = clip_std[None, :] + + diffusers_checkpoint.update({"clip_mean": clip_mean, "clip_std": clip_std}) + + return diffusers_checkpoint + + +def prior_attention_to_diffusers( + checkpoint, *, diffusers_attention_prefix, original_attention_prefix, attention_head_dim +): + diffusers_checkpoint = {} + + # .c_qkv -> .{to_q, to_k, to_v} + [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions( + weight=checkpoint[f"{original_attention_prefix}.c_qkv.weight"], + bias=checkpoint[f"{original_attention_prefix}.c_qkv.bias"], + split=3, + chunk_size=attention_head_dim, + ) + + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_q.weight": q_weight, + f"{diffusers_attention_prefix}.to_q.bias": q_bias, + f"{diffusers_attention_prefix}.to_k.weight": k_weight, + f"{diffusers_attention_prefix}.to_k.bias": k_bias, + f"{diffusers_attention_prefix}.to_v.weight": v_weight, + f"{diffusers_attention_prefix}.to_v.bias": v_bias, + } + ) + + # .c_proj -> .to_out.0 + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{original_attention_prefix}.c_proj.weight"], + f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{original_attention_prefix}.c_proj.bias"], + } + ) + + return diffusers_checkpoint + + +def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix): + diffusers_checkpoint = { + # .c_fc -> .net.0.proj + f"{diffusers_ff_prefix}.net.{0}.proj.weight": 
checkpoint[f"{original_ff_prefix}.c_fc.weight"], + f"{diffusers_ff_prefix}.net.{0}.proj.bias": checkpoint[f"{original_ff_prefix}.c_fc.bias"], + # .c_proj -> .net.2 + f"{diffusers_ff_prefix}.net.{2}.weight": checkpoint[f"{original_ff_prefix}.c_proj.weight"], + f"{diffusers_ff_prefix}.net.{2}.bias": checkpoint[f"{original_ff_prefix}.c_proj.bias"], + } + + return diffusers_checkpoint + + +# done prior + +# unet + +# We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can +# update then. + +UNET_CONFIG = { + "act_fn": "silu", + "attention_head_dim": 64, + "block_out_channels": (384, 768, 1152, 1536), + "center_input_sample": False, + "class_embed_type": "identity", + "cross_attention_dim": 768, + "down_block_types": ( + "ResnetDownsampleBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + ), + "downsample_padding": 1, + "dual_cross_attention": False, + "flip_sin_to_cos": True, + "freq_shift": 0, + "in_channels": 4, + "layers_per_block": 3, + "mid_block_scale_factor": 1, + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "norm_eps": 1e-05, + "norm_num_groups": 32, + "only_cross_attention": False, + "out_channels": 8, + "resnet_time_scale_shift": "scale_shift", + "sample_size": 64, + "up_block_types": ( + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "ResnetUpsampleBlock2D", + ), + "upcast_attention": False, + "use_linear_projection": False, +} + + +def unet_model_from_original_config(): + model = UNet2DConditionModel(**UNET_CONFIG) + + return model + + +def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + num_head_channels = UNET_CONFIG["attention_head_dim"] + + diffusers_checkpoint.update(unet_time_embeddings(checkpoint)) + diffusers_checkpoint.update(unet_conv_in(checkpoint)) + + # .input_blocks -> .down_blocks + + original_down_block_idx = 1 + + for diffusers_down_block_idx in range(len(model.down_blocks)): + checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint( + model, + checkpoint, + diffusers_down_block_idx=diffusers_down_block_idx, + original_down_block_idx=original_down_block_idx, + num_head_channels=num_head_channels, + ) + + original_down_block_idx += num_original_down_blocks + + diffusers_checkpoint.update(checkpoint_update) + + # done .input_blocks -> .down_blocks + + diffusers_checkpoint.update( + unet_midblock_to_diffusers_checkpoint( + model, + checkpoint, + num_head_channels=num_head_channels, + ) + ) + + # .output_blocks -> .up_blocks + + original_up_block_idx = 0 + + for diffusers_up_block_idx in range(len(model.up_blocks)): + checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint( + model, + checkpoint, + diffusers_up_block_idx=diffusers_up_block_idx, + original_up_block_idx=original_up_block_idx, + num_head_channels=num_head_channels, + ) + + original_up_block_idx += num_original_up_blocks + + diffusers_checkpoint.update(checkpoint_update) + + # done .output_blocks -> .up_blocks + + diffusers_checkpoint.update(unet_conv_norm_out(checkpoint)) + diffusers_checkpoint.update(unet_conv_out(checkpoint)) + + return diffusers_checkpoint + + +# done unet + +# inpaint unet + +# We are hardcoding the model configuration for now. If we need to generalize to more model configurations, we can +# update then. 
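# Note on the config below: INPAINT_UNET_CONFIG matches UNET_CONFIG above except for `in_channels` (9 instead of 4).
# Under the usual Kandinsky inpainting convention (an assumption here, not spelled out in this script), the UNet
# input is the channel-wise concatenation of the 4 noisy latents, the 4 masked-image latents and a 1-channel mask,
# roughly `sample = torch.cat([latents, masked_image_latents, mask], dim=1)`.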
+ +INPAINT_UNET_CONFIG = { + "act_fn": "silu", + "attention_head_dim": 64, + "block_out_channels": (384, 768, 1152, 1536), + "center_input_sample": False, + "class_embed_type": "identity", + "cross_attention_dim": 768, + "down_block_types": ( + "ResnetDownsampleBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + "SimpleCrossAttnDownBlock2D", + ), + "downsample_padding": 1, + "dual_cross_attention": False, + "flip_sin_to_cos": True, + "freq_shift": 0, + "in_channels": 9, + "layers_per_block": 3, + "mid_block_scale_factor": 1, + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "norm_eps": 1e-05, + "norm_num_groups": 32, + "only_cross_attention": False, + "out_channels": 8, + "resnet_time_scale_shift": "scale_shift", + "sample_size": 64, + "up_block_types": ( + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "SimpleCrossAttnUpBlock2D", + "ResnetUpsampleBlock2D", + ), + "upcast_attention": False, + "use_linear_projection": False, +} + + +def inpaint_unet_model_from_original_config(): + model = UNet2DConditionModel(**INPAINT_UNET_CONFIG) + + return model + + +def inpaint_unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + num_head_channels = UNET_CONFIG["attention_head_dim"] + + diffusers_checkpoint.update(unet_time_embeddings(checkpoint)) + diffusers_checkpoint.update(unet_conv_in(checkpoint)) + + # .input_blocks -> .down_blocks + + original_down_block_idx = 1 + + for diffusers_down_block_idx in range(len(model.down_blocks)): + checkpoint_update, num_original_down_blocks = unet_downblock_to_diffusers_checkpoint( + model, + checkpoint, + diffusers_down_block_idx=diffusers_down_block_idx, + original_down_block_idx=original_down_block_idx, + num_head_channels=num_head_channels, + ) + + original_down_block_idx += num_original_down_blocks + + diffusers_checkpoint.update(checkpoint_update) + + # done .input_blocks -> .down_blocks + + diffusers_checkpoint.update( + unet_midblock_to_diffusers_checkpoint( + model, + checkpoint, + num_head_channels=num_head_channels, + ) + ) + + # .output_blocks -> .up_blocks + + original_up_block_idx = 0 + + for diffusers_up_block_idx in range(len(model.up_blocks)): + checkpoint_update, num_original_up_blocks = unet_upblock_to_diffusers_checkpoint( + model, + checkpoint, + diffusers_up_block_idx=diffusers_up_block_idx, + original_up_block_idx=original_up_block_idx, + num_head_channels=num_head_channels, + ) + + original_up_block_idx += num_original_up_blocks + + diffusers_checkpoint.update(checkpoint_update) + + # done .output_blocks -> .up_blocks + + diffusers_checkpoint.update(unet_conv_norm_out(checkpoint)) + diffusers_checkpoint.update(unet_conv_out(checkpoint)) + + return diffusers_checkpoint + + +# done inpaint unet + +# text proj + +TEXT_PROJ_CONFIG = {} + + +def text_proj_from_original_config(): + model = KandinskyTextProjModel(**TEXT_PROJ_CONFIG) + return model + + +# Note that the input checkpoint is the original text2img model checkpoint +def text_proj_original_checkpoint_to_diffusers_checkpoint(checkpoint): + diffusers_checkpoint = { + # .text_seq_proj.0 -> .encoder_hidden_states_proj + "encoder_hidden_states_proj.weight": checkpoint["to_model_dim_n.weight"], + "encoder_hidden_states_proj.bias": checkpoint["to_model_dim_n.bias"], + # .clip_tok_proj -> .clip_extra_context_tokens_proj + "clip_extra_context_tokens_proj.weight": checkpoint["clip_to_seq.weight"], + "clip_extra_context_tokens_proj.bias": checkpoint["clip_to_seq.bias"], + # .proj_n -> .embedding_proj + 
"embedding_proj.weight": checkpoint["proj_n.weight"], + "embedding_proj.bias": checkpoint["proj_n.bias"], + # .ln_model_n -> .embedding_norm + "embedding_norm.weight": checkpoint["ln_model_n.weight"], + "embedding_norm.bias": checkpoint["ln_model_n.bias"], + # .clip_emb -> .clip_image_embeddings_project_to_time_embeddings + "clip_image_embeddings_project_to_time_embeddings.weight": checkpoint["img_layer.weight"], + "clip_image_embeddings_project_to_time_embeddings.bias": checkpoint["img_layer.bias"], + } + + return diffusers_checkpoint + + +# unet utils + + +# .time_embed -> .time_embedding +def unet_time_embeddings(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "time_embedding.linear_1.weight": checkpoint["time_embed.0.weight"], + "time_embedding.linear_1.bias": checkpoint["time_embed.0.bias"], + "time_embedding.linear_2.weight": checkpoint["time_embed.2.weight"], + "time_embedding.linear_2.bias": checkpoint["time_embed.2.bias"], + } + ) + + return diffusers_checkpoint + + +# .input_blocks.0 -> .conv_in +def unet_conv_in(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "conv_in.weight": checkpoint["input_blocks.0.0.weight"], + "conv_in.bias": checkpoint["input_blocks.0.0.bias"], + } + ) + + return diffusers_checkpoint + + +# .out.0 -> .conv_norm_out +def unet_conv_norm_out(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "conv_norm_out.weight": checkpoint["out.0.weight"], + "conv_norm_out.bias": checkpoint["out.0.bias"], + } + ) + + return diffusers_checkpoint + + +# .out.2 -> .conv_out +def unet_conv_out(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "conv_out.weight": checkpoint["out.2.weight"], + "conv_out.bias": checkpoint["out.2.bias"], + } + ) + + return diffusers_checkpoint + + +# .input_blocks -> .down_blocks +def unet_downblock_to_diffusers_checkpoint( + model, checkpoint, *, diffusers_down_block_idx, original_down_block_idx, num_head_channels +): + diffusers_checkpoint = {} + + diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.resnets" + original_down_block_prefix = "input_blocks" + + down_block = model.down_blocks[diffusers_down_block_idx] + + num_resnets = len(down_block.resnets) + + if down_block.downsamplers is None: + downsampler = False + else: + assert len(down_block.downsamplers) == 1 + downsampler = True + # The downsample block is also a resnet + num_resnets += 1 + + for resnet_idx_inc in range(num_resnets): + full_resnet_prefix = f"{original_down_block_prefix}.{original_down_block_idx + resnet_idx_inc}.0" + + if downsampler and resnet_idx_inc == num_resnets - 1: + # this is a downsample block + full_diffusers_resnet_prefix = f"down_blocks.{diffusers_down_block_idx}.downsamplers.0" + else: + # this is a regular resnet block + full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}" + + diffusers_checkpoint.update( + resnet_to_diffusers_checkpoint( + checkpoint, resnet_prefix=full_resnet_prefix, diffusers_resnet_prefix=full_diffusers_resnet_prefix + ) + ) + + if hasattr(down_block, "attentions"): + num_attentions = len(down_block.attentions) + diffusers_attention_prefix = f"down_blocks.{diffusers_down_block_idx}.attentions" + + for attention_idx_inc in range(num_attentions): + full_attention_prefix = f"{original_down_block_prefix}.{original_down_block_idx + attention_idx_inc}.1" + full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}" + + diffusers_checkpoint.update( + 
attention_to_diffusers_checkpoint( + checkpoint, + attention_prefix=full_attention_prefix, + diffusers_attention_prefix=full_diffusers_attention_prefix, + num_head_channels=num_head_channels, + ) + ) + + num_original_down_blocks = num_resnets + + return diffusers_checkpoint, num_original_down_blocks + + +# .middle_block -> .mid_block +def unet_midblock_to_diffusers_checkpoint(model, checkpoint, *, num_head_channels): + diffusers_checkpoint = {} + + # block 0 + + original_block_idx = 0 + + diffusers_checkpoint.update( + resnet_to_diffusers_checkpoint( + checkpoint, + diffusers_resnet_prefix="mid_block.resnets.0", + resnet_prefix=f"middle_block.{original_block_idx}", + ) + ) + + original_block_idx += 1 + + # optional block 1 + + if hasattr(model.mid_block, "attentions") and model.mid_block.attentions[0] is not None: + diffusers_checkpoint.update( + attention_to_diffusers_checkpoint( + checkpoint, + diffusers_attention_prefix="mid_block.attentions.0", + attention_prefix=f"middle_block.{original_block_idx}", + num_head_channels=num_head_channels, + ) + ) + original_block_idx += 1 + + # block 1 or block 2 + + diffusers_checkpoint.update( + resnet_to_diffusers_checkpoint( + checkpoint, + diffusers_resnet_prefix="mid_block.resnets.1", + resnet_prefix=f"middle_block.{original_block_idx}", + ) + ) + + return diffusers_checkpoint + + +# .output_blocks -> .up_blocks +def unet_upblock_to_diffusers_checkpoint( + model, checkpoint, *, diffusers_up_block_idx, original_up_block_idx, num_head_channels +): + diffusers_checkpoint = {} + + diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.resnets" + original_up_block_prefix = "output_blocks" + + up_block = model.up_blocks[diffusers_up_block_idx] + + num_resnets = len(up_block.resnets) + + if up_block.upsamplers is None: + upsampler = False + else: + assert len(up_block.upsamplers) == 1 + upsampler = True + # The upsample block is also a resnet + num_resnets += 1 + + has_attentions = hasattr(up_block, "attentions") + + for resnet_idx_inc in range(num_resnets): + if upsampler and resnet_idx_inc == num_resnets - 1: + # this is an upsample block + if has_attentions: + # There is a middle attention block that we skip + original_resnet_block_idx = 2 + else: + original_resnet_block_idx = 1 + + # we add the `minus 1` because the last two resnets are stuck together in the same output block + full_resnet_prefix = ( + f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc - 1}.{original_resnet_block_idx}" + ) + + full_diffusers_resnet_prefix = f"up_blocks.{diffusers_up_block_idx}.upsamplers.0" + else: + # this is a regular resnet block + full_resnet_prefix = f"{original_up_block_prefix}.{original_up_block_idx + resnet_idx_inc}.0" + full_diffusers_resnet_prefix = f"{diffusers_resnet_prefix}.{resnet_idx_inc}" + + diffusers_checkpoint.update( + resnet_to_diffusers_checkpoint( + checkpoint, resnet_prefix=full_resnet_prefix, diffusers_resnet_prefix=full_diffusers_resnet_prefix + ) + ) + + if has_attentions: + num_attentions = len(up_block.attentions) + diffusers_attention_prefix = f"up_blocks.{diffusers_up_block_idx}.attentions" + + for attention_idx_inc in range(num_attentions): + full_attention_prefix = f"{original_up_block_prefix}.{original_up_block_idx + attention_idx_inc}.1" + full_diffusers_attention_prefix = f"{diffusers_attention_prefix}.{attention_idx_inc}" + + diffusers_checkpoint.update( + attention_to_diffusers_checkpoint( + checkpoint, + attention_prefix=full_attention_prefix, + 
diffusers_attention_prefix=full_diffusers_attention_prefix, + num_head_channels=num_head_channels, + ) + ) + + num_original_down_blocks = num_resnets - 1 if upsampler else num_resnets + + return diffusers_checkpoint, num_original_down_blocks + + +def resnet_to_diffusers_checkpoint(checkpoint, *, diffusers_resnet_prefix, resnet_prefix): + diffusers_checkpoint = { + f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.in_layers.0.weight"], + f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.in_layers.0.bias"], + f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.in_layers.2.weight"], + f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.in_layers.2.bias"], + f"{diffusers_resnet_prefix}.time_emb_proj.weight": checkpoint[f"{resnet_prefix}.emb_layers.1.weight"], + f"{diffusers_resnet_prefix}.time_emb_proj.bias": checkpoint[f"{resnet_prefix}.emb_layers.1.bias"], + f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.out_layers.0.weight"], + f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.out_layers.0.bias"], + f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.out_layers.3.weight"], + f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.out_layers.3.bias"], + } + + skip_connection_prefix = f"{resnet_prefix}.skip_connection" + + if f"{skip_connection_prefix}.weight" in checkpoint: + diffusers_checkpoint.update( + { + f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{skip_connection_prefix}.weight"], + f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{skip_connection_prefix}.bias"], + } + ) + + return diffusers_checkpoint + + +def attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix, num_head_channels): + diffusers_checkpoint = {} + + # .norm -> .group_norm + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"], + f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"], + } + ) + + # .qkv -> .{query, key, value} + [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions( + weight=checkpoint[f"{attention_prefix}.qkv.weight"][:, :, 0], + bias=checkpoint[f"{attention_prefix}.qkv.bias"], + split=3, + chunk_size=num_head_channels, + ) + + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_q.weight": q_weight, + f"{diffusers_attention_prefix}.to_q.bias": q_bias, + f"{diffusers_attention_prefix}.to_k.weight": k_weight, + f"{diffusers_attention_prefix}.to_k.bias": k_bias, + f"{diffusers_attention_prefix}.to_v.weight": v_weight, + f"{diffusers_attention_prefix}.to_v.bias": v_bias, + } + ) + + # .encoder_kv -> .{context_key, context_value} + [encoder_k_weight, encoder_v_weight], [encoder_k_bias, encoder_v_bias] = split_attentions( + weight=checkpoint[f"{attention_prefix}.encoder_kv.weight"][:, :, 0], + bias=checkpoint[f"{attention_prefix}.encoder_kv.bias"], + split=2, + chunk_size=num_head_channels, + ) + + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.add_k_proj.weight": encoder_k_weight, + f"{diffusers_attention_prefix}.add_k_proj.bias": encoder_k_bias, + f"{diffusers_attention_prefix}.add_v_proj.weight": encoder_v_weight, + f"{diffusers_attention_prefix}.add_v_proj.bias": encoder_v_bias, + } + ) + + # .proj_out (1d conv) -> .proj_attn (linear) + diffusers_checkpoint.update( + { + 
f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][ + :, :, 0 + ], + f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + } + ) + + return diffusers_checkpoint + + +# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?) +def split_attentions(*, weight, bias, split, chunk_size): + weights = [None] * split + biases = [None] * split + + weights_biases_idx = 0 + + for starting_row_index in range(0, weight.shape[0], chunk_size): + row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size) + + weight_rows = weight[row_indices, :] + bias_rows = bias[row_indices] + + if weights[weights_biases_idx] is None: + assert weights[weights_biases_idx] is None + weights[weights_biases_idx] = weight_rows + biases[weights_biases_idx] = bias_rows + else: + assert weights[weights_biases_idx] is not None + weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows]) + biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows]) + + weights_biases_idx = (weights_biases_idx + 1) % split + + return weights, biases + + +# done unet utils + + +def prior(*, args, checkpoint_map_location): + print("loading prior") + + prior_checkpoint = torch.load(args.prior_checkpoint_path, map_location=checkpoint_map_location) + + clip_stats_checkpoint = torch.load(args.clip_stat_path, map_location=checkpoint_map_location) + + prior_model = prior_model_from_original_config() + + prior_diffusers_checkpoint = prior_original_checkpoint_to_diffusers_checkpoint( + prior_model, prior_checkpoint, clip_stats_checkpoint + ) + + del prior_checkpoint + del clip_stats_checkpoint + + load_checkpoint_to_model(prior_diffusers_checkpoint, prior_model, strict=True) + + print("done loading prior") + + return prior_model + + +def text2img(*, args, checkpoint_map_location): + print("loading text2img") + + text2img_checkpoint = torch.load(args.text2img_checkpoint_path, map_location=checkpoint_map_location) + + unet_model = unet_model_from_original_config() + + unet_diffusers_checkpoint = unet_original_checkpoint_to_diffusers_checkpoint(unet_model, text2img_checkpoint) + + # text proj interlude + + # The original decoder implementation includes a set of parameters that are used + # for creating the `encoder_hidden_states` which are what the U-net is conditioned + # on. The diffusers conditional unet directly takes the encoder_hidden_states. 
We pull + # the parameters into the KandinskyTextProjModel class + text_proj_model = text_proj_from_original_config() + + text_proj_checkpoint = text_proj_original_checkpoint_to_diffusers_checkpoint(text2img_checkpoint) + + load_checkpoint_to_model(text_proj_checkpoint, text_proj_model, strict=True) + + del text2img_checkpoint + + load_checkpoint_to_model(unet_diffusers_checkpoint, unet_model, strict=True) + + print("done loading text2img") + + return unet_model, text_proj_model + + +def inpaint_text2img(*, args, checkpoint_map_location): + print("loading inpaint text2img") + + inpaint_text2img_checkpoint = torch.load( + args.inpaint_text2img_checkpoint_path, map_location=checkpoint_map_location + ) + + inpaint_unet_model = inpaint_unet_model_from_original_config() + + inpaint_unet_diffusers_checkpoint = inpaint_unet_original_checkpoint_to_diffusers_checkpoint( + inpaint_unet_model, inpaint_text2img_checkpoint + ) + + # text proj interlude + + # The original decoder implementation includes a set of parameters that are used + # for creating the `encoder_hidden_states` which are what the U-net is conditioned + # on. The diffusers conditional unet directly takes the encoder_hidden_states. We pull + # the parameters into the KandinskyTextProjModel class + text_proj_model = text_proj_from_original_config() + + text_proj_checkpoint = text_proj_original_checkpoint_to_diffusers_checkpoint(inpaint_text2img_checkpoint) + + load_checkpoint_to_model(text_proj_checkpoint, text_proj_model, strict=True) + + del inpaint_text2img_checkpoint + + load_checkpoint_to_model(inpaint_unet_diffusers_checkpoint, inpaint_unet_model, strict=True) + + print("done loading inpaint text2img") + + return inpaint_unet_model, text_proj_model + + +# movq + +MOVQ_CONFIG = { + "in_channels": 3, + "out_channels": 3, + "latent_channels": 4, + "down_block_types": ("DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "AttnDownEncoderBlock2D"), + "up_block_types": ("AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"), + "num_vq_embeddings": 16384, + "block_out_channels": (128, 256, 256, 512), + "vq_embed_dim": 4, + "layers_per_block": 2, + "norm_type": "spatial", +} + + +def movq_model_from_original_config(): + movq = VQModel(**MOVQ_CONFIG) + return movq + + +def movq_encoder_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # conv_in + diffusers_checkpoint.update( + { + "encoder.conv_in.weight": checkpoint["encoder.conv_in.weight"], + "encoder.conv_in.bias": checkpoint["encoder.conv_in.bias"], + } + ) + + # down_blocks + for down_block_idx, down_block in enumerate(model.encoder.down_blocks): + diffusers_down_block_prefix = f"encoder.down_blocks.{down_block_idx}" + down_block_prefix = f"encoder.down.{down_block_idx}" + + # resnets + for resnet_idx, resnet in enumerate(down_block.resnets): + diffusers_resnet_prefix = f"{diffusers_down_block_prefix}.resnets.{resnet_idx}" + resnet_prefix = f"{down_block_prefix}.block.{resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + # downsample + + # do not include the downsample when on the last down block + # There is no downsample on the last down block + if down_block_idx != len(model.encoder.down_blocks) - 1: + # There's a single downsample in the original checkpoint but a list of downsamples + # in the diffusers model. 
+ diffusers_downsample_prefix = f"{diffusers_down_block_prefix}.downsamplers.0.conv" + downsample_prefix = f"{down_block_prefix}.downsample.conv" + diffusers_checkpoint.update( + { + f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"], + f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"], + } + ) + + # attentions + + if hasattr(down_block, "attentions"): + for attention_idx, _ in enumerate(down_block.attentions): + diffusers_attention_prefix = f"{diffusers_down_block_prefix}.attentions.{attention_idx}" + attention_prefix = f"{down_block_prefix}.attn.{attention_idx}" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + attention_prefix=attention_prefix, + ) + ) + + # mid block + + # mid block attentions + + # There is a single hardcoded attention block in the middle of the VQ-diffusion encoder + diffusers_attention_prefix = "encoder.mid_block.attentions.0" + attention_prefix = "encoder.mid.attn_1" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint( + checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix + ) + ) + + # mid block resnets + + for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets): + diffusers_resnet_prefix = f"encoder.mid_block.resnets.{diffusers_resnet_idx}" + + # the hardcoded prefixes to `block_` are 1 and 2 + orig_resnet_idx = diffusers_resnet_idx + 1 + # There are two hardcoded resnets in the middle of the VQ-diffusion encoder + resnet_prefix = f"encoder.mid.block_{orig_resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + diffusers_checkpoint.update( + { + # conv_norm_out + "encoder.conv_norm_out.weight": checkpoint["encoder.norm_out.weight"], + "encoder.conv_norm_out.bias": checkpoint["encoder.norm_out.bias"], + # conv_out + "encoder.conv_out.weight": checkpoint["encoder.conv_out.weight"], + "encoder.conv_out.bias": checkpoint["encoder.conv_out.bias"], + } + ) + + return diffusers_checkpoint + + +def movq_decoder_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # conv in + diffusers_checkpoint.update( + { + "decoder.conv_in.weight": checkpoint["decoder.conv_in.weight"], + "decoder.conv_in.bias": checkpoint["decoder.conv_in.bias"], + } + ) + + # up_blocks + + for diffusers_up_block_idx, up_block in enumerate(model.decoder.up_blocks): + # up_blocks are stored in reverse order in the VQ-diffusion checkpoint + orig_up_block_idx = len(model.decoder.up_blocks) - 1 - diffusers_up_block_idx + + diffusers_up_block_prefix = f"decoder.up_blocks.{diffusers_up_block_idx}" + up_block_prefix = f"decoder.up.{orig_up_block_idx}" + + # resnets + for resnet_idx, resnet in enumerate(up_block.resnets): + diffusers_resnet_prefix = f"{diffusers_up_block_prefix}.resnets.{resnet_idx}" + resnet_prefix = f"{up_block_prefix}.block.{resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint_spatial_norm( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + # upsample + + # there is no up sample on the last up block + if diffusers_up_block_idx != len(model.decoder.up_blocks) - 1: + # There's a single upsample in the VQ-diffusion checkpoint but a list of downsamples + # in the diffusers model. 
+ diffusers_downsample_prefix = f"{diffusers_up_block_prefix}.upsamplers.0.conv" + downsample_prefix = f"{up_block_prefix}.upsample.conv" + diffusers_checkpoint.update( + { + f"{diffusers_downsample_prefix}.weight": checkpoint[f"{downsample_prefix}.weight"], + f"{diffusers_downsample_prefix}.bias": checkpoint[f"{downsample_prefix}.bias"], + } + ) + + # attentions + + if hasattr(up_block, "attentions"): + for attention_idx, _ in enumerate(up_block.attentions): + diffusers_attention_prefix = f"{diffusers_up_block_prefix}.attentions.{attention_idx}" + attention_prefix = f"{up_block_prefix}.attn.{attention_idx}" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint_spatial_norm( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + attention_prefix=attention_prefix, + ) + ) + + # mid block + + # mid block attentions + + # There is a single hardcoded attention block in the middle of the VQ-diffusion decoder + diffusers_attention_prefix = "decoder.mid_block.attentions.0" + attention_prefix = "decoder.mid.attn_1" + diffusers_checkpoint.update( + movq_attention_to_diffusers_checkpoint_spatial_norm( + checkpoint, diffusers_attention_prefix=diffusers_attention_prefix, attention_prefix=attention_prefix + ) + ) + + # mid block resnets + + for diffusers_resnet_idx, resnet in enumerate(model.encoder.mid_block.resnets): + diffusers_resnet_prefix = f"decoder.mid_block.resnets.{diffusers_resnet_idx}" + + # the hardcoded prefixes to `block_` are 1 and 2 + orig_resnet_idx = diffusers_resnet_idx + 1 + # There are two hardcoded resnets in the middle of the VQ-diffusion decoder + resnet_prefix = f"decoder.mid.block_{orig_resnet_idx}" + + diffusers_checkpoint.update( + movq_resnet_to_diffusers_checkpoint_spatial_norm( + resnet, checkpoint, diffusers_resnet_prefix=diffusers_resnet_prefix, resnet_prefix=resnet_prefix + ) + ) + + diffusers_checkpoint.update( + { + # conv_norm_out + "decoder.conv_norm_out.norm_layer.weight": checkpoint["decoder.norm_out.norm_layer.weight"], + "decoder.conv_norm_out.norm_layer.bias": checkpoint["decoder.norm_out.norm_layer.bias"], + "decoder.conv_norm_out.conv_y.weight": checkpoint["decoder.norm_out.conv_y.weight"], + "decoder.conv_norm_out.conv_y.bias": checkpoint["decoder.norm_out.conv_y.bias"], + "decoder.conv_norm_out.conv_b.weight": checkpoint["decoder.norm_out.conv_b.weight"], + "decoder.conv_norm_out.conv_b.bias": checkpoint["decoder.norm_out.conv_b.bias"], + # conv_out + "decoder.conv_out.weight": checkpoint["decoder.conv_out.weight"], + "decoder.conv_out.bias": checkpoint["decoder.conv_out.bias"], + } + ) + + return diffusers_checkpoint + + +def movq_resnet_to_diffusers_checkpoint(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix): + rv = { + # norm1 + f"{diffusers_resnet_prefix}.norm1.weight": checkpoint[f"{resnet_prefix}.norm1.weight"], + f"{diffusers_resnet_prefix}.norm1.bias": checkpoint[f"{resnet_prefix}.norm1.bias"], + # conv1 + f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"], + f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"], + # norm2 + f"{diffusers_resnet_prefix}.norm2.weight": checkpoint[f"{resnet_prefix}.norm2.weight"], + f"{diffusers_resnet_prefix}.norm2.bias": checkpoint[f"{resnet_prefix}.norm2.bias"], + # conv2 + f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"], + f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"], + } + + if resnet.conv_shortcut is not None: + rv.update( 
+ { + f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"], + f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"], + } + ) + + return rv + + +def movq_resnet_to_diffusers_checkpoint_spatial_norm(resnet, checkpoint, *, diffusers_resnet_prefix, resnet_prefix): + rv = { + # norm1 + f"{diffusers_resnet_prefix}.norm1.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm1.norm_layer.weight"], + f"{diffusers_resnet_prefix}.norm1.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm1.norm_layer.bias"], + f"{diffusers_resnet_prefix}.norm1.conv_y.weight": checkpoint[f"{resnet_prefix}.norm1.conv_y.weight"], + f"{diffusers_resnet_prefix}.norm1.conv_y.bias": checkpoint[f"{resnet_prefix}.norm1.conv_y.bias"], + f"{diffusers_resnet_prefix}.norm1.conv_b.weight": checkpoint[f"{resnet_prefix}.norm1.conv_b.weight"], + f"{diffusers_resnet_prefix}.norm1.conv_b.bias": checkpoint[f"{resnet_prefix}.norm1.conv_b.bias"], + # conv1 + f"{diffusers_resnet_prefix}.conv1.weight": checkpoint[f"{resnet_prefix}.conv1.weight"], + f"{diffusers_resnet_prefix}.conv1.bias": checkpoint[f"{resnet_prefix}.conv1.bias"], + # norm2 + f"{diffusers_resnet_prefix}.norm2.norm_layer.weight": checkpoint[f"{resnet_prefix}.norm2.norm_layer.weight"], + f"{diffusers_resnet_prefix}.norm2.norm_layer.bias": checkpoint[f"{resnet_prefix}.norm2.norm_layer.bias"], + f"{diffusers_resnet_prefix}.norm2.conv_y.weight": checkpoint[f"{resnet_prefix}.norm2.conv_y.weight"], + f"{diffusers_resnet_prefix}.norm2.conv_y.bias": checkpoint[f"{resnet_prefix}.norm2.conv_y.bias"], + f"{diffusers_resnet_prefix}.norm2.conv_b.weight": checkpoint[f"{resnet_prefix}.norm2.conv_b.weight"], + f"{diffusers_resnet_prefix}.norm2.conv_b.bias": checkpoint[f"{resnet_prefix}.norm2.conv_b.bias"], + # conv2 + f"{diffusers_resnet_prefix}.conv2.weight": checkpoint[f"{resnet_prefix}.conv2.weight"], + f"{diffusers_resnet_prefix}.conv2.bias": checkpoint[f"{resnet_prefix}.conv2.bias"], + } + + if resnet.conv_shortcut is not None: + rv.update( + { + f"{diffusers_resnet_prefix}.conv_shortcut.weight": checkpoint[f"{resnet_prefix}.nin_shortcut.weight"], + f"{diffusers_resnet_prefix}.conv_shortcut.bias": checkpoint[f"{resnet_prefix}.nin_shortcut.bias"], + } + ) + + return rv + + +def movq_attention_to_diffusers_checkpoint(checkpoint, *, diffusers_attention_prefix, attention_prefix): + return { + # norm + f"{diffusers_attention_prefix}.group_norm.weight": checkpoint[f"{attention_prefix}.norm.weight"], + f"{diffusers_attention_prefix}.group_norm.bias": checkpoint[f"{attention_prefix}.norm.bias"], + # query + f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_q.bias": checkpoint[f"{attention_prefix}.q.bias"], + # key + f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"], + # value + f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], + # proj_attn + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + } + + +def movq_attention_to_diffusers_checkpoint_spatial_norm(checkpoint, *, 
diffusers_attention_prefix, attention_prefix): + return { + # norm + f"{diffusers_attention_prefix}.spatial_norm.norm_layer.weight": checkpoint[ + f"{attention_prefix}.norm.norm_layer.weight" + ], + f"{diffusers_attention_prefix}.spatial_norm.norm_layer.bias": checkpoint[ + f"{attention_prefix}.norm.norm_layer.bias" + ], + f"{diffusers_attention_prefix}.spatial_norm.conv_y.weight": checkpoint[ + f"{attention_prefix}.norm.conv_y.weight" + ], + f"{diffusers_attention_prefix}.spatial_norm.conv_y.bias": checkpoint[f"{attention_prefix}.norm.conv_y.bias"], + f"{diffusers_attention_prefix}.spatial_norm.conv_b.weight": checkpoint[ + f"{attention_prefix}.norm.conv_b.weight" + ], + f"{diffusers_attention_prefix}.spatial_norm.conv_b.bias": checkpoint[f"{attention_prefix}.norm.conv_b.bias"], + # query + f"{diffusers_attention_prefix}.to_q.weight": checkpoint[f"{attention_prefix}.q.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_q.bias": checkpoint[f"{attention_prefix}.q.bias"], + # key + f"{diffusers_attention_prefix}.to_k.weight": checkpoint[f"{attention_prefix}.k.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_k.bias": checkpoint[f"{attention_prefix}.k.bias"], + # value + f"{diffusers_attention_prefix}.to_v.weight": checkpoint[f"{attention_prefix}.v.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_v.bias": checkpoint[f"{attention_prefix}.v.bias"], + # proj_attn + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{attention_prefix}.proj_out.weight"][:, :, 0, 0], + f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{attention_prefix}.proj_out.bias"], + } + + +def movq_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + diffusers_checkpoint.update(movq_encoder_to_diffusers_checkpoint(model, checkpoint)) + + # quant_conv + + diffusers_checkpoint.update( + { + "quant_conv.weight": checkpoint["quant_conv.weight"], + "quant_conv.bias": checkpoint["quant_conv.bias"], + } + ) + + # quantize + diffusers_checkpoint.update({"quantize.embedding.weight": checkpoint["quantize.embedding.weight"]}) + + # post_quant_conv + diffusers_checkpoint.update( + { + "post_quant_conv.weight": checkpoint["post_quant_conv.weight"], + "post_quant_conv.bias": checkpoint["post_quant_conv.bias"], + } + ) + + # decoder + diffusers_checkpoint.update(movq_decoder_to_diffusers_checkpoint(model, checkpoint)) + + return diffusers_checkpoint + + +def movq(*, args, checkpoint_map_location): + print("loading movq") + + movq_checkpoint = torch.load(args.movq_checkpoint_path, map_location=checkpoint_map_location) + + movq_model = movq_model_from_original_config() + + movq_diffusers_checkpoint = movq_original_checkpoint_to_diffusers_checkpoint(movq_model, movq_checkpoint) + + del movq_checkpoint + + load_checkpoint_to_model(movq_diffusers_checkpoint, movq_model, strict=True) + + print("done loading movq") + + return movq_model + + +def load_checkpoint_to_model(checkpoint, model, strict=False): + with tempfile.NamedTemporaryFile(delete=False) as file: + torch.save(checkpoint, file.name) + del checkpoint + if strict: + model.load_state_dict(torch.load(file.name), strict=True) + else: + load_checkpoint_and_dispatch(model, file.name, device_map="auto") + os.remove(file.name) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") + + parser.add_argument( + "--prior_checkpoint_path", + default=None, + type=str, + required=False, + 
help="Path to the prior checkpoint to convert.", + ) + parser.add_argument( + "--clip_stat_path", + default=None, + type=str, + required=False, + help="Path to the clip stats checkpoint to convert.", + ) + parser.add_argument( + "--text2img_checkpoint_path", + default=None, + type=str, + required=False, + help="Path to the text2img checkpoint to convert.", + ) + parser.add_argument( + "--movq_checkpoint_path", + default=None, + type=str, + required=False, + help="Path to the text2img checkpoint to convert.", + ) + parser.add_argument( + "--inpaint_text2img_checkpoint_path", + default=None, + type=str, + required=False, + help="Path to the inpaint text2img checkpoint to convert.", + ) + parser.add_argument( + "--checkpoint_load_device", + default="cpu", + type=str, + required=False, + help="The device passed to `map_location` when loading checkpoints.", + ) + + parser.add_argument( + "--debug", + default=None, + type=str, + required=False, + help="Only run a specific stage of the convert script. Used for debugging", + ) + + args = parser.parse_args() + + print(f"loading checkpoints to {args.checkpoint_load_device}") + + checkpoint_map_location = torch.device(args.checkpoint_load_device) + + if args.debug is not None: + print(f"debug: only executing {args.debug}") + + if args.debug is None: + print("to-do") + elif args.debug == "prior": + prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location) + prior_model.save_pretrained(args.dump_path) + elif args.debug == "text2img": + unet_model, text_proj_model = text2img(args=args, checkpoint_map_location=checkpoint_map_location) + unet_model.save_pretrained(f"{args.dump_path}/unet") + text_proj_model.save_pretrained(f"{args.dump_path}/text_proj") + elif args.debug == "inpaint_text2img": + inpaint_unet_model, inpaint_text_proj_model = inpaint_text2img( + args=args, checkpoint_map_location=checkpoint_map_location + ) + inpaint_unet_model.save_pretrained(f"{args.dump_path}/inpaint_unet") + inpaint_text_proj_model.save_pretrained(f"{args.dump_path}/inpaint_text_proj") + elif args.debug == "decoder": + decoder = movq(args=args, checkpoint_map_location=checkpoint_map_location) + decoder.save_pretrained(f"{args.dump_path}/decoder") + else: + raise ValueError(f"unknown debug value : {args.debug}") diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 9b3f8adad376..f6d8c254d157 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -129,6 +129,10 @@ IFInpaintingSuperResolutionPipeline, IFPipeline, IFSuperResolutionPipeline, + KandinskyImg2ImgPipeline, + KandinskyInpaintPipeline, + KandinskyPipeline, + KandinskyPriorPipeline, LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 13c7afc8e922..acdee10c7674 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -62,6 +62,7 @@ def __init__( cross_attention_norm_num_groups: int = 32, added_kv_proj_dim: Optional[int] = None, norm_num_groups: Optional[int] = None, + spatial_norm_dim: Optional[int] = None, out_bias: bool = True, scale_qk: bool = True, only_cross_attention: bool = False, @@ -105,6 +106,11 @@ def __init__( else: self.group_norm = None + if spatial_norm_dim is not None: + self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim) + else: + self.spatial_norm = None + if cross_attention_norm is None: self.norm_cross = None elif 
cross_attention_norm == "layer_norm": @@ -431,9 +437,13 @@ def __call__( hidden_states, encoder_hidden_states=None, attention_mask=None, + temb=None, ): residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + input_ndim = hidden_states.ndim if input_ndim == 4: @@ -899,9 +909,19 @@ def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") - def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): + def __call__( + self, + attn: Attention, + hidden_states, + encoder_hidden_states=None, + attention_mask=None, + temb=None, + ): residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + input_ndim = hidden_states.ndim if input_ndim == 4: @@ -1271,3 +1291,26 @@ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, ] + + +class SpatialNorm(nn.Module): + """ + Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002 + """ + + def __init__( + self, + f_channels, + zq_channels, + ): + super().__init__() + self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=32, eps=1e-6, affine=True) + self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, f, zq): + f_size = f.shape[-2:] + zq = F.interpolate(zq, size=f_size, mode="nearest") + norm_f = self.norm_layer(f) + new_f = norm_f * self.conv_y(zq) + self.conv_b(zq) + return new_f diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index fb803039b268..991264a9aa8f 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -360,6 +360,33 @@ def forward(self, labels: torch.LongTensor, force_drop_ids=None): return embeddings +class TextImageProjection(nn.Module): + def __init__( + self, + text_embed_dim: int = 1024, + image_embed_dim: int = 768, + cross_attention_dim: int = 768, + num_image_text_embeds: int = 10, + ): + super().__init__() + + self.num_image_text_embeds = num_image_text_embeds + self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) + self.text_proj = nn.Linear(text_embed_dim, cross_attention_dim) + + def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTensor): + batch_size = text_embeds.shape[0] + + # image + image_text_embeds = self.image_embeds(image_embeds) + image_text_embeds = image_text_embeds.reshape(batch_size, self.num_image_text_embeds, -1) + + # text + text_embeds = self.text_proj(text_embeds) + + return torch.cat([image_text_embeds, text_embeds], dim=1) + + class CombinedTimestepLabelEmbeddings(nn.Module): def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1): super().__init__() @@ -395,6 +422,24 @@ def forward(self, hidden_states): return hidden_states +class TextImageTimeEmbedding(nn.Module): + def __init__(self, text_embed_dim: int = 768, image_embed_dim: int = 768, time_embed_dim: int = 1536): + super().__init__() + self.text_proj = nn.Linear(text_embed_dim, time_embed_dim) + self.text_norm = nn.LayerNorm(time_embed_dim) + self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) + + def forward(self, text_embeds: torch.FloatTensor, 
image_embeds: torch.FloatTensor): + # text + time_text_embeds = self.text_proj(text_embeds) + time_text_embeds = self.text_norm(time_text_embeds) + + # image + time_image_embeds = self.image_proj(image_embeds) + + return time_image_embeds + time_text_embeds + + class AttentionPooling(nn.Module): # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54 diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 92bc89c80099..cf9e3182d400 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -21,6 +21,7 @@ import torch.nn.functional as F from .attention import AdaGroupNorm +from .attention_processor import SpatialNorm class Upsample1D(nn.Module): @@ -500,7 +501,7 @@ def __init__( eps=1e-6, non_linearity="swish", skip_time_act=False, - time_embedding_norm="default", # default, scale_shift, ada_group + time_embedding_norm="default", # default, scale_shift, ada_group, spatial kernel=None, output_scale_factor=1.0, use_in_shortcut=None, @@ -527,6 +528,8 @@ def __init__( if self.time_embedding_norm == "ada_group": self.norm1 = AdaGroupNorm(temb_channels, in_channels, groups, eps=eps) + elif self.time_embedding_norm == "spatial": + self.norm1 = SpatialNorm(in_channels, temb_channels) else: self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True) @@ -537,7 +540,7 @@ def __init__( self.time_emb_proj = torch.nn.Linear(temb_channels, out_channels) elif self.time_embedding_norm == "scale_shift": self.time_emb_proj = torch.nn.Linear(temb_channels, 2 * out_channels) - elif self.time_embedding_norm == "ada_group": + elif self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": self.time_emb_proj = None else: raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ") @@ -546,6 +549,8 @@ def __init__( if self.time_embedding_norm == "ada_group": self.norm2 = AdaGroupNorm(temb_channels, out_channels, groups_out, eps=eps) + elif self.time_embedding_norm == "spatial": + self.norm2 = SpatialNorm(out_channels, temb_channels) else: self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True) @@ -591,7 +596,7 @@ def __init__( def forward(self, input_tensor, temb): hidden_states = input_tensor - if self.time_embedding_norm == "ada_group": + if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm1(hidden_states, temb) else: hidden_states = self.norm1(hidden_states) @@ -619,7 +624,7 @@ def forward(self, input_tensor, temb): if temb is not None and self.time_embedding_norm == "default": hidden_states = hidden_states + temb - if self.time_embedding_norm == "ada_group": + if self.time_embedding_norm == "ada_group" or self.time_embedding_norm == "spatial": hidden_states = self.norm2(hidden_states, temb) else: hidden_states = self.norm2(hidden_states) diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index 6f8e3d0f5500..e96f33356870 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -349,6 +349,7 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, resnet_time_scale_shift=resnet_time_scale_shift, + temb_channels=temb_channels, ) elif up_block_type == "AttnUpDecoderBlock2D": return AttnUpDecoderBlock2D( @@ -361,6 +362,7 @@ def get_up_block( resnet_groups=resnet_groups, 
attn_num_head_channels=attn_num_head_channels, resnet_time_scale_shift=resnet_time_scale_shift, + temb_channels=temb_channels, ) elif up_block_type == "KUpBlock2D": return KUpBlock2D( @@ -396,7 +398,7 @@ def __init__( dropout: float = 0.0, num_layers: int = 1, resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", + resnet_time_scale_shift: str = "default", # default, spatial resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, @@ -434,7 +436,8 @@ def __init__( dim_head=attn_num_head_channels if attn_num_head_channels is not None else in_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, + norm_num_groups=resnet_groups if resnet_time_scale_shift == "default" else None, + spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, residual_connection=True, bias=True, upcast_softmax=True, @@ -466,7 +469,7 @@ def forward(self, hidden_states, temb=None): hidden_states = self.resnets[0](hidden_states, temb) for attn, resnet in zip(self.attentions, self.resnets[1:]): if attn is not None: - hidden_states = attn(hidden_states) + hidden_states = attn(hidden_states, temb=temb) hidden_states = resnet(hidden_states, temb) return hidden_states @@ -2116,12 +2119,13 @@ def __init__( dropout: float = 0.0, num_layers: int = 1, resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", + resnet_time_scale_shift: str = "default", # default, spatial resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, output_scale_factor=1.0, add_upsample=True, + temb_channels=None, ): super().__init__() resnets = [] @@ -2133,7 +2137,7 @@ def __init__( ResnetBlock2D( in_channels=input_channels, out_channels=out_channels, - temb_channels=None, + temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, @@ -2151,9 +2155,9 @@ def __init__( else: self.upsamplers = None - def forward(self, hidden_states): + def forward(self, hidden_states, temb=None): for resnet in self.resnets: - hidden_states = resnet(hidden_states, temb=None) + hidden_states = resnet(hidden_states, temb=temb) if self.upsamplers is not None: for upsampler in self.upsamplers: @@ -2177,6 +2181,7 @@ def __init__( attn_num_head_channels=1, output_scale_factor=1.0, add_upsample=True, + temb_channels=None, ): super().__init__() resnets = [] @@ -2189,7 +2194,7 @@ def __init__( ResnetBlock2D( in_channels=input_channels, out_channels=out_channels, - temb_channels=None, + temb_channels=temb_channels, eps=resnet_eps, groups=resnet_groups, dropout=dropout, @@ -2206,7 +2211,8 @@ def __init__( dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups, + norm_num_groups=resnet_groups if resnet_time_scale_shift == "default" else None, + spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, residual_connection=True, bias=True, upcast_softmax=True, @@ -2222,10 +2228,10 @@ def __init__( else: self.upsamplers = None - def forward(self, hidden_states): + def forward(self, hidden_states, temb=None): for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states, temb=None) - hidden_states = attn(hidden_states) + hidden_states = resnet(hidden_states, temb=temb) + hidden_states = attn(hidden_states, temb=temb) if self.upsamplers is not None: for upsampler in self.upsamplers: diff --git 
a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 76a40ffa1ec5..484f9323c69f 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -23,7 +23,14 @@
 from ..loaders import UNet2DConditionLoadersMixin
 from ..utils import BaseOutput, logging
 from .attention_processor import AttentionProcessor, AttnProcessor
-from .embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps
+from .embeddings import (
+    GaussianFourierProjection,
+    TextImageProjection,
+    TextImageTimeEmbedding,
+    TextTimeEmbedding,
+    TimestepEmbedding,
+    Timesteps,
+)
 from .modeling_utils import ModelMixin
 from .unet_2d_blocks import (
     CrossAttnDownBlock2D,
@@ -90,7 +97,11 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
         cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280):
             The dimension of the cross attention features.
         encoder_hid_dim (`int`, *optional*, defaults to None):
-            If given, `encoder_hidden_states` will be projected from this dimension to `cross_attention_dim`.
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to None):
+            If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text
+            embeddings of dimension `cross_attention_dim` according to `encoder_hid_dim_type`.
         attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
         resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet
             blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`.
@@ -156,6 +167,7 @@ def __init__(
         norm_eps: float = 1e-5,
         cross_attention_dim: Union[int, Tuple[int]] = 1280,
         encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
         attention_head_dim: Union[int, Tuple[int]] = 8,
         dual_cross_attention: bool = False,
         use_linear_projection: bool = False,
@@ -247,8 +259,31 @@ def __init__(
             cond_proj_dim=time_cond_proj_dim,
         )

-        if encoder_hid_dim is not None:
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+
+        if encoder_hid_dim_type == "text_proj":
             self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # it is set to `cross_attention_dim` here as this is exactly the required dimension for the currently
+            # only use case when `encoder_hid_dim_type == "text_image_proj"` (Kandinsky 2.1)
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
         else:
             self.encoder_hid_proj = None
@@ -290,8 +325,15 @@ def __init__(
             self.add_embedding = TextTimeEmbedding(
                 text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
             )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__
+            # too much they are set to `cross_attention_dim` here as this is exactly the required dimension for the
+            # currently only use case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
         elif addition_embed_type is not None:
-            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.")
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")

         if time_embedding_act_fn is None:
             self.time_embed_act = None
@@ -616,6 +658,7 @@ def forward(
         timestep_cond: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
         down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None,
         mid_block_additional_residual: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
@@ -636,6 +679,10 @@ def forward(
             A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
             `self.processor` in
             [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+        added_cond_kwargs (`dict`, *optional*):
+            A kwargs dictionary that if specified includes additional conditions that can be used for additional time
+            embeddings or encoder hidden states projections. See the configurations `encoder_hid_dim_type` and
+            `addition_embed_type` for more information.

         Returns:
             [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
@@ -728,12 +775,33 @@ def forward(
         if self.config.addition_embed_type == "text":
             aug_emb = self.add_embedding(encoder_hidden_states)
             emb = emb + aug_emb
+        elif self.config.addition_embed_type == "text_image":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+
+            aug_emb = self.add_embedding(text_embs, image_embs)
+            emb = emb + aug_emb

         if self.time_embed_act is not None:
             emb = self.time_embed_act(emb)

-        if self.encoder_hid_proj is not None:
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
             encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)

         # 2.
pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index 6f8514f28d33..dd4af0efcfd9 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -19,6 +19,7 @@ import torch.nn as nn from ..utils import BaseOutput, is_torch_version, randn_tensor +from .attention_processor import SpatialNorm from .unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block @@ -158,6 +159,7 @@ def __init__( layers_per_block=2, norm_num_groups=32, act_fn="silu", + norm_type="group", # group, spatial ): super().__init__() self.layers_per_block = layers_per_block @@ -173,16 +175,18 @@ def __init__( self.mid_block = None self.up_blocks = nn.ModuleList([]) + temb_channels = in_channels if norm_type == "spatial" else None + # mid self.mid_block = UNetMidBlock2D( in_channels=block_out_channels[-1], resnet_eps=1e-6, resnet_act_fn=act_fn, output_scale_factor=1, - resnet_time_scale_shift="default", + resnet_time_scale_shift="default" if norm_type == "group" else norm_type, attn_num_head_channels=None, resnet_groups=norm_num_groups, - temb_channels=None, + temb_channels=temb_channels, ) # up @@ -205,19 +209,23 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, attn_num_head_channels=None, - temb_channels=None, + temb_channels=temb_channels, + resnet_time_scale_shift=norm_type, ) self.up_blocks.append(up_block) prev_output_channel = output_channel # out - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + if norm_type == "spatial": + self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) self.conv_act = nn.SiLU() self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) self.gradient_checkpointing = False - def forward(self, z): + def forward(self, z, latent_embeds=None): sample = z sample = self.conv_in(sample) @@ -233,34 +241,39 @@ def custom_forward(*inputs): if is_torch_version(">=", "1.11.0"): # middle sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), sample, use_reentrant=False + create_custom_forward(self.mid_block), sample, latent_embeds, use_reentrant=False ) sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(up_block), sample, use_reentrant=False + create_custom_forward(up_block), sample, latent_embeds, use_reentrant=False ) else: # middle - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, latent_embeds + ) sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample) + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) else: # middle - sample = self.mid_block(sample) + sample = self.mid_block(sample, latent_embeds) sample = sample.to(upscale_dtype) # up for up_block in self.up_blocks: - sample = up_block(sample) + sample = up_block(sample, latent_embeds) # post-process - sample = self.conv_norm_out(sample) + if latent_embeds is None: + sample = self.conv_norm_out(sample) + else: + sample = self.conv_norm_out(sample, latent_embeds) sample = self.conv_act(sample) sample = self.conv_out(sample) diff --git 
a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 65f734dccb2d..73158294ee6e 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -82,6 +82,7 @@ def __init__( norm_num_groups: int = 32, vq_embed_dim: Optional[int] = None, scaling_factor: float = 0.18215, + norm_type: str = "group", # group, spatial ): super().__init__() @@ -112,6 +113,7 @@ def __init__( layers_per_block=layers_per_block, act_fn=act_fn, norm_num_groups=norm_num_groups, + norm_type=norm_type, ) def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: @@ -131,8 +133,8 @@ def decode( quant, emb_loss, info = self.quantize(h) else: quant = h - quant = self.post_quant_conv(quant) - dec = self.decoder(quant) + quant2 = self.post_quant_conv(quant) + dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None) if not return_dict: return (dec,) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 9b44f4e5eb14..bb3fc5d04cb6 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -57,6 +57,12 @@ IFPipeline, IFSuperResolutionPipeline, ) + from .kandinsky import ( + KandinskyImg2ImgPipeline, + KandinskyInpaintPipeline, + KandinskyPipeline, + KandinskyPriorPipeline, + ) from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py new file mode 100644 index 000000000000..c8eecba0c7f2 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -0,0 +1,19 @@ +from ...utils import ( + OptionalDependencyNotAvailable, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import KandinskyPipeline, KandinskyPriorPipeline +else: + from .pipeline_kandinsky import KandinskyPipeline + from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline + from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline + from .pipeline_kandinsky_prior import KandinskyPriorPipeline + from .text_encoder import MultilingualCLIP diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py new file mode 100644 index 000000000000..29545bd88dc2 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -0,0 +1,463 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
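The `UNet2DConditionModel` changes above add two related configuration switches: `encoder_hid_dim_type` selects how `encoder_hidden_states` are projected before cross-attention ("text_proj" keeps the existing linear projection, "text_image_proj" mixes in a CLIP image embedding), and `addition_embed_type="text_image"` folds a pooled text embedding plus the image embedding into the time embedding. Both new "text_image" paths read their extra inputs from the new `added_cond_kwargs` argument of `forward`, which is how the Kandinsky pipelines below drive them. A minimal sketch of the plain `text_proj` path follows; the toy sizes are illustrative only and not taken from any real checkpoint:

```py
# Sketch only, not part of the patch. Toy configuration for illustration.
import torch
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel(
    sample_size=32,
    in_channels=4,
    out_channels=4,
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
    block_out_channels=(32, 64),
    cross_attention_dim=32,
    attention_head_dim=8,
    encoder_hid_dim=16,                 # hidden size of the text encoder output
    encoder_hid_dim_type="text_proj",   # project 16 -> cross_attention_dim before cross-attention
)

sample = torch.randn(1, 4, 32, 32)
encoder_hidden_states = torch.randn(1, 77, 16)   # projected to (1, 77, 32) internally
out = unet(sample, timestep=1, encoder_hidden_states=encoder_hidden_states).sample

# With encoder_hid_dim_type="text_image_proj" and addition_embed_type="text_image"
# (the Kandinsky 2.1 setup), the call additionally needs the new keyword argument
#   added_cond_kwargs={"image_embeds": ..., "text_embeds": ...}
# exactly as done in the denoising loops of the pipelines introduced below.
```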
+ +from typing import List, Optional, Union + +import torch +from transformers import ( + XLMRobertaTokenizer, +) + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDIMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from .text_encoder import MultilingualCLIP + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-prior") + >>> pipe_prior.to("cuda") + + >>> prompt = "red cat, 4k photo" + >>> out = pipe_prior(prompt) + >>> image_emb = out.images + >>> zero_image_emb = out.zero_embeds + + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1") + >>> pipe.to("cuda") + + >>> image = pipe( + ... prompt, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... ).images + + >>> image[0].save("cat.png") + ``` +""" + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +class KandinskyPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. 
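The `get_new_h_w` helper above converts the requested pixel resolution into the latent grid the UNet operates on; with the default `scale_factor=8` the pixel size is effectively rounded up to a multiple of 64. A worked example of that arithmetic, using `latent_size`, a hypothetical one-dimensional restatement introduced only for this sketch:

```py
# Worked example, not part of the patch. `latent_size` is a hypothetical helper that
# restates get_new_h_w above for a single dimension.
import math

def latent_size(pixels, scale_factor=8):
    # ceil(pixels / scale_factor**2) * scale_factor, as computed by get_new_h_w
    return math.ceil(pixels / scale_factor**2) * scale_factor

assert latent_size(768) == 96    # 768 is a multiple of 64: latents are 96x96, decoded back to 768
assert latent_size(770) == 104   # not a multiple of 64: rounded up, decoded size becomes 104 * 8 = 832
```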
+ """ + + def __init__( + self, + text_encoder: MultilingualCLIP, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + truncation=True, + max_length=77, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_input_ids = text_input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.text_encoder, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
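Both offloading helpers defined for this pipeline trade memory for speed in the way their docstrings describe; a short usage sketch follows, with the checkpoint name taken from the example docstring above and everything else illustrative. As a caution, the `enable_model_cpu_offload` body that follows checks `self.safety_checker`, a module these Kandinsky pipelines do not register, so that branch appears to rely on an attribute that may not be present here.

```py
# Usage sketch, not part of the patch.
import torch
from diffusers import KandinskyPipeline

pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)

# Lowest memory, slowest: every submodule is moved to the GPU only for its own forward pass.
pipe.enable_sequential_cpu_offload()

# Alternatively, keep one whole sub-model (text_encoder, unet, movq) on the GPU at a time:
# pipe.enable_model_cpu_offload()
```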
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=prompt_embeds.dtype, device=device + ) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.unet.config.in_channels + + height, width = get_new_h_w(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + text_encoder_hidden_states.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + ).sample + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + 
_, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + # YiYi notes: only reason this pipeline can't work with unclip scheduler is that can't pass down this argument + # need to use DDPM scheduler instead + # prev_timestep=prev_timestep, + generator=generator, + ).prev_sample + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py new file mode 100644 index 000000000000..470fa606af1a --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -0,0 +1,547 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch +from PIL import Image +from transformers import ( + XLMRobertaTokenizer, +) + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDIMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from .text_encoder import MultilingualCLIP + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "A red cartoon frog, 4k" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyImg2ImgPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... 
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/frog.png" + ... ) + + >>> image = pipe( + ... prompt, + ... image=init_image, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... strength=0.2, + ... ).images + + >>> image[0].save("red_frog.png") + ``` +""" + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class KandinskyImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ image encoder and decoder + """ + + def __init__( + self, + text_encoder: MultilingualCLIP, + movq: VQModel, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, latents, latent_timestep, shape, dtype, device, generator, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + + shape = latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + latents = self.add_noise(latents, noise, latent_timestep) + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = 
self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_input_ids = text_input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.text_encoder, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + # add_noise method to overwrite the one in schedule because it use a different beta schedule for adding noise vs sampling + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + betas = torch.linspace(0.0001, 0.02, 1000, dtype=torch.float32) + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_cumprod = alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + + return noisy_samples + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + image_embeds: torch.FloatTensor, + negative_image_embeds: torch.FloatTensor, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + strength: float = 0.3, + guidance_scale: float = 7.0, + num_images_per_prompt: int = 1, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + strength (`float`, *optional*, defaults to 0.3): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. 
When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + # 1. Define call parameters + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + + # 2. get text and image embeddings + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=prompt_embeds.dtype, device=device + ) + + # 3. pre-processing initial image + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor" + ) + + image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) + image = image.to(dtype=prompt_embeds.dtype, device=device) + + latents = self.movq.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) + + # 4. 
set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + + timesteps_tensor, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + + # the formular to calculate timestep for add_noise is taken from the original kandinsky repo + latent_timestep = int(self.scheduler.config.num_train_timesteps * strength) - 2 + + latent_timestep = torch.tensor([latent_timestep] * batch_size, dtype=timesteps_tensor.dtype, device=device) + + num_channels_latents = self.unet.config.in_channels + + height, width = get_new_h_w(height, width, self.movq_scale_factor) + + # 5. Create initial latent + latents = self.prepare_latents( + latents, + latent_timestep, + (batch_size, num_channels_latents, height, width), + text_encoder_hidden_states.dtype, + device, + generator, + self.scheduler, + ) + + # 6. Denoising loop + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + ).sample + + if do_classifier_free_guidance: + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample + + # 7. post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py new file mode 100644 index 000000000000..cc9a35e580b3 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -0,0 +1,672 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
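In the image-to-image pipeline above, `strength` controls both how far into the inference schedule denoising starts and at which training timestep the MoVQ latents are noised. A worked example of that arithmetic, assuming the scheduler's default of 1000 training timesteps; all values are illustrative:

```py
# Worked example, not part of the patch.
num_inference_steps = 100
num_train_timesteps = 1000   # assumed scheduler.config.num_train_timesteps
strength = 0.2

# get_timesteps(): only the last `strength` fraction of the schedule is actually run
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)  # 20
t_start = max(num_inference_steps - init_timestep, 0)                          # 80
# -> the denoising loop iterates over scheduler.timesteps[80:], i.e. 20 steps

# timestep used to noise the initial latents with add_noise; per the comment in the
# code, this formula is taken from the original Kandinsky repository
latent_timestep = int(num_train_timesteps * strength) - 2                      # 198
```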
+ +from copy import deepcopy +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from PIL import Image +from transformers import ( + XLMRobertaTokenizer, +) + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDIMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from .text_encoder import MultilingualCLIP + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + >>> import numpy as np + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "a hat" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyInpaintPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + + >>> mask = np.ones((768, 768), dtype=np.float32) + >>> mask[:250, 250:-250] = 0 + + >>> out = pipe( + ... prompt, + ... image=init_image, + ... mask_image=mask, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ) + + >>> image = out.images[0] + >>> image.save("cat_with_hat.png") + ``` +""" + + +def get_new_h_w(h, w, scale_factor=8): + new_h = h // scale_factor**2 + if h % scale_factor**2 != 0: + new_h += 1 + new_w = w // scale_factor**2 + if w % scale_factor**2 != 0: + new_w += 1 + return new_h * scale_factor, new_w * scale_factor + + +def prepare_mask(masks): + prepared_masks = [] + for mask in masks: + old_mask = deepcopy(mask) + for i in range(mask.shape[1]): + for j in range(mask.shape[2]): + if old_mask[0][i][j] == 1: + continue + if i != 0: + mask[:, i - 1, j] = 0 + if j != 0: + mask[:, i, j - 1] = 0 + if i != 0 and j != 0: + mask[:, i - 1, j - 1] = 0 + if i != mask.shape[1] - 1: + mask[:, i + 1, j] = 0 + if j != mask.shape[2] - 1: + mask[:, i, j + 1] = 0 + if i != mask.shape[1] - 1 and j != mask.shape[2] - 1: + mask[:, i + 1, j + 1] = 0 + prepared_masks.append(mask) + return torch.stack(prepared_masks, dim=0) + + +def prepare_mask_and_masked_image(image, mask, height, width): + r""" + Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will + be converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for + the ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. 
+ mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (ot the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. + """ + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not") + + # Batch single image + if image.ndim == 3: + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1) for i in image] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + 
mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + + return mask, image + + +class KandinskyInpaintPipeline(DiffusionPipeline): + """ + Pipeline for text-guided image inpainting using Kandinsky2.1 + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + text_encoder ([`MultilingualCLIP`]): + Frozen text-encoder. + tokenizer ([`XLMRobertaTokenizer`]): + Tokenizer of class + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ image encoder and decoder + """ + + def __init__( + self, + text_encoder: MultilingualCLIP, + movq: VQModel, + tokenizer: XLMRobertaTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + ): + super().__init__() + + self.register_modules( + text_encoder=text_encoder, + movq=movq, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + text_input_ids = text_input_ids.to(device) + text_mask = text_inputs.attention_mask.to(device) + + prompt_embeds, text_encoder_hidden_states = self.text_encoder( + input_ids=text_input_ids, attention_mask=text_mask + ) + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got 
{type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=77, + truncation=True, + return_attention_mask=True, + add_special_tokens=True, + return_tensors="pt", + ) + uncond_text_input_ids = uncond_input.input_ids.to(device) + uncond_text_mask = uncond_input.attention_mask.to(device) + + negative_prompt_embeds, uncond_text_encoder_hidden_states = self.text_encoder( + input_ids=uncond_text_input_ids, attention_mask=uncond_text_mask + ) + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.text_encoder, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. 
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.movq]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + image_embeds: torch.FloatTensor, + negative_image_embeds: torch.FloatTensor, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image` or `np.ndarray`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. + mask_image (`PIL.Image.Image`,`torch.FloatTensor` or `np.ndarray`): + `Image`, or a tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. You can pass a pytorch tensor as mask only if the + image you passed is a pytorch tensor, and it should contain one color channel (L) instead of 3, so the + expected shape would be either `(B, 1, H, W,)`, `(B, H, W)`, `(1, H, W)` or `(H, W)` If image is an PIL + image or numpy array, mask should also be a either PIL image or numpy array. If it is a PIL image, it + will be converted to a single channel (luminance) before use. If it is a nummpy array, the expected + shape is `(H, W)`. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. 
+ negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for the negative text prompt, that will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. A higher guidance scale encourages generating images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + + # Define call parameters + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + do_classifier_free_guidance = guidance_scale > 1.0 + + prompt_embeds, text_encoder_hidden_states, _ = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to( + dtype=prompt_embeds.dtype, device=device + ) + + # preprocess image and mask + mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) + + image = image.to(dtype=prompt_embeds.dtype, device=device) + image = self.movq.encode(image)["latents"] + + mask_image = mask_image.to(dtype=prompt_embeds.dtype, device=device) + + image_shape = tuple(image.shape[-2:]) + mask_image = F.interpolate( + mask_image, + image_shape, + mode="nearest", + ) + mask_image = prepare_mask(mask_image) + masked_image = image * mask_image + + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + masked_image = masked_image.repeat_interleave(num_images_per_prompt, dim=0) + if do_classifier_free_guidance: + mask_image = mask_image.repeat(2, 1, 1, 1) + masked_image = masked_image.repeat(2, 1, 1, 1) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.movq.config.latent_channels + + # get h, w for latents + sample_height, sample_width = get_new_h_w(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, sample_height, sample_width), + text_encoder_hidden_states.dtype, + device, + generator, + latents, + self.scheduler, + ) + + # Check that sizes of mask, masked image and latents match with expected + num_channels_mask = mask_image.shape[1] + num_channels_masked_image = masked_image.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." 
+ ) + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1) + + added_cond_kwargs = {"text_embeds": prompt_embeds, "image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=text_encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + ).sample + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + ).prev_sample + + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py new file mode 100644 index 000000000000..d9474b43da54 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -0,0 +1,563 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import PriorTransformer +from ...pipelines import DiffusionPipeline +from ...schedulers import UnCLIPScheduler +from ...utils import ( + BaseOutput, + is_accelerate_available, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior") + >>> pipe_prior.to("cuda") + + >>> prompt = "red cat, 4k photo" + >>> out = pipe_prior(prompt) + >>> image_emb = out.images + >>> zero_image_emb = out.zero_embeds + + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1") + >>> pipe.to("cuda") + + >>> image = pipe( + ... prompt, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... ).images + + >>> image[0].save("cat.png") + ``` +""" + +EXAMPLE_INTERPOLATE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyPriorPipeline, KandinskyPipeline + >>> from diffusers.utils import load_image + >>> import PIL + + >>> import torch + >>> from torchvision import transforms + + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> img1 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + + >>> img2 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg" + ... ) + + >>> images_texts = ["a cat", img1, img2] + >>> weights = [0.3, 0.3, 0.4] + >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) + + >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) + >>> pipe.to("cuda") + + >>> image = pipe( + ... "", + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=150, + ... ).images[0] + + >>> image.save("starry_cat.png") + ``` +""" + + +@dataclass +class KandinskyPriorPipelineOutput(BaseOutput): + """ + Output class for KandinskyPriorPipeline. + + Args: + images (`torch.FloatTensor`) + clip image embeddings for text prompt + zero_embeds (`List[PIL.Image.Image]` or `np.ndarray`) + clip image embeddings for unconditional tokens + """ + + images: Union[torch.FloatTensor, np.ndarray] + zero_embeds: Union[torch.FloatTensor, np.ndarray] + + +class KandinskyPriorPipeline(DiffusionPipeline): + """ + Pipeline for generating image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. 
+ text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + """ + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModelWithProjection, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: UnCLIPScheduler, + image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + image_encoder=image_encoder, + image_processor=image_processor, + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) + def interpolate( + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prior_prompt: Optional[str] = None, + negative_prompt: Union[str] = "", + guidance_scale: float = 4.0, + device=None, + ): + """ + Function invoked when using the prior pipeline for interpolation. + + Args: + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + list of prompts and images to guide the image generation. + weights: (`List[float]`): + list of weights for each condition in `images_and_prompts` + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prior_prompt (`str`, *optional*): + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
+ + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + device = device or self.device + + if len(images_and_prompts) != len(weights): + raise ValueError( + f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length" + ) + + image_embeddings = [] + for cond, weight in zip(images_and_prompts, weights): + if isinstance(cond, str): + image_emb = self.__call__( + cond, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ).images + + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): + if isinstance(cond, PIL.Image.Image): + cond = ( + self.image_processor(cond, return_tensors="pt") + .pixel_values[0] + .unsqueeze(0) + .to(dtype=self.image_encoder.dtype, device=device) + ) + + image_emb = self.image_encoder(cond)["image_embeds"] + + else: + raise ValueError( + f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}" + ) + + image_embeddings.append(image_emb * weight) + + image_emb = torch.cat(image_embeddings).sum(dim=0, keepdim=True) + + out_zero = self.__call__( + negative_prompt, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ) + zero_image_emb = out_zero.zero_embeds if negative_prompt == "" else out_zero.images + + return image_emb, zero_image_emb + + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def get_zero_embed(self, batch_size=1, device=None): + device = device or self.device + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( + device=device, dtype=self.image_encoder.dtype + ) + zero_image_emb = self.image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size, 1) + return zero_image_emb + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.image_encoder, + self.text_encoder, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"): + return self.device + for module in self.text_encoder.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", # pt only + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). 
+ guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + if isinstance(prompt, str): + batch_size = 1 + elif isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + device = self._execution_device + + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # prior + self.scheduler.set_timesteps(num_inference_steps, device=device) + prior_timesteps_tensor = self.scheduler.timesteps + + embedding_dim = self.prior.config.embedding_dim + + latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == prior_timesteps_tensor.shape[0]: + prev_timestep = None + else: + prev_timestep = prior_timesteps_tensor[i + 1] + + latents = self.scheduler.step( + predicted_image_embedding, + timestep=t, + sample=latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + latents = self.prior.post_process_latents(latents) + + image_embeddings = latents + zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + + if output_type not in ["pt", "np"]: + raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") + + if output_type == "np": + image_embeddings = image_embeddings.cpu().numpy() + zero_embeds = zero_embeds.cpu().numpy() + + if not return_dict: + return (image_embeddings, zero_embeds) + + return KandinskyPriorPipelineOutput(images=image_embeddings, zero_embeds=zero_embeds) diff --git a/src/diffusers/pipelines/kandinsky/text_encoder.py b/src/diffusers/pipelines/kandinsky/text_encoder.py new file mode 100644 index 000000000000..caa0029f00ca --- /dev/null +++ 
b/src/diffusers/pipelines/kandinsky/text_encoder.py @@ -0,0 +1,27 @@ +import torch +from transformers import PreTrainedModel, XLMRobertaConfig, XLMRobertaModel + + +class MCLIPConfig(XLMRobertaConfig): + model_type = "M-CLIP" + + def __init__(self, transformerDimSize=1024, imageDimSize=768, **kwargs): + self.transformerDimensions = transformerDimSize + self.numDims = imageDimSize + super().__init__(**kwargs) + + +class MultilingualCLIP(PreTrainedModel): + config_class = MCLIPConfig + + def __init__(self, config, *args, **kwargs): + super().__init__(config, *args, **kwargs) + self.transformer = XLMRobertaModel(config) + self.LinearTransformation = torch.nn.Linear( + in_features=config.transformerDimensions, out_features=config.numDims + ) + + def forward(self, input_ids, attention_mask): + embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0] + embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None] + return self.LinearTransformation(embs2), embs diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 29cde43337d2..af647fe810aa 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -15,7 +15,14 @@ AttnProcessor, ) from ...models.dual_transformer_2d import DualTransformer2DModel -from ...models.embeddings import GaussianFourierProjection, TextTimeEmbedding, TimestepEmbedding, Timesteps +from ...models.embeddings import ( + GaussianFourierProjection, + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) from ...models.transformer_2d import Transformer2DModel from ...models.unet_2d_condition import UNet2DConditionOutput from ...utils import is_torch_version, logging @@ -182,7 +189,11 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): The dimension of the cross attention features. encoder_hid_dim (`int`, *optional*, defaults to None): - If given, `encoder_hidden_states` will be projected from this dimension to `cross_attention_dim`. + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to None): + If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlockFlat`]. Choose from `default` or `scale_shift`. 
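The `MultilingualCLIP` text encoder added in `text_encoder.py` above pools the XLM-R token embeddings with the attention mask before projecting them into the embedding space used by the rest of the pipeline. Below is a minimal sketch of that pooling and projection, using toy tensors in place of real XLM-R outputs and placeholder dimensions (8 and 4) instead of the checkpoint's `transformerDimensions`/`numDims`.

```py
import torch

# Toy stand-ins for XLM-R outputs: batch of 2, 5 tokens, hidden size 8 (placeholder sizes).
token_embeddings = torch.randn(2, 5, 8)
attention_mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])

# Masked mean pooling as in `MultilingualCLIP.forward`: zero out padded positions,
# sum over the sequence, then divide by the number of real tokens per sequence.
pooled = (token_embeddings * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None]

# `LinearTransformation` then maps the pooled vector from `transformerDimensions`
# to `numDims` (here 8 -> 4 as placeholder values).
projection = torch.nn.Linear(8, 4)(pooled)
print(projection.shape)  # torch.Size([2, 4])
```

The pipelines use the projected pooled vector as `prompt_embeds` and the raw token embeddings as `text_encoder_hidden_states`.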
@@ -253,6 +264,7 @@ def __init__( norm_eps: float = 1e-5, cross_attention_dim: Union[int, Tuple[int]] = 1280, encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, attention_head_dim: Union[int, Tuple[int]] = 8, dual_cross_attention: bool = False, use_linear_projection: bool = False, @@ -350,8 +362,31 @@ def __init__( cond_proj_dim=time_cond_proj_dim, ) - if encoder_hid_dim is not None: + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.") + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) else: self.encoder_hid_proj = None @@ -393,8 +428,15 @@ def __init__( self.add_embedding = TextTimeEmbedding( text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads ) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim + ) elif addition_embed_type is not None: - raise ValueError(f"addition_embed_type: {addition_embed_type} must be None or 'text'.") + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") if time_embedding_act_fn is None: self.time_embed_act = None @@ -719,6 +761,7 @@ def forward( timestep_cond: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, mid_block_additional_residual: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, @@ -739,6 +782,10 @@ def forward( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + added_cond_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified includes additonal conditions that can be used for additonal time + embeddings or encoder hidden states projections. 
See the configurations `encoder_hid_dim_type` and + `addition_embed_type` for more information. Returns: [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: @@ -831,12 +878,35 @@ def forward( if self.config.addition_embed_type == "text": aug_emb = self.add_embedding(encoder_hidden_states) emb = emb + aug_emb + elif self.config.addition_embed_type == "text_image": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires" + " the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + + image_embs = added_cond_kwargs.get("image_embeds") + text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) + + aug_emb = self.add_embedding(text_embs, image_embs) + emb = emb + aug_emb if self.time_embed_act is not None: emb = self.time_embed_act(emb) - if self.encoder_hid_proj is not None: + if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj": encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states) + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj": + # Kadinsky 2.1 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which" + " requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) # 2. pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 4c6c595c41d8..ea6a61cf7587 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -152,6 +152,66 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class KandinskyImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyInpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyPriorPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + 
@classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LDMTextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/kandinsky/__init__.py b/tests/pipelines/kandinsky/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py new file mode 100644 index 000000000000..8f7d5ae2019c --- /dev/null +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -0,0 +1,282 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from transformers import XLMRobertaTokenizerFast + +from diffusers import DDIMScheduler, KandinskyPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel +from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP +from diffusers.utils import floats_tensor, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyPipeline + params = [ + "prompt", + "image_embeds", + "negative_image_embeds", + ] + batch_params = ["prompt", "negative_prompt", "image_embeds", "negative_image_embeds"] + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "negative_prompt", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = MCLIPConfig( + numDims=self.cross_attention_dim, + transformerDimensions=self.text_embedder_hidden_size, + hidden_size=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_hidden_layers=5, + vocab_size=1005, + ) + + text_encoder = MultilingualCLIP(config) + text_encoder = text_encoder.eval() + + return text_encoder + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 4, + # Out channels is double in channels because predicts mean and 
variance + "out_channels": 8, + "addition_embed_type": "text_image", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "text_image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + unet = self.dummy_unet + movq = self.dummy_movq + + scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, + prediction_type="epsilon", + thresholding=False, + ) + + components = { + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.328663, 1.0, 0.23216873, 1.0, 0.92717564, 0.4639046, 0.96894777, 0.31713378, 0.6293953] + ) + + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyPipelineIntegrationTests(unittest.TestCase): + def 
tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_text2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/kandinsky_text2img_cat_fp16.npy" + ) + + pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + prompt = "red cat, 4k photo" + + generator = torch.Generator(device="cuda").manual_seed(0) + image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + ).images + zero_image_emb = pipe_prior("", num_inference_steps=5).images + + generator = torch.Generator(device="cuda").manual_seed(0) + output = pipeline( + prompt, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py new file mode 100644 index 000000000000..6958403ae11c --- /dev/null +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -0,0 +1,303 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
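The fast tests above do not compare the full generated image; they check a 3x3 corner slice of the last channel against a hard-coded reference, while the slow tests compare against a stored `.npy` image via `assert_mean_pixel_difference`. A minimal sketch of the slice pattern, with a synthetic array standing in for real pipeline output, is:

```py
import numpy as np

# Synthetic stand-in for pipeline output with output_type="np": (batch, height, width, channels).
image = np.random.RandomState(0).rand(1, 64, 64, 3).astype(np.float32)

# The tests read the bottom-right 3x3 patch of the last channel ...
image_slice = image[0, -3:, -3:, -1]

# ... and compare it to an expected_slice captured from a known-good run.
expected_slice = image_slice.copy()  # placeholder; the real tests store 9 literal floats

assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2, (
    f"expected_slice {expected_slice}, but got {image_slice.flatten()}"
)
```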
+ +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image +from transformers import XLMRobertaTokenizerFast + +from diffusers import DDIMScheduler, KandinskyImg2ImgPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel +from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyImg2ImgPipeline + params = ["prompt", "image_embeds", "negative_image_embeds", "image"] + batch_params = [ + "prompt", + "negative_prompt", + "image_embeds", + "negative_image_embeds", + "image", + ] + required_optional_params = [ + "generator", + "height", + "width", + "strength", + "guidance_scale", + "negative_prompt", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = MCLIPConfig( + numDims=self.cross_attention_dim, + transformerDimensions=self.text_embedder_hidden_size, + hidden_size=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_hidden_layers=5, + vocab_size=1005, + ) + + text_encoder = MultilingualCLIP(config) + text_encoder = text_encoder.eval() + + return text_encoder + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 4, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "text_image", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "text_image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = 
VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + unet = self.dummy_unet + movq = self.dummy_movq + + ddim_config = { + "num_train_timesteps": 1000, + "beta_schedule": "linear", + "beta_start": 0.00085, + "beta_end": 0.012, + "clip_sample": False, + "set_alpha_to_one": False, + "steps_offset": 0, + "prediction_type": "epsilon", + "thresholding": False, + } + + scheduler = DDIMScheduler(**ddim_config) + + components = { + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "image": init_image, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 10, + "guidance_scale": 7.0, + "strength": 0.2, + "output_type": "np", + } + return inputs + + def test_kandinsky_img2img(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.61474943, 0.6073539, 0.43308544, 0.5928269, 0.47493595, 0.46755973, 0.4613838, 0.45368797, 0.50119233] + ) + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyImg2ImgPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_img2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/kandinsky_img2img_frog.npy" + ) + + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + prompt = "A red cartoon frog, 4k" + + pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyImg2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + + 
pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + ).images + zero_image_emb = pipe_prior("", num_inference_steps=5).images + + output = pipeline( + prompt, + image=init_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + height=768, + width=768, + strength=0.2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (768, 768, 3) + + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py new file mode 100644 index 000000000000..1bca753bec18 --- /dev/null +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ -0,0 +1,313 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image +from transformers import XLMRobertaTokenizerFast + +from diffusers import DDIMScheduler, KandinskyInpaintPipeline, KandinskyPriorPipeline, UNet2DConditionModel, VQModel +from diffusers.pipelines.kandinsky.text_encoder import MCLIPConfig, MultilingualCLIP +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyInpaintPipeline + params = ["prompt", "image_embeds", "negative_image_embeds", "image", "mask_image"] + batch_params = [ + "prompt", + "negative_prompt", + "image_embeds", + "negative_image_embeds", + "image", + "mask_image", + ] + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "negative_prompt", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = XLMRobertaTokenizerFast.from_pretrained("YiYiXu/tiny-random-mclip-base") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = MCLIPConfig( + numDims=self.cross_attention_dim, + transformerDimensions=self.text_embedder_hidden_size, + hidden_size=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + 
num_hidden_layers=5, + vocab_size=1005, + ) + + text_encoder = MultilingualCLIP(config) + text_encoder = text_encoder.eval() + + return text_encoder + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 9, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "text_image", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "text_image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + unet = self.dummy_unet + movq = self.dummy_movq + + scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, + prediction_type="epsilon", + thresholding=False, + ) + + components = { + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.cross_attention_dim), rng=random.Random(seed + 1)).to(device) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + # create mask + mask = np.ones((64, 64), dtype=np.float32) + mask[:32, :32] = 0 + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "image": init_image, + "mask_image": mask, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 2, + "guidance_scale": 4.0, + "output_type": "np", + } + return inputs + + def test_kandinsky_inpaint(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + 
image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + print(f"image.shape {image.shape}") + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.8326919, 0.73790467, 0.20918581, 0.9309612, 0.5511791, 0.43713328, 0.5513321, 0.49922934, 0.59497786] + ) + + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(expected_max_diff=3e-3) + + +@slow +@require_torch_gpu +class KandinskyInpaintPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_inpaint(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinsky/kandinsky_inpaint_cat_with_hat_fp16.npy" + ) + + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + mask = np.ones((768, 768), dtype=np.float32) + mask[:250, 250:-250] = 0 + + prompt = "a hat" + + pipe_prior = KandinskyPriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyInpaintPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + ).images + zero_image_emb = pipe_prior("").images + + output = pipeline( + prompt, + image=init_image, + mask_image=mask, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + height=768, + width=768, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (768, 768, 3) + + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py new file mode 100644 index 000000000000..5ed1f2ac984d --- /dev/null +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -0,0 +1,236 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +import torch +from torch import nn +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from diffusers import KandinskyPriorPipeline, PriorTransformer, UnCLIPScheduler +from diffusers.utils import torch_device +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps + +from ..test_pipelines_common import PipelineTesterMixin + + +enable_full_determinism() + + +class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyPriorPipeline + params = ["prompt"] + batch_params = ["prompt", "negative_prompt"] + required_optional_params = [ + "num_images_per_prompt", + "generator", + "num_inference_steps", + "latents", + "negative_prompt", + "guidance_scale", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=self.text_embedder_hidden_size, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + return CLIPTextModelWithProjection(config) + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + "attention_head_dim": 12, + "embedding_dim": self.text_embedder_hidden_size, + "num_layers": 1, + } + + model = PriorTransformer(**model_kwargs) + # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0 + model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape)) + return model + + @property + def dummy_image_encoder(self): + torch.manual_seed(0) + config = CLIPVisionConfig( + hidden_size=self.text_embedder_hidden_size, + image_size=224, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=14, + ) + + model = CLIPVisionModelWithProjection(config) + return model + + @property + def dummy_image_processor(self): + image_processor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) + + return image_processor + + def get_dummy_components(self): + prior = self.dummy_prior + image_encoder = self.dummy_image_encoder + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + image_processor = self.dummy_image_processor + + scheduler = UnCLIPScheduler( + variance_type="fixed_small_log", + prediction_type="sample", + num_train_timesteps=1000, + clip_sample=True, + clip_sample_range=10.0, + ) + + components = { + "prior": prior, + "image_encoder": image_encoder, + "text_encoder": 
text_encoder, + "tokenizer": tokenizer, + "scheduler": scheduler, + "image_processor": image_processor, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "generator": generator, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_prior(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -10:] + image_from_tuple_slice = image_from_tuple[0, -10:] + + assert image.shape == (1, 32) + + expected_slice = np.array( + [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + @skip_mps + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + test_mean_pixel_difference = False + + self._test_inference_batch_single_identical( + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) + + @skip_mps + def test_attention_slicing_forward_pass(self): + test_max_difference = torch_device == "cpu" + test_mean_pixel_difference = False + + self._test_attention_slicing_forward_pass( + test_max_difference=test_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 3984ed76edce..3ddfd35defb7 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -287,7 +287,7 @@ def _test_inference_batch_consistent( for arg in additional_params_copy_to_batched_inputs: batched_inputs[arg] = inputs[arg] - batched_inputs["output_type"] = None + batched_inputs["output_type"] = "np" if self.pipeline_class.__name__ == "DanceDiffusionPipeline": batched_inputs.pop("output_type") From e5215dee9acbe5d6105f0c40744c4fd676f770bf Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 25 May 2023 14:55:31 -1000 Subject: [PATCH 028/199] fix broken change for vq pipeline (#3563) fix vq_model Co-authored-by: yiyixuxu --- src/diffusers/models/unet_2d_blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index e96f33356870..674e58d7180e 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -2211,7 +2211,7 @@ def __init__( dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, rescale_output_factor=output_scale_factor, eps=resnet_eps, - norm_num_groups=resnet_groups if resnet_time_scale_shift == "default" else None, + norm_num_groups=resnet_groups if resnet_time_scale_shift != "spatial" else None, spatial_norm_dim=temb_channels if resnet_time_scale_shift == "spatial" else None, residual_connection=True, bias=True, From d114d80fd2b5a1b7c5f3d2ec67b92a442d8f18f9 Mon Sep 17 
00:00:00 2001 From: Patrick von Platen Date: Fri, 26 May 2023 10:47:42 +0200 Subject: [PATCH 029/199] [Stable Diffusion Inpainting] Allow standard text-to-img checkpoints to be useable for SD inpainting (#3533) * Add default to inpaint * Make sure controlnet also works with normal sd for inpaint * Add tests * improve * Correct encode images function * Correct inpaint controlnet * Improve text2img inpanit * make style * up * up * up * up * fix more --- .../controlnet/pipeline_controlnet_inpaint.py | 184 ++++++++++++------ .../pipeline_paint_by_example.py | 27 +-- .../pipeline_stable_diffusion_inpaint.py | 129 +++++++----- ...ipeline_stable_diffusion_inpaint_legacy.py | 8 +- .../controlnet/test_controlnet_inpaint.py | 129 ++++++++++++ .../test_stable_diffusion_inpaint.py | 92 +++++++++ 6 files changed, 456 insertions(+), 113 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 27475dc5ef8b..83ddd51c02f7 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -50,49 +50,59 @@ EXAMPLE_DOC_STRING = """ Examples: ```py - >>> # !pip install opencv-python transformers accelerate - >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler + >>> # !pip install transformers accelerate + >>> from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, DDIMScheduler >>> from diffusers.utils import load_image >>> import numpy as np >>> import torch - >>> import cv2 - >>> from PIL import Image + >>> init_image = load_image( + ... "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png" + ... ) + >>> init_image = init_image.resize((512, 512)) + + >>> generator = torch.Generator(device="cpu").manual_seed(1) + + >>> mask_image = load_image( + ... "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png" + ... ) + >>> mask_image = mask_image.resize((512, 512)) + - >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" - >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + >>> def make_inpaint_condition(image, image_mask): + ... image = np.array(image.convert("RGB")).astype(np.float32) / 255.0 + ... image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0 - >>> init_image = load_image(img_url).resize((512, 512)) - >>> mask_image = load_image(mask_url).resize((512, 512)) + ... assert image.shape[0:1] == image_mask.shape[0:1], "image and image_mask must have the same image size" + ... image[image_mask > 0.5] = -1.0 # set as masked pixel + ... image = np.expand_dims(image, 0).transpose(0, 3, 1, 2) + ... image = torch.from_numpy(image) + ... 
return image - >>> image = np.array(init_image) - >>> # get canny image - >>> image = cv2.Canny(image, 100, 200) - >>> image = image[:, :, None] - >>> image = np.concatenate([image, image, image], axis=2) - >>> canny_image = Image.fromarray(image) + >>> control_image = make_inpaint_condition(init_image, mask_image) - >>> # load control net and stable diffusion inpainting - >>> controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16) + >>> controlnet = ControlNetModel.from_pretrained( + ... "lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16 + ... ) >>> pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained( - ... "runwayml/stable-diffusion-inpainting", controlnet=controlnet, torch_dtype=torch.float16 + ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 ... ) >>> # speed up diffusion process with faster scheduler and memory optimization - >>> pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) >>> pipe.enable_model_cpu_offload() >>> # generate image - >>> generator = torch.manual_seed(0) >>> image = pipe( - ... "spiderman", - ... num_inference_steps=30, + ... "a beautiful man", + ... num_inference_steps=20, ... generator=generator, + ... eta=1.0, ... image=init_image, ... mask_image=mask_image, - ... control_image=canny_image, + ... control_image=control_image, ... ).images[0] ``` """ @@ -226,6 +236,17 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, TextualInversi In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + + + This pipeline can be used both with checkpoints that have been specifically fine-tuned for inpainting, such as + [runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting) + as well as default text-to-image stable diffusion checkpoints, such as + [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5). + Default text-to-image stable diffusion checkpoints might be preferable for controlnets that have been fine-tuned on + those, such as [lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint). + + + Args: vae ([`AutoencoderKL`]): Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. 
@@ -597,6 +618,16 @@ def prepare_extra_step_kwargs(self, generator, eta): extra_step_kwargs["generator"] = generator return extra_step_kwargs + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + def check_inputs( self, prompt, @@ -812,6 +843,8 @@ def prepare_latents( image=None, timestep=None, is_strength_max=True, + return_noise=False, + return_image_latents=False, ): shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) if isinstance(generator, list) and len(generator) != batch_size: @@ -826,32 +859,28 @@ def prepare_latents( "However, either the image or the noise timestep has not been provided." ) + if return_image_latents or (latents is None and not is_strength_max): + image = image.to(device=device, dtype=dtype) + image_latents = self._encode_vae_image(image=image, generator=generator) + if latents is None: noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - if is_strength_max: - # if strength is 100% then simply initialise the latents to noise - latents = noise - else: - # otherwise initialise latents as init image + noise - image = image.to(device=device, dtype=dtype) - if isinstance(generator, list): - image_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i]) - for i in range(batch_size) - ] - else: - image_latents = self.vae.encode(image).latent_dist.sample(generator=generator) - - image_latents = self.vae.config.scaling_factor * image_latents - - latents = self.scheduler.add_noise(image_latents, noise, timestep) + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) else: latents = latents.to(device) # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma - return latents + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + return outputs def _default_height_width(self, height, width, image): # NOTE: It is possible that a list of images have different @@ -891,17 +920,7 @@ def prepare_mask_latents( mask = mask.to(device=device, dtype=dtype) masked_image = masked_image.to(device=device, dtype=dtype) - - # encode the mask image into latents space so we can concatenate it to the latents - if isinstance(generator, list): - masked_image_latents = [ - self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) - for i in range(batch_size) - ] - masked_image_latents = torch.cat(masked_image_latents, dim=0) - else: - masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) - masked_image_latents = self.vae.config.scaling_factor * masked_image_latents + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method if mask.shape[0] < batch_size: @@ -930,6 +949,21 @@ def prepare_mask_latents( masked_image_latents = masked_image_latents.to(device=device, 
dtype=dtype) return mask, masked_image_latents + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.vae.encode(image).latent_dist.sample(generator=generator) + + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents + # override DiffusionPipeline def save_pretrained( self, @@ -954,6 +988,7 @@ def __call__( ] = None, height: Optional[int] = None, width: Optional[int] = None, + strength: float = 1.0, num_inference_steps: int = 50, guidance_scale: float = 7.5, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -990,6 +1025,13 @@ def __call__( The height in pixels of the generated image. width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): The width in pixels of the generated image. + strength (`float`, *optional*, defaults to 1.): + Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be + between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the + `strength`. The number of denoising steps depends on the amount of noise initially added. When + `strength` is 1, added noise will be maximum and the denoising process will run for the full number of + iterations specified in `num_inference_steps`. A value of 1, therefore, essentially ignores the masked + portion of the reference `image`. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. @@ -1145,13 +1187,25 @@ def __call__( assert False # 4. Preprocess mask and image - resizes image and mask w.r.t height and width + mask, masked_image, init_image = prepare_mask_and_masked_image( + image, mask_image, height, width, return_image=True + ) + # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) - timesteps = self.scheduler.timesteps + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps=num_inference_steps, strength=strength, device=device + ) + # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise + is_strength_max = strength == 1.0 # 6. Prepare latent variables num_channels_latents = self.vae.config.latent_channels - latents = self.prepare_latents( + num_channels_unet = self.unet.config.in_channels + return_image_latents = num_channels_unet == 4 + latents_outputs = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, height, @@ -1160,10 +1214,19 @@ def __call__( device, generator, latents, + image=init_image, + timestep=latent_timestep, + is_strength_max=is_strength_max, + return_noise=True, + return_image_latents=return_image_latents, ) + if return_image_latents: + latents, noise, image_latents = latents_outputs + else: + latents, noise = latents_outputs + # 7. 
Prepare mask latent variables - mask, masked_image = prepare_mask_and_masked_image(image, mask_image, height, width) mask, masked_image_latents = self.prepare_mask_latents( mask, masked_image, @@ -1213,7 +1276,9 @@ def __call__( mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) # predict the noise residual - latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + if num_channels_unet == 9: + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + noise_pred = self.unet( latent_model_input, t, @@ -1232,6 +1297,15 @@ def __call__( # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + if num_channels_unet == 4: + init_latents_proper = image_latents[:1] + init_mask = mask[:1] + + if i < len(timesteps) - 1: + init_latents_proper = self.scheduler.add_noise(init_latents_proper, noise, torch.tensor([t])) + + latents = (1 - init_mask) * init_latents_proper + init_mask * latents + # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() diff --git a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py index 24b05f36f913..c8f3e8a9ee11 100644 --- a/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py +++ b/src/diffusers/pipelines/paint_by_example/pipeline_paint_by_example.py @@ -328,17 +328,7 @@ def prepare_mask_latents( mask = mask.to(device=device, dtype=dtype) masked_image = masked_image.to(device=device, dtype=dtype) - - # encode the mask image into latents space so we can concatenate it to the latents - if isinstance(generator, list): - masked_image_latents = [ - self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) - for i in range(batch_size) - ] - masked_image_latents = torch.cat(masked_image_latents, dim=0) - else: - masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) - masked_image_latents = self.vae.config.scaling_factor * masked_image_latents + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method if mask.shape[0] < batch_size: @@ -367,6 +357,21 @@ def prepare_mask_latents( masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) return mask, masked_image_latents + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint.StableDiffusionInpaintPipeline._encode_vae_image + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.vae.encode(image).latent_dist.sample(generator=generator) + + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents + def _encode_image(self, image, device, num_images_per_prompt, do_classifier_free_guidance): dtype = next(self.image_encoder.parameters()).dtype diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py 
b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
index f09db016d956..5dbac9295800 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py
@@ -155,7 +155,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
 
 class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
     r"""
-    Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*.
+    Pipeline for text-guided image inpainting using Stable Diffusion.
 
     This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
     library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
@@ -167,6 +167,16 @@ class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMi
     as well as the following saving methods:
     - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
 
+
+
+    It is recommended to use this pipeline with checkpoints that have been specifically fine-tuned for inpainting, such
+    as [runwayml/stable-diffusion-inpainting](https://huggingface.co/runwayml/stable-diffusion-inpainting). Default
+    text-to-image stable diffusion checkpoints, such as
+    [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5), are also compatible with
+    this pipeline, but might be less performant.
+
+
+
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
@@ -266,14 +276,10 @@ def __init__(
                 new_config = dict(unet.config)
                 new_config["sample_size"] = 64
                 unet._internal_dict = FrozenDict(new_config)
+        # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4
         if unet.config.in_channels != 9:
-            logger.warning(
-                f"You have loaded a UNet with {unet.config.in_channels} input channels, whereas by default,"
-                f" {self.__class__} assumes that `pipeline.unet` has 9 input channels: 4 for `num_channels_latents`,"
-                " 1 for `num_channels_mask`, and 4 for `num_channels_masked_image`. If you did not intend to modify"
-                " this behavior, please check whether you have loaded the right checkpoint."
-            )
+            logger.info(f"You have loaded a UNet with {unet.config.in_channels} input channels, which differs from the default 9 input channels used by inpainting-specific checkpoints.")
 
         self.register_modules(
             vae=vae,
@@ -620,6 +626,8 @@ def prepare_latents(
         image=None,
         timestep=None,
         is_strength_max=True,
+        return_noise=False,
+        return_image_latents=False,
     ):
         shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
         if isinstance(generator, list) and len(generator) != batch_size:
@@ -634,32 +642,42 @@ def prepare_latents(
                 "However, either the image or the noise timestep has not been provided."
) + if return_image_latents or (latents is None and not is_strength_max): + image = image.to(device=device, dtype=dtype) + image_latents = self._encode_vae_image(image=image, generator=generator) + if latents is None: noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - if is_strength_max: - # if strength is 100% then simply initialise the latents to noise - latents = noise - else: - # otherwise initialise latents as init image + noise - image = image.to(device=device, dtype=dtype) - if isinstance(generator, list): - image_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i]) - for i in range(batch_size) - ] - else: - image_latents = self.vae.encode(image).latent_dist.sample(generator=generator) - - image_latents = self.vae.config.scaling_factor * image_latents - - latents = self.scheduler.add_noise(image_latents, noise, timestep) + latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) else: latents = latents.to(device) # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma - return latents + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + return outputs + + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i]) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.vae.encode(image).latent_dist.sample(generator=generator) + + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents def prepare_mask_latents( self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance @@ -673,17 +691,7 @@ def prepare_mask_latents( mask = mask.to(device=device, dtype=dtype) masked_image = masked_image.to(device=device, dtype=dtype) - - # encode the mask image into latents space so we can concatenate it to the latents - if isinstance(generator, list): - masked_image_latents = [ - self.vae.encode(masked_image[i : i + 1]).latent_dist.sample(generator=generator[i]) - for i in range(batch_size) - ] - masked_image_latents = torch.cat(masked_image_latents, dim=0) - else: - masked_image_latents = self.vae.encode(masked_image).latent_dist.sample(generator=generator) - masked_image_latents = self.vae.config.scaling_factor * masked_image_latents + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method if mask.shape[0] < batch_size: @@ -916,7 +924,10 @@ def __call__( # 6. Prepare latent variables num_channels_latents = self.vae.config.latent_channels - latents = self.prepare_latents( + num_channels_unet = self.unet.config.in_channels + return_image_latents = num_channels_unet == 4 + + latents_outputs = self.prepare_latents( batch_size * num_images_per_prompt, num_channels_latents, height, @@ -928,8 +939,15 @@ def __call__( image=init_image, timestep=latent_timestep, is_strength_max=is_strength_max, + return_noise=True, + return_image_latents=return_image_latents, ) + if return_image_latents: + latents, noise, image_latents = latents_outputs + else: + latents, noise = latents_outputs + # 7. 
Prepare mask latent variables mask, masked_image_latents = self.prepare_mask_latents( mask, @@ -942,17 +960,25 @@ def __call__( generator, do_classifier_free_guidance, ) + init_image = init_image.to(device=device, dtype=masked_image_latents.dtype) + init_image = self._encode_vae_image(init_image, generator=generator) # 8. Check that sizes of mask, masked image and latents match - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: + if num_channels_unet == 9: + # default case for runwayml/stable-diffusion-inpainting + num_channels_mask = mask.shape[1] + num_channels_masked_image = masked_image_latents.shape[1] + if num_channels_latents + num_channels_mask + num_channels_masked_image != self.unet.config.in_channels: + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + elif num_channels_unet != 4: raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" - f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" - f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input." + f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}." ) # 9. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline
@@ -967,7 +993,9 @@ def __call__(
 
                 # concat latents, mask, masked_image_latents in the channel dimension
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
+
+                if num_channels_unet == 9:
+                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)
 
                 # predict the noise residual
                 noise_pred = self.unet(
@@ -986,6 +1014,15 @@ def __call__(
                 # compute the previous noisy sample x_t -> x_t-1
                 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
+                if num_channels_unet == 4:
+                    init_latents_proper = image_latents[:1]
+                    init_mask = mask[:1]
+
+                    if i < len(timesteps) - 1:
+                        init_latents_proper = self.scheduler.add_noise(init_latents_proper, noise, torch.tensor([t]))
+
+                    latents = (1 - init_mask) * init_latents_proper + init_mask * latents
+
                 # call the callback, if provided
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
index 5a2329a5c51f..c549d869e685 100644
--- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
+++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py
@@ -123,7 +123,6 @@ class StableDiffusionInpaintPipelineLegacy(
     """
     _optional_components = ["feature_extractor"]
 
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.__init__
     def __init__(
         self,
         vae: AutoencoderKL,
@@ -137,6 +136,13 @@ def __init__(
     ):
         super().__init__()
 
+        deprecation_message = (
+            f"The class {self.__class__} is deprecated and will be removed in v1.0.0. You can achieve exactly the same functionality"
+            " by loading your model into `StableDiffusionInpaintPipeline` instead. See https://github.com/huggingface/diffusers/pull/3533"
+            " for more information."
+        )
+        deprecate("legacy is outdated", "1.0.0", deprecation_message, standard_warn=False)
+
         if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
             deprecation_message = (
                 f"The configuration file of this scheduler: {scheduler} is outdated. 
`steps_offset`" diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index 155286630c04..f8cc881e8650 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -163,6 +163,78 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) +class ControlNetSimpleInpaintPipelineFastTests(ControlNetInpaintPipelineFastTests): + pipeline_class = StableDiffusionControlNetInpaintPipeline + params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS + batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS + image_params = frozenset([]) + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ) + torch.manual_seed(0) + controlnet = ControlNetModel( + block_out_channels=(32, 64), + layers_per_block=2, + in_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + cross_attention_dim=32, + conditioning_embedding_out_channels=(16, 32), + ) + torch.manual_seed(0) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "controlnet": controlnet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + } + return components + + class MultiControlNetInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionControlNetInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS @@ -376,3 +448,60 @@ def test_canny(self): ) assert np.abs(expected_image - image).max() < 9e-2 + + def test_inpaint(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint") + + pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.enable_model_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(33) + + init_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy.png" + ) + init_image = init_image.resize((512, 512)) + + mask_image = load_image( + "https://huggingface.co/datasets/diffusers/test-arrays/resolve/main/stable_diffusion_inpaint/boy_mask.png" + ) + mask_image = mask_image.resize((512, 512)) + + prompt = "a handsome man 
with ray-ban sunglasses" + + def make_inpaint_condition(image, image_mask): + image = np.array(image.convert("RGB")).astype(np.float32) / 255.0 + image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0 + + assert image.shape[0:1] == image_mask.shape[0:1], "image and image_mask must have the same image size" + image[image_mask > 0.5] = -1.0 # set as masked pixel + image = np.expand_dims(image, 0).transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + return image + + control_image = make_inpaint_condition(init_image, mask_image) + + output = pipe( + prompt, + image=init_image, + mask_image=mask_image, + control_image=control_image, + guidance_scale=9.0, + eta=1.0, + generator=generator, + num_inference_steps=20, + output_type="np", + ) + image = output.images[0] + + assert image.shape == (512, 512, 3) + + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/boy_ray_ban.npy" + ) + + assert np.abs(expected_image - image).max() < 9e-2 diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index eb1c097dfba0..e355e82e5b35 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -232,6 +232,82 @@ def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) +class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipelineFastTests): + pipeline_class = StableDiffusionInpaintPipeline + params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS + batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS + image_params = frozenset([]) + # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ) + scheduler = PNDMScheduler(skip_prk_steps=True) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + } + return components + + def test_stable_diffusion_inpaint(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionInpaintPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, 
-1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.4925, 0.4967, 0.4100, 0.5234, 0.5322, 0.4532, 0.5805, 0.5877, 0.4151]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @unittest.skip("skipped here because area stays unchanged due to mask") + def test_stable_diffusion_inpaint_lora(self): + ... + + @slow @require_torch_gpu class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase): @@ -403,6 +479,22 @@ def test_stable_diffusion_inpaint_strength_test(self): expected_slice = np.array([0.0021, 0.2350, 0.3712, 0.0575, 0.2485, 0.3451, 0.1857, 0.3156, 0.3943]) assert np.abs(expected_slice - image_slice).max() < 3e-3 + def test_stable_diffusion_simple_inpaint_ddim(self): + pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", safety_checker=None) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs(torch_device) + image = pipe(**inputs).images + + image_slice = image[0, 253:256, 253:256, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.5157, 0.6858, 0.6873, 0.4619, 0.6416, 0.6898, 0.3702, 0.5960, 0.6935]) + + assert np.abs(expected_slice - image_slice).max() < 6e-4 + @nightly @require_torch_gpu From d8ce53a8c441788230a5caed24e23fd55df6c255 Mon Sep 17 00:00:00 2001 From: Emin Demirci Date: Fri, 26 May 2023 12:31:02 +0300 Subject: [PATCH 030/199] Fix loaded_token reference before definition (#3523) --- src/diffusers/loaders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index e50bc31a5c63..cea2abe40c3f 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -682,6 +682,7 @@ def load_textual_inversion( state_dict = torch.load(model_file, map_location="cpu") # 2. 
Load token and embedding correcly from file + loaded_token = None if isinstance(state_dict, torch.Tensor): if token is None: raise ValueError( From ffa33d631a7ceca1e67eb29f9646658dfdb8f3a8 Mon Sep 17 00:00:00 2001 From: vikasmech Date: Fri, 26 May 2023 15:04:11 +0530 Subject: [PATCH 031/199] renamed variable to input_ and output_ (#3507) * renamed variable to input_ and output_ * changed input _ to intputs and output_ to outputs --- src/diffusers/models/resnet.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index cf9e3182d400..3380a4909372 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -52,17 +52,17 @@ def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_chann elif use_conv: self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1) - def forward(self, x): - assert x.shape[1] == self.channels + def forward(self, inputs): + assert inputs.shape[1] == self.channels if self.use_conv_transpose: - return self.conv(x) + return self.conv(inputs) - x = F.interpolate(x, scale_factor=2.0, mode="nearest") + outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest") if self.use_conv: - x = self.conv(x) + outputs = self.conv(outputs) - return x + return outputs class Downsample1D(nn.Module): From 66356e7dd5612ec19808891f99c52799efcd92be Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 26 May 2023 12:02:30 +0200 Subject: [PATCH 032/199] Correct inpainting controlnet docs (#3572) --- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 83ddd51c02f7..57a0e42ccbf8 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -89,14 +89,12 @@ ... "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16 ... ) - >>> # speed up diffusion process with faster scheduler and memory optimization >>> pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - >>> pipe.enable_model_cpu_offload() >>> # generate image >>> image = pipe( - ... "a beautiful man", + ... "a handsome man with ray-ban sunglasses", ... num_inference_steps=20, ... generator=generator, ... 
eta=1.0, From bf16a97018fcb351b552043c89cb0152317ac3f9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 26 May 2023 12:31:51 +0200 Subject: [PATCH 033/199] Fix controlnet guess mode euler (#3571) * Fix guess mode controlnet for euler-like schedulers * make style * Co-authored-by: Chanchana Sornsoontorn * Add co author Co-authored-by: Chanchana Sornsoontorn * 2nd try Co-authored-by: Chanchana Sornsoontorn --- .../controlnet/pipeline_controlnet.py | 7 ++-- .../controlnet/pipeline_controlnet_img2img.py | 7 ++-- .../controlnet/pipeline_controlnet_inpaint.py | 8 +++-- tests/pipelines/controlnet/test_controlnet.py | 34 +++++++++++++++++++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 8a2ffbbff171..632cd546ed0a 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -956,14 +956,15 @@ def __call__( # controlnet(s) inference if guess_mode and do_classifier_free_guidance: # Infer ControlNet only for the conditional batch. - controlnet_latent_model_input = latents + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] else: - controlnet_latent_model_input = latent_model_input + control_model_input = latent_model_input controlnet_prompt_embeds = prompt_embeds down_block_res_samples, mid_block_res_sample = self.controlnet( - controlnet_latent_model_input, + control_model_input, t, encoder_hidden_states=controlnet_prompt_embeds, controlnet_cond=image, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index cb5492790353..72b90f334725 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -1034,14 +1034,15 @@ def __call__( # controlnet(s) inference if guess_mode and do_classifier_free_guidance: # Infer ControlNet only for the conditional batch. - controlnet_latent_model_input = latents + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] else: - controlnet_latent_model_input = latent_model_input + control_model_input = latent_model_input controlnet_prompt_embeds = prompt_embeds down_block_res_samples, mid_block_res_sample = self.controlnet( - controlnet_latent_model_input, + control_model_input, t, encoder_hidden_states=controlnet_prompt_embeds, controlnet_cond=control_image, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 57a0e42ccbf8..f57d88bd8d8a 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -1248,16 +1248,18 @@ def __call__( latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + # controlnet(s) inference if guess_mode and do_classifier_free_guidance: # Infer ControlNet only for the conditional batch. 
- controlnet_latent_model_input = latents + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] else: - controlnet_latent_model_input = latent_model_input + control_model_input = latent_model_input controlnet_prompt_embeds = prompt_embeds down_block_res_samples, mid_block_res_sample = self.controlnet( - controlnet_latent_model_input, + control_model_input, t, encoder_hidden_states=controlnet_prompt_embeds, controlnet_cond=control_image, diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index ee6f8fce2508..b2312a4e94d0 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -26,6 +26,7 @@ AutoencoderKL, ControlNetModel, DDIMScheduler, + EulerDiscreteScheduler, StableDiffusionControlNetPipeline, UNet2DConditionModel, ) @@ -644,6 +645,39 @@ def test_canny_guess_mode(self): expected_slice = np.array([0.2724, 0.2846, 0.2724, 0.3843, 0.3682, 0.2736, 0.4675, 0.3862, 0.2887]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_canny_guess_mode_euler(self): + controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny") + + pipe = StableDiffusionControlNetPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet + ) + pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) + pipe.enable_model_cpu_offload() + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + prompt = "" + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png" + ) + + output = pipe( + prompt, + image, + generator=generator, + output_type="np", + num_inference_steps=3, + guidance_scale=3.0, + guess_mode=True, + ) + + image = output.images[0] + assert image.shape == (768, 512, 3) + + image_slice = image[-3:, -3:, -1] + expected_slice = np.array([0.1655, 0.1721, 0.1623, 0.1685, 0.1711, 0.1646, 0.1651, 0.1631, 0.1494]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + @require_torch_2 def test_stable_diffusion_compile(self): run_test_in_subprocess(test_case=self, target_func=_test_stable_diffusion_compile, inputs=None) From 067a9efd5476b679a9a05def3738e83eeee03eda Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Fri, 26 May 2023 16:47:05 +0530 Subject: [PATCH 034/199] Add initial training script --- examples/consistency_models/requirements.txt | 3 + .../train_consistency_distillation.py | 694 ++++++++++++++++++ 2 files changed, 697 insertions(+) create mode 100644 examples/consistency_models/requirements.txt create mode 100644 examples/consistency_models/train_consistency_distillation.py diff --git a/examples/consistency_models/requirements.txt b/examples/consistency_models/requirements.txt new file mode 100644 index 000000000000..f366720afd11 --- /dev/null +++ b/examples/consistency_models/requirements.txt @@ -0,0 +1,3 @@ +accelerate>=0.16.0 +torchvision +datasets diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py new file mode 100644 index 000000000000..a14d652c208b --- /dev/null +++ b/examples/consistency_models/train_consistency_distillation.py @@ -0,0 +1,694 @@ +import argparse +import inspect +import logging +import 
math +import os +from pathlib import Path +from typing import Optional + +import accelerate +import datasets +import torch +import torch.nn.functional as F +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration +from datasets import load_dataset +from huggingface_hub import HfFolder, Repository, create_repo, whoami +from packaging import version +from torchvision import transforms +from tqdm.auto import tqdm + +import diffusers +from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel +from diffusers.optimization import get_scheduler +from diffusers.training_utils import EMAModel +from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available +from diffusers.utils.import_utils import is_xformers_available + + +#Copied from examples/unconditional_image_generation/train_unconditional.py for now + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. +check_min_version("0.17.0.dev0") + +logger = get_logger(__name__, log_level="INFO") + + +def _extract_into_tensor(arr, timesteps, broadcast_shape): + """ + Extract values from a 1-D numpy array for a batch of indices. + + :param arr: the 1-D numpy array. + :param timesteps: a tensor of indices into the array to extract. + :param broadcast_shape: a larger shape of K dimensions with the batch + dimension equal to the length of timesteps. + :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. + """ + if not isinstance(arr, torch.Tensor): + arr = torch.from_numpy(arr) + res = arr[timesteps].float().to(timesteps.device) + while len(res.shape) < len(broadcast_shape): + res = res[..., None] + return res.expand(broadcast_shape) + + +def parse_args(): + parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--dataset_name", + type=str, + default=None, + help=( + "The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private," + " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem," + " or to a folder containing files that HF Datasets can understand." + ), + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The config of the Dataset, leave as None if there's only one config.", + ) + parser.add_argument( + "--model_config_name_or_path", + type=str, + default=None, + help="The config of the UNet model to train, leave as None to use standard DDPM configuration.", + ) + parser.add_argument( + "--train_data_dir", + type=str, + default=None, + help=( + "A folder containing the training data. Folder contents must follow the structure described in" + " https://huggingface.co/docs/datasets/image_dataset#imagefolder. In particular, a `metadata.jsonl` file" + " must exist to provide the captions for the images. Ignored if `dataset_name` is specified." 
+ ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="ddpm-model-64", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument("--overwrite_output_dir", action="store_true") + parser.add_argument( + "--cache_dir", + type=str, + default=None, + help="The directory where the downloaded models and datasets will be stored.", + ) + parser.add_argument( + "--resolution", + type=int, + default=64, + help=( + "The resolution for input images, all the images in the train/validation dataset will be resized to this" + " resolution" + ), + ) + parser.add_argument( + "--center_crop", + default=False, + action="store_true", + help=( + "Whether to center crop the input images to the resolution. If not set, the images will be randomly" + " cropped. The images will be resized to the resolution first before cropping." + ), + ) + parser.add_argument( + "--random_flip", + default=False, + action="store_true", + help="whether to randomly flip images horizontally", + ) + parser.add_argument( + "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument( + "--eval_batch_size", type=int, default=16, help="The number of images to generate for evaluation." + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "The number of subprocesses to use for data loading. 0 means that the data will be loaded in the main" + " process." + ), + ) + parser.add_argument("--num_epochs", type=int, default=100) + parser.add_argument("--save_images_epochs", type=int, default=10, help="How often to save images during training.") + parser.add_argument( + "--save_model_epochs", type=int, default=10, help="How often to save the model during training." + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="cosine", + help=( + 'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument("--adam_beta1", type=float, default=0.95, help="The beta1 parameter for the Adam optimizer.") + parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.") + parser.add_argument( + "--adam_weight_decay", type=float, default=1e-6, help="Weight decay magnitude for the Adam optimizer." 
+ ) + parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.") + parser.add_argument( + "--use_ema", + action="store_true", + help="Whether to use Exponential Moving Average for the final model weights.", + ) + parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.") + parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.") + parser.add_argument("--ema_max_decay", type=float, default=0.9999, help="The maximum decay magnitude for EMA.") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--hub_private_repo", action="store_true", help="Whether or not to create a private repository." + ) + parser.add_argument( + "--logger", + type=str, + default="tensorboard", + choices=["tensorboard", "wandb"], + help=( + "Whether to use [tensorboard](https://www.tensorflow.org/tensorboard) or [wandb](https://www.wandb.ai)" + " for experiment tracking and logging of model metrics and model checkpoints" + ), + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help=( + "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to" + " *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***." + ), + ) + parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") + parser.add_argument( + "--mixed_precision", + type=str, + default="no", + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose" + "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10." + "and an Nvidia Ampere GPU." + ), + ) + parser.add_argument( + "--prediction_type", + type=str, + default="epsilon", + choices=["epsilon", "sample"], + help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.", + ) + parser.add_argument("--ddpm_num_steps", type=int, default=1000) + parser.add_argument("--ddpm_num_inference_steps", type=int, default=1000) + parser.add_argument("--ddpm_beta_schedule", type=str, default="linear") + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) + parser.add_argument( + "--resume_from_checkpoint", + type=str, + default=None, + help=( + "Whether training should be resumed from a previous checkpoint. Use a path saved by" + ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' + ), + ) + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
+ ) + + args = parser.parse_args() + env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if env_local_rank != -1 and env_local_rank != args.local_rank: + args.local_rank = env_local_rank + + if args.dataset_name is None and args.train_data_dir is None: + raise ValueError("You must specify either a dataset name from the hub or a train data directory.") + + return args + + +def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None): + if token is None: + token = HfFolder.get_token() + if organization is None: + username = whoami(token)["name"] + return f"{username}/{model_id}" + else: + return f"{organization}/{model_id}" + + +def main(args): + logging_dir = os.path.join(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.logger, + logging_dir=logging_dir, + project_config=accelerator_project_config, + ) + + if args.logger == "tensorboard": + if not is_tensorboard_available(): + raise ImportError("Make sure to install tensorboard if you want to use it for logging during training.") + + elif args.logger == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + import wandb + + # `accelerate` 0.16.0 will have better support for customized saving + if version.parse(accelerate.__version__) >= version.parse("0.16.0"): + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + if args.use_ema: + ema_model.save_pretrained(os.path.join(output_dir, "unet_ema")) + + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) + + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + def load_model_hook(models, input_dir): + if args.use_ema: + load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel) + ema_model.load_state_dict(load_model.state_dict()) + ema_model.to(accelerator.device) + del load_model + + for i in range(len(models)): + # pop models so that they are not loaded again + model = models.pop() + + # load diffusers style into model + load_model = UNet2DModel.from_pretrained(input_dir, subfolder="unet") + model.register_to_config(**load_model.config) + + model.load_state_dict(load_model.state_dict()) + del load_model + + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + + # Make one log on every process with the configuration for debugging. 
+ logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + datasets.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + datasets.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # Handle the repository creation + if accelerator.is_main_process: + if args.push_to_hub: + if args.hub_model_id is None: + repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) + else: + repo_name = args.hub_model_id + create_repo(repo_name, exist_ok=True, token=args.hub_token) + repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) + + with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: + if "step_*" not in gitignore: + gitignore.write("step_*\n") + if "epoch_*" not in gitignore: + gitignore.write("epoch_*\n") + elif args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + # Initialize the model + if args.model_config_name_or_path is None: + model = UNet2DModel( + sample_size=args.resolution, + in_channels=3, + out_channels=3, + layers_per_block=2, + block_out_channels=(128, 128, 256, 256, 512, 512), + down_block_types=( + "DownBlock2D", + "DownBlock2D", + "DownBlock2D", + "DownBlock2D", + "AttnDownBlock2D", + "DownBlock2D", + ), + up_block_types=( + "UpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + ), + ) + else: + config = UNet2DModel.load_config(args.model_config_name_or_path) + model = UNet2DModel.from_config(config) + + # Create EMA for the model. + if args.use_ema: + ema_model = EMAModel( + model.parameters(), + decay=args.ema_max_decay, + use_ema_warmup=True, + inv_gamma=args.ema_inv_gamma, + power=args.ema_power, + model_cls=UNet2DModel, + model_config=model.config, + ) + + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warn( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) + model.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + + # Initialize the scheduler + accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys()) + if accepts_prediction_type: + noise_scheduler = DDPMScheduler( + num_train_timesteps=args.ddpm_num_steps, + beta_schedule=args.ddpm_beta_schedule, + prediction_type=args.prediction_type, + ) + else: + noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule) + + # Initialize the optimizer + optimizer = torch.optim.AdamW( + model.parameters(), + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + ) + + # Get the datasets: you can either provide your own training and evaluation files (see below) + # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). 
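+    # For --train_data_dir, a plain `imagefolder` layout is enough; the file names below are only an example:
+    #   my_images/
+    #       0001.png
+    #       0002.png
+    #       ...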
+ + # In distributed training, the load_dataset function guarantees that only one local process can concurrently + # download the dataset. + if args.dataset_name is not None: + dataset = load_dataset( + args.dataset_name, + args.dataset_config_name, + cache_dir=args.cache_dir, + split="train", + ) + else: + dataset = load_dataset("imagefolder", data_dir=args.train_data_dir, cache_dir=args.cache_dir, split="train") + # See more about loading custom images at + # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder + + # Preprocessing the datasets and DataLoaders creation. + augmentations = transforms.Compose( + [ + transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), + transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), + transforms.ToTensor(), + transforms.Normalize([0.5], [0.5]), + ] + ) + + def transform_images(examples): + images = [augmentations(image.convert("RGB")) for image in examples["image"]] + return {"input": images} + + logger.info(f"Dataset size: {len(dataset)}") + + dataset.set_transform(transform_images) + train_dataloader = torch.utils.data.DataLoader( + dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers + ) + + # Initialize the learning rate scheduler + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, + num_training_steps=(len(train_dataloader) * args.num_epochs), + ) + + # Prepare everything with our `accelerator`. + model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler + ) + + if args.use_ema: + ema_model.to(accelerator.device) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + run = os.path.split(__file__)[-1].split(".")[0] + accelerator.init_trackers(run) + + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + max_train_steps = args.num_epochs * num_update_steps_per_epoch + + logger.info("***** Running training *****") + logger.info(f" Num examples = {len(dataset)}") + logger.info(f" Num Epochs = {args.num_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {max_train_steps}") + + global_step = 0 + first_epoch = 0 + + # Potentially load in the weights and states from a previous save + if args.resume_from_checkpoint: + if args.resume_from_checkpoint != "latest": + path = os.path.basename(args.resume_from_checkpoint) + else: + # Get the most recent checkpoint + dirs = os.listdir(args.output_dir) + dirs = [d for d in dirs if d.startswith("checkpoint")] + dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) + path = dirs[-1] if len(dirs) > 0 else None + + if path is None: + accelerator.print( + f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run." 
+ ) + args.resume_from_checkpoint = None + else: + accelerator.print(f"Resuming from checkpoint {path}") + accelerator.load_state(os.path.join(args.output_dir, path)) + global_step = int(path.split("-")[1]) + + resume_global_step = global_step * args.gradient_accumulation_steps + first_epoch = global_step // num_update_steps_per_epoch + resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps) + + # Train! + for epoch in range(first_epoch, args.num_epochs): + model.train() + progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process) + progress_bar.set_description(f"Epoch {epoch}") + for step, batch in enumerate(train_dataloader): + # Skip steps until we reach the resumed step + if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: + if step % args.gradient_accumulation_steps == 0: + progress_bar.update(1) + continue + + clean_images = batch["input"] + # Sample noise that we'll add to the images + noise = torch.randn(clean_images.shape).to(clean_images.device) + bsz = clean_images.shape[0] + # Sample a random timestep for each image + timesteps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=clean_images.device + ).long() + + # Add noise to the clean images according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps) + + with accelerator.accumulate(model): + # Predict the noise residual + model_output = model(noisy_images, timesteps).sample + + if args.prediction_type == "epsilon": + loss = F.mse_loss(model_output, noise) # this could have different weights! + elif args.prediction_type == "sample": + alpha_t = _extract_into_tensor( + noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1) + ) + snr_weights = alpha_t / (1 - alpha_t) + loss = snr_weights * F.mse_loss( + model_output, clean_images, reduction="none" + ) # use SNR weighting from distillation paper + loss = loss.mean() + else: + raise ValueError(f"Unsupported prediction type: {args.prediction_type}") + + accelerator.backward(loss) + + if accelerator.sync_gradients: + accelerator.clip_grad_norm_(model.parameters(), 1.0) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + if args.use_ema: + ema_model.step(model.parameters()) + progress_bar.update(1) + global_step += 1 + + if global_step % args.checkpointing_steps == 0: + if accelerator.is_main_process: + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + accelerator.save_state(save_path) + logger.info(f"Saved state to {save_path}") + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step} + if args.use_ema: + logs["ema_decay"] = ema_model.cur_decay_value + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + progress_bar.close() + + accelerator.wait_for_everyone() + + # Generate sample images for visual inspection + if accelerator.is_main_process: + if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1: + unet = accelerator.unwrap_model(model) + + if args.use_ema: + ema_model.store(unet.parameters()) + ema_model.copy_to(unet.parameters()) + + pipeline = DDPMPipeline( + unet=unet, + scheduler=noise_scheduler, + ) + + generator = torch.Generator(device=pipeline.device).manual_seed(0) 
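+                # a fixed seed keeps the evaluation samples comparable from one epoch to the next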
+ # run pipeline in inference (sample random noise and denoise) + images = pipeline( + generator=generator, + batch_size=args.eval_batch_size, + num_inference_steps=args.ddpm_num_inference_steps, + output_type="numpy", + ).images + + if args.use_ema: + ema_model.restore(unet.parameters()) + + # denormalize the images and save to tensorboard + images_processed = (images * 255).round().astype("uint8") + + if args.logger == "tensorboard": + if is_accelerate_version(">=", "0.17.0.dev0"): + tracker = accelerator.get_tracker("tensorboard", unwrap=True) + else: + tracker = accelerator.get_tracker("tensorboard") + tracker.add_images("test_samples", images_processed.transpose(0, 3, 1, 2), epoch) + elif args.logger == "wandb": + # Upcoming `log_images` helper coming in https://github.com/huggingface/accelerate/pull/962/files + accelerator.get_tracker("wandb").log( + {"test_samples": [wandb.Image(img) for img in images_processed], "epoch": epoch}, + step=global_step, + ) + + if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1: + # save the model + unet = accelerator.unwrap_model(model) + + if args.use_ema: + ema_model.store(unet.parameters()) + ema_model.copy_to(unet.parameters()) + + pipeline = DDPMPipeline( + unet=unet, + scheduler=noise_scheduler, + ) + + pipeline.save_pretrained(args.output_dir) + + if args.use_ema: + ema_model.restore(unet.parameters()) + + if args.push_to_hub: + repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False) + + accelerator.end_training() + + +if __name__ == "__main__": + args = parse_args() + main(args) From 7948db81c58cc8ce3c6070088389b28ff487b02a Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 26 May 2023 04:41:42 -0700 Subject: [PATCH 035/199] [docs] Add AttnProcessor to docs (#3474) * add attnprocessor to docs * fix path to class * create separate page for attnprocessors * fix path * fix path for real * fill in docstrings * apply feedback * apply feedback --- docs/source/en/_toctree.yml | 2 + docs/source/en/api/attnprocessor.mdx | 39 ++++++ src/diffusers/models/attention_processor.py | 129 ++++++++++++++++++++ 3 files changed, 170 insertions(+) create mode 100644 docs/source/en/api/attnprocessor.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 368ea30a2690..704fb4d5290d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -132,6 +132,8 @@ - sections: - local: api/models title: Models + - local: api/attnprocessor + title: Attention Processor - local: api/diffusion_pipeline title: Diffusion Pipeline - local: api/logging diff --git a/docs/source/en/api/attnprocessor.mdx b/docs/source/en/api/attnprocessor.mdx new file mode 100644 index 000000000000..ead639feffe0 --- /dev/null +++ b/docs/source/en/api/attnprocessor.mdx @@ -0,0 +1,39 @@ +# Attention Processor + +An attention processor is a class for applying different types of attention mechanisms. 
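+
+For example, the processor used by a model's attention blocks can be swapped at runtime through
+`set_attn_processor`. The checkpoint below is only used for illustration; any model that exposes
+attention processors works the same way:
+
+    from diffusers import UNet2DConditionModel
+    from diffusers.models.attention_processor import AttnProcessor2_0
+
+    unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+    # replace every attention processor in the UNet with the PyTorch 2.0 scaled dot-product one
+    unet.set_attn_processor(AttnProcessor2_0())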
+ +## AttnProcessor +[[autodoc]] models.attention_processor.AttnProcessor + +## AttnProcessor2_0 +[[autodoc]] models.attention_processor.AttnProcessor2_0 + +## LoRAAttnProcessor +[[autodoc]] models.attention_processor.LoRAAttnProcessor + +## CustomDiffusionAttnProcessor +[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor + +## AttnAddedKVProcessor +[[autodoc]] models.attention_processor.AttnAddedKVProcessor + +## AttnAddedKVProcessor2_0 +[[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0 + +## LoRAAttnAddedKVProcessor +[[autodoc]] models.attention_processor.LoRAAttnAddedKVProcessor + +## XFormersAttnProcessor +[[autodoc]] models.attention_processor.XFormersAttnProcessor + +## LoRAXFormersAttnProcessor +[[autodoc]] models.attention_processor.LoRAXFormersAttnProcessor + +## CustomDiffusionXFormersAttnProcessor +[[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor + +## SlicedAttnProcessor +[[autodoc]] models.attention_processor.SlicedAttnProcessor + +## SlicedAttnAddedKVProcessor +[[autodoc]] models.attention_processor.SlicedAttnAddedKVProcessor \ No newline at end of file diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index acdee10c7674..4b65d164bda1 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -431,6 +431,10 @@ def norm_encoder_hidden_states(self, encoder_hidden_states): class AttnProcessor: + r""" + Default processor for performing attention-related computations. + """ + def __call__( self, attn: Attention, @@ -516,6 +520,18 @@ def forward(self, hidden_states): class LoRAAttnProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + """ + def __init__(self, hidden_size, cross_attention_dim=None, rank=4): super().__init__() @@ -580,6 +596,24 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a class CustomDiffusionAttnProcessor(nn.Module): + r""" + Processor for implementing attention for the Custom Diffusion method. + + Args: + train_kv (`bool`, defaults to `True`): + Whether to newly train the key and value matrices corresponding to the text features. + train_q_out (`bool`, defaults to `True`): + Whether to newly train query matrices corresponding to the latent image features. + hidden_size (`int`, *optional*, defaults to `None`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + out_bias (`bool`, defaults to `True`): + Whether to include the bias parameter in `train_q_out`. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + """ + def __init__( self, train_kv=True, @@ -658,6 +692,11 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a class AttnAddedKVProcessor: + r""" + Processor for performing attention-related computations with extra learnable key and value matrices for the text + encoder. 
+ """ + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None): residual = hidden_states hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) @@ -707,6 +746,11 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a class AttnAddedKVProcessor2_0: + r""" + Processor for performing scaled dot-product attention (enabled by default if you're using PyTorch 2.0), with extra + learnable key and value matrices for the text encoder. + """ + def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): raise ImportError( @@ -765,6 +809,19 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a class LoRAAttnAddedKVProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text + encoder. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + """ + def __init__(self, hidden_size, cross_attention_dim=None, rank=4): super().__init__() @@ -832,6 +889,17 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a class XFormersAttnProcessor: + r""" + Processor for implementing memory efficient attention using xFormers. + + Args: + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. + """ + def __init__(self, attention_op: Optional[Callable] = None): self.attention_op = attention_op @@ -905,6 +973,10 @@ def __call__( class AttnProcessor2_0: + r""" + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). + """ + def __init__(self): if not hasattr(F, "scaled_dot_product_attention"): raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") @@ -983,6 +1055,23 @@ def __call__( class LoRAXFormersAttnProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. + """ + def __init__(self, hidden_size, cross_attention_dim, rank=4, attention_op: Optional[Callable] = None): super().__init__() @@ -1049,6 +1138,28 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a class CustomDiffusionXFormersAttnProcessor(nn.Module): + r""" + Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method. 
+ + Args: + train_kv (`bool`, defaults to `True`): + Whether to newly train the key and value matrices corresponding to the text features. + train_q_out (`bool`, defaults to `True`): + Whether to newly train query matrices corresponding to the latent image features. + hidden_size (`int`, *optional*, defaults to `None`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + out_bias (`bool`, defaults to `True`): + Whether to include the bias parameter in `train_q_out`. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to use + as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best operator. + """ + def __init__( self, train_kv=True, @@ -1134,6 +1245,15 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a class SlicedAttnProcessor: + r""" + Processor for implementing sliced attention. + + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. + """ + def __init__(self, slice_size): self.slice_size = slice_size @@ -1206,6 +1326,15 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a class SlicedAttnAddedKVProcessor: + r""" + Processor for implementing sliced attention with extra learnable key and value matrices for the text encoder. + + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. + """ + def __init__(self, slice_size): self.slice_size = slice_size From 352ca3198cb25e6098f795568547075ff28e3133 Mon Sep 17 00:00:00 2001 From: dg845 <58458699+dg845@users.noreply.github.com> Date: Fri, 26 May 2023 04:57:30 -0700 Subject: [PATCH 036/199] [WIP] Add UniDiffuser model and pipeline (#2963) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix a bug of pano when not doing CFG (#3030) * Fix a bug of pano when not doing CFG * enhance code quality * apply formatting. --------- Co-authored-by: Sayak Paul * Text2video zero refinements (#3070) * fix progress bar issue in pipeline_text_to_video_zero.py. Copy scheduler after first backward * fix tensor loading in test_text_to_video_zero.py * make style && make quality * Release: v0.15.0 * [Tests] Speed up panorama tests (#3067) * fix: norm group test for UNet3D. * chore: speed up the panorama tests (fast). * set default value of _test_inference_batch_single_identical. * fix: batch_sizes default value. * [Post release] v0.16.0dev (#3072) * Adds profiling flags, computes train metrics average. (#3053) * WIP controlnet training - bugfix --streaming - bugfix running report_to!='wandb' - adds memory profile before validation * Adds final logging statement. * Sets train epochs to 11. Looking at a longer ~16ep run, we see only good validation images after ~11ep: https://wandb.ai/andsteing/controlnet_fill50k/runs/3j2hx6n8 * Removes --logging_dir (it's not used). * Adds --profile flags. * Updates --output_dir=runs/fill-circle-{timestamp}. * Compute mean of `train_metrics`. 
Previously `train_metrics[-1]` was logged, resulting in very bumpy train metrics. * Improves logging a bit. - adds l2_grads gradient norm logging - adds steps_per_sec - sets walltime as x coordinate of train/step - logs controlnet_params config * Adds --ccache (doesn't really help though). * minor fix in controlnet flax example (#2986) * fix the error when push_to_hub but not log validation * contronet_from_pt & controlnet_revision * add intermediate checkpointing to the guide * Bugfix --profile_steps * Sets `RACKER_PROJECT_NAME='controlnet_fill50k'`. * Logs fractional epoch. * Adds relative `walltime` metric. * Adds `StepTraceAnnotation` and uses `global_step` insetad of `step`. * Applied `black`. * Streamlines commands in README a bit. * Removes `--ccache`. This makes only a very small difference (~1 min) with this model size, so removing the option introduced in cdb3cc. * Re-ran `black`. * Update examples/controlnet/README.md Co-authored-by: Sayak Paul * Converts spaces to tab. * Removes repeated args. * Skips first step (compilation) in profiling * Updates README with profiling instructions. * Unifies tabs/spaces in README. * Re-ran style & quality. --------- Co-authored-by: Sayak Paul * [Pipelines] Make sure that None functions are correctly not saved (#3080) * doc string example remove from_pt (#3083) * [Tests] parallelize (#3078) * [Tests] parallelize * finish folder structuring * Parallelize tests more * Correct saving of pipelines * make sure logging level is correct * try again * Apply suggestions from code review Co-authored-by: Pedro Cuenca --------- Co-authored-by: Pedro Cuenca * Throw deprecation warning for return_cached_folder (#3092) Throw deprecation warning * Allow SD attend and excite pipeline to work with any size output images (#2835) Allow stable diffusion attend and excite pipeline to work with any size output image. Re: #2476, #2603 * [docs] Update community pipeline docs (#2989) * update community pipeline docs * fix formatting * explain sharing workflows * Add to support Guess Mode for StableDiffusionControlnetPipleline (#2998) * add guess mode (WIP) * fix uncond/cond order * support guidance_scale=1.0 and batch != 1 * remove magic coeff * add docstring * add intergration test * add document to controlnet.mdx * made the comments a bit more explanatory * fix table * fix default value for attend-and-excite (#3099) * fix default * remvoe one line as requested by gc team (#3077) remvoe one line * ddpm custom timesteps (#3007) add custom timesteps test add custom timesteps descending order check docs timesteps -> custom_timesteps can only pass one of num_inference_steps and timesteps * Fix breaking change in `pipeline_stable_diffusion_controlnet.py` (#3118) fix breaking change * Add global pooling to controlnet (#3121) * [Bug fix] Fix img2img processor with safety checker (#3127) Fix img2img processor with safety checker * [Bug fix] Make sure correct timesteps are chosen for img2img (#3128) Make sure correct timesteps are chosen for img2img * Improve deprecation warnings (#3131) * Fix config deprecation (#3129) * Better deprecation message * Better deprecation message * Better doc string * Fixes * fix more * fix more * Improve __getattr__ * correct more * fix more * fix * Improve more * more improvements * fix more * Apply suggestions from code review Co-authored-by: Pedro Cuenca * make style * Fix all rest & add tests & remove old deprecation fns --------- Co-authored-by: Pedro Cuenca * feat: verfication of multi-gpu support for select examples. 
(#3126) * feat: verfication of multi-gpu support for select examples. * add: multi-gpu training sections to the relvant doc pages. * speed up attend-and-excite fast tests (#3079) * Optimize log_validation in train_controlnet_flax (#3110) extract pipeline from log_validation * make style * Correct textual inversion readme (#3145) * Update README.md * Apply suggestions from code review * Add unet act fn to other model components (#3136) Adding act fn config to the unet timestep class embedding and conv activation. The custom activation defaults to silu which is the default activation function for both the conv act and the timestep class embeddings so default behavior is not changed. The only unet which use the custom activation is the stable diffusion latent upscaler https://huggingface.co/stabilityai/sd-x2-latent-upscaler/blob/main/unet/config.json (I ran a script against the hub to confirm). The latent upscaler does not use the conv activation nor the timestep class embeddings so we don't change its behavior. * class labels timestep embeddings projection dtype cast (#3137) This mimics the dtype cast for the standard time embeddings * [ckpt loader] Allow loading the Inpaint and Img2Img pipelines, while loading a ckpt model (#2705) * [ckpt loader] Allow loading the Inpaint and Img2Img pipelines, while loading a ckpt model * Address review comment from PR * PyLint formatting * Some more pylint fixes, unrelated to our change * Another pylint fix * Styling fix * add from_ckpt method as Mixin (#2318) * add mixin class for pipeline from original sd ckpt * Improve * make style * merge main into * Improve more * fix more * up * Apply suggestions from code review * finish docs * rename * make style --------- Co-authored-by: Patrick von Platen * Add TensorRT SD/txt2img Community Pipeline to diffusers along with TensorRT utils (#2974) * Add SD/txt2img Community Pipeline to diffusers along with TensorRT utils Signed-off-by: Asfiya Baig * update installation command Signed-off-by: Asfiya Baig * update tensorrt installation Signed-off-by: Asfiya Baig * changes 1. Update setting of cache directory 2. Address comments: merge utils and pipeline code. 3. 
Address comments: Add section in README Signed-off-by: Asfiya Baig * apply make style Signed-off-by: Asfiya Baig --------- Signed-off-by: Asfiya Baig Co-authored-by: Patrick von Platen * Correct `Transformer2DModel.forward` docstring (#3074) ⚙️chore(transformer_2d) update function signature for encoder_hidden_states * Update pipeline_stable_diffusion_inpaint_legacy.py (#2903) * Update pipeline_stable_diffusion_inpaint_legacy.py * fix preprocessing of Pil images with adequate batch size * revert map * add tests * reformat * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * next try to fix the style * wth is this * Update testing_utils.py * Update testing_utils.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py * Update test_stable_diffusion_inpaint_legacy.py --------- Co-authored-by: Patrick von Platen * Modified altdiffusion pipline to support altdiffusion-m18 (#2993) * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 * Modified altdiffusion pipline to support altdiffusion-m18 --------- Co-authored-by: root * controlnet training resize inputs to multiple of 8 (#3135) controlnet training center crop input images to multiple of 8 The pipeline code resizes inputs to multiples of 8. Not doing this resizing in the training script is causing the encoded image to have different height/width dimensions than the encoded conditioning image (which uses a separate encoder that's part of the controlnet model). We resize and center crop the inputs to make sure they're the same size (as well as all other images in the batch). We also check that the initial resolution is a multiple of 8. * adding custom diffusion training to diffusers examples (#3031) * diffusers==0.14.0 update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion update * custom diffusion * custom diffusion * custom diffusion * custom diffusion * custom diffusion * apply formatting and get rid of bare except. * refactor readme and other minor changes. * misc refactor. * fix: repo_id issue and loaders logging bug. * fix: save_model_card. * fix: save_model_card. * fix: save_model_card. * add: doc entry. * refactor doc,. * custom diffusion * custom diffusion * custom diffusion * apply style. * remove tralining whitespace. * fix: toctree entry. * remove unnecessary print. * custom diffusion * custom diffusion * custom diffusion test * custom diffusion xformer update * custom diffusion xformer update * custom diffusion xformer update --------- Co-authored-by: Nupur Kumari Co-authored-by: Sayak Paul Co-authored-by: Patrick von Platen Co-authored-by: Nupur Kumari * make style * Update custom_diffusion.mdx (#3165) Add missing newlines for rendering the links correctly * Added distillation for quantization example on textual inversion. (#2760) * Added distillation for quantization example on textual inversion. 
Signed-off-by: Ye, Xinyu * refined readme and code style. Signed-off-by: Ye, Xinyu * Update text2images.py * refined code of model load and added compatibility check. Signed-off-by: Ye, Xinyu * fixed code style. Signed-off-by: Ye, Xinyu * fix C403 [*] Unnecessary `list` comprehension (rewrite as a `set` comprehension) Signed-off-by: Ye, Xinyu --------- Signed-off-by: Ye, Xinyu * Update Noise Autocorrelation Loss Function for Pix2PixZero Pipeline (#2942) * Update Pix2PixZero Auto-correlation Loss * Add fast inversion tests * Clarify purpose and mark as deprecated Fix inversion prompt broadcasting * Register modules set to `None` in config for `test_save_load_optional_components` * Update new tests to coordinate with #2953 * [DreamBooth] add text encoder LoRA support in the DreamBooth training script (#3130) * add: LoRA text encoder support for DreamBooth example. * fix initialization. * fix: modification call. * add: entry in the readme. * use dog dataset from hub. * fix: params to clip. * add entry to the LoRA doc. * add: tests for lora. * remove unnecessary list comprehension./ * Update Habana Gaudi documentation (#3169) * Update Habana Gaudi doc * Fix tables * Add model offload to x4 upscaler (#3187) * Add model offload to x4 upscaler * fix * [docs] Deterministic algorithms (#3172) deterministic algos * Update custom_diffusion.mdx to credit the author (#3163) * Update custom_diffusion.mdx * fix: unnecessary list comprehension. * Fix TensorRT community pipeline device set function (#3157) pass silence_dtype_warnings as kwarg Signed-off-by: Asfiya Baig Co-authored-by: Patrick von Platen * make `from_flax` work for controlnet (#3161) fix from_flax Co-authored-by: Patrick von Platen * [docs] Clarify training args (#3146) * clarify training arg * apply feedback * Multi Vector Textual Inversion (#3144) * Multi Vector * Improve * fix multi token * improve test * make style * Update examples/test_examples.py * Apply suggestions from code review Co-authored-by: Suraj Patil * update * Finish * Apply suggestions from code review --------- Co-authored-by: Suraj Patil * Add `Karras sigmas` to HeunDiscreteScheduler (#3160) * Add karras pattern to discrete heun scheduler * Add integration test * Fix failing CI on pytorch test on M1 (mps) --------- Co-authored-by: Patrick von Platen * [AudioLDM] Fix dtype of returned waveform (#3189) * Fix bug in train_dreambooth_lora (#3183) * Update train_dreambooth_lora.py fix bug * Update train_dreambooth_lora.py * [Community Pipelines] Update lpw_stable_diffusion pipeline (#3197) * Update lpw_stable_diffusion.py * fix cpu offload * Make sure VAE attention works with Torch 2_0 (#3200) * Make sure attention works with Torch 2_0 * make style * Fix more * Revert "[Community Pipelines] Update lpw_stable_diffusion pipeline" (#3201) Revert "[Community Pipelines] Update lpw_stable_diffusion pipeline (#3197)" This reverts commit 9965cb50eac12e397473f01535aab43aae76b4ab. * [Bug fix] Fix batch size attention head size mismatch (#3214) * fix mixed precision training on train_dreambooth_inpaint_lora (#3138) cast to weight dtype * adding enable_vae_tiling and disable_vae_tiling functions (#3225) adding enable_vae_tiling and disable_val_tiling functions * Add ControlNet v1.1 docs (#3226) Add v1.1 docs * Fix issue in maybe_convert_prompt (#3188) When the token used for textual inversion does not have any special symbols (e.g. it is not surrounded by <>), the tokenizer does not properly split the replacement tokens. Adding a space for the padding tokens fixes this. 
* Sync cache version check from transformers (#3179) sync cache version check from transformers * Fix docs text inversion (#3166) * Fix docs text inversion * Apply suggestions from code review * add model (#3230) * add * clean * up * clean up more * fix more tests * Improve docs further * improve * more fixes docs * Improve docs more * Update src/diffusers/models/unet_2d_condition.py * fix * up * update doc links * make fix-copies * add safety checker and watermarker to stage 3 doc page code snippets * speed optimizations docs * memory optimization docs * make style * add watermarking snippets to doc string examples * make style * use pt_to_pil helper functions in doc strings * skip mps tests * Improve safety * make style * new logic * fix * fix bad onnx design * make new stable diffusion upscale pipeline model arguments optional * define has_nsfw_concept when non-pil output type * lowercase linked to notebook name --------- Co-authored-by: William Berman * Allow return pt x4 (#3236) * Add all files * update * Allow fp16 attn for x4 upscaler (#3239) * Add all files * update * Make sure vae is memory efficient for PT 1 * make style * fix fast test (#3241) * Adds a document on token merging (#3208) * add document on token merging. * fix headline. * fix: headline. * add some samples for comparison. * [AudioLDM] Update docs to use updated ckpt (#3240) * [AudioLDM] Update docs to use updated ckpt * make style * Release: v0.16.0 * Post release for 0.16.0 (#3244) * Post release * fix more * [docs] only mention one stage (#3246) * [docs] only mention one stage * add blurb on auto accepting --------- Co-authored-by: William Berman * Write model card in controlnet training script (#3229) Write model card in controlnet training script. * [2064]: Add stochastic sampler (sample_dpmpp_sde) (#3020) * [2064]: Add stochastic sampler * [2064]: Add stochastic sampler * [2064]: Add stochastic sampler * [2064]: Add stochastic sampler * [2064]: Add stochastic sampler * [2064]: Add stochastic sampler * [2064]: Add stochastic sampler * Review comments * [Review comment]: Add is_torchsde_available() * [Review comment]: Test and docs * [Review comment] * [Review comment] * [Review comment] * [Review comment] * [Review comment] --------- Co-authored-by: njindal * [Stochastic Sampler][Slow Test]: Cuda test fixes (#3257) [Slow Test]: Cuda test fixes Co-authored-by: njindal * Remove required from tracker_project_name (#3260) Remove required from tracker_project_name. As observed by https://github.com/off99555 in https://github.com/huggingface/diffusers/issues/2695#issuecomment-1470755050, it already has a default value. * adding required parameters while calling the get_up_block and get_down_block (#3210) * removed unnecessary parameters from get_up_block and get_down_block functions * adding resnet_skip_time_act, resnet_out_scale_factor and cross_attention_norm to get_up_block and get_down_block functions --------- Co-authored-by: Sayak Paul * [docs] Update interface in repaint.mdx (#3119) Update repaint.mdx accomodate to #1701 * Update IF name to XL (#3262) Co-authored-by: multimodalart * fix typo in score sde pipeline (#3132) * Fix typo in textual inversion JAX training script (#3123) The pipeline is built as `pipe` but then used as `pipeline`. 
* AudioDiffusionPipeline - fix encode method after config changes (#3114) * config fixes * deprecate get_input_dims * Revert "Revert "[Community Pipelines] Update lpw_stable_diffusion pipeline"" (#3265) Revert "Revert "[Community Pipelines] Update lpw_stable_diffusion pipeline" (#3201)" This reverts commit 91a2a80eb2f98a9f64b9e287715add244dc6f2f3. * Fix community pipelines (#3266) * update notebook (#3259) Co-authored-by: yiyixuxu * [docs] add notes for stateful model changes (#3252) * [docs] add notes for stateful model changes * Update docs/source/en/optimization/fp16.mdx Co-authored-by: Pedro Cuenca * link to accelerate docs for discarding hooks --------- Co-authored-by: Pedro Cuenca * [LoRA] quality of life improvements in the loading semantics and docs (#3180) * 👽 qol improvements for LoRA. * better function name? * fix: LoRA weight loading with the new format. * address Patrick's comments. * Apply suggestions from code review Co-authored-by: Patrick von Platen * change wording around encouraging the use of load_lora_weights(). * fix: function name. --------- Co-authored-by: Patrick von Platen * [Community Pipelines] EDICT pipeline implementation (#3153) * EDICT pipeline initial commit - Starting point taking from https://github.com/Joqsan/edict-diffusion * refactor __init__() method * minor refactoring * refactor scheduler code - remove scheduler and move its methods to the EDICTPipeline class * make CFG optional - refactor encode_prompt(). - include optional generator for sampling with vae. - minor variable renaming * add EDICT pipeline description to README.md * replace preprocess() with VaeImageProcessor * run make style and make quality commands --------- Co-authored-by: Patrick von Platen * [Docs]zh translated docs update (#3245) * zh translated docs update * update _toctree * Update logging.mdx (#2863) Fix typos * Add multiple conditions to StableDiffusionControlNetInpaintPipeline (#3125) * try multi controlnet inpaint * multi controlnet inpaint * multi controlnet inpaint * Let's make sure that dreambooth always uploads to the Hub (#3272) * Update Dreambooth README * Adapt all docs as well * automatically write model card * fix * make style * Diffedit Zero-Shot Inpainting Pipeline (#2837) * Update Pix2PixZero Auto-correlation Loss * Add Stable Diffusion DiffEdit pipeline * Add draft documentation and import code * Bugfixes and refactoring * Add option to not decode latents in the inversion process * Harmonize preprocessing * Revert "Update Pix2PixZero Auto-correlation Loss" This reverts commit b218062fed08d6cc164206d6cb852b2b7b00847a. 
* Update annotations * rename `compute_mask` to `generate_mask` * Update documentation * Update docs * Update Docs * Fix copy * Change shape of output latents to batch first * Update docs * Add first draft for tests * Bugfix and update tests * Add `cross_attention_kwargs` support for all pipeline methods * Fix Copies * Add support for PIL image latents Add support for mask broadcasting Update docs and tests Align `mask` argument to `mask_image` Remove height and width arguments * Enable MPS Tests * Move example docstrings * Fix test * Fix test * fix pipeline inheritance * Harmonize `prepare_image_latents` with StableDiffusionPix2PixZeroPipeline * Register modules set to `None` in config for `test_save_load_optional_components` * Move fixed logic to specific test class * Clean changes to other pipelines * Update new tests to coordinate with #2953 * Update slow tests for better results * Safety to avoid potential problems with torch.inference_mode * Add reference in SD Pipeline Overview * Fix tests again * Enforce determinism in noise for generate_mask * Fix copies * Widen test tolerance for fp16 based on `test_stable_diffusion_upscale_pipeline_fp16` * Add LoraLoaderMixin and update `prepare_image_latents` * clean up repeat and reg * bugfix * Remove invalid args from docs Suppress spurious warning by repeating image before latent to mask gen * add constant learning rate with custom rule (#3133) * add constant lr with rules * add constant with rules in TYPE_TO_SCHEDULER_FUNCTION * add constant lr rate with rule * hotfix code quality * fix doc style * change name constant_with_rules to piecewise constant * Allow disabling torch 2_0 attention (#3273) * Allow disabling torch 2_0 attention * make style * Update src/diffusers/models/attention.py * [doc] add link to training script (#3271) add link to training script Co-authored-by: yiyixuxu * temp disable spectogram diffusion tests (#3278) The note-seq package throws an error on import because the default installed version of Ipython is not compatible with python 3.8 which we run in the CI. https://github.com/huggingface/diffusers/actions/runs/4830121056/jobs/8605954838#step:7:9 * Changed sample[0] to images[0] (#3304) A pipeline object stores the results in `images` not in `sample`. Current code blocks don't work. * Typo in tutorial (#3295) * Torch compile graph fix (#3286) * fix more * Fix more * fix more * Apply suggestions from code review * fix * make style * make fix-copies * fix * make sure torch compile * Clean * fix test * Postprocessing refactor img2img (#3268) * refactor img2img VaeImageProcessor.postprocess * remove copy from for init, run_safety_checker, decode_latents Co-authored-by: Sayak Paul --------- Co-authored-by: yiyixuxu Co-authored-by: Sayak Paul * [Torch 2.0 compile] Fix more torch compile breaks (#3313) * Fix more torch compile breaks * add tests * Fix all * fix controlnet * fix more * Add Horace He as co-author. > > Co-authored-by: Horace He * Add Horace He as co-author. Co-authored-by: Horace He --------- Co-authored-by: Horace He * fix: scale_lr and sync example readme and docs. (#3299) * fix: scale_lr and sync example readme and docs. * fix doc link. 
* Update stable_diffusion.mdx (#3310) fixed import statement * Fix missing variable assign in DeepFloyd-IF-II (#3315) Fix missing variable assign lol * Correct doc build for patch releases (#3316) Update build_documentation.yml * Add Stable Diffusion RePaint to community pipelines (#3320) * Add Stable Diffsuion RePaint to community pipelines - Adds Stable Diffsuion RePaint to community pipelines - Add Readme enty for pipeline * Fix: Remove wrong import - Remove wrong import - Minor change in comments * Fix: Code formatting of stable_diffusion_repaint * Fix: ruff errors in stable_diffusion_repaint * Fix multistep dpmsolver for cosine schedule (suitable for deepfloyd-if) (#3314) * fix multistep dpmsolver for cosine schedule (deepfloy-if) * fix a typo * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * update all dpmsolver (singlestep, multistep, dpm, dpm++) for cosine noise schedule * add test, fix style --------- Co-authored-by: Patrick von Platen * [docs] Improve LoRA docs (#3311) * update docs * add to toctree * apply feedback * Added input pretubation (#3292) * Added input pretubation * Fixed spelling * Update write_own_pipeline.mdx (#3323) * update controlling generation doc with latest goodies. (#3321) * [Quality] Make style (#3341) * Fix config dpm (#3343) * Add the SDE variant of DPM-Solver and DPM-Solver++ (#3344) * add SDE variant of DPM-Solver and DPM-Solver++ * add test * fix typo * fix typo * Add upsample_size to AttnUpBlock2D, AttnDownBlock2D (#3275) The argument `upsample_size` needs to be added to these modules to allow compatibility with other blocks that require this argument. * Add UniDiffuser classes to __init__ files, modify transformer block to support pre- and post-LN, add fast default tests, fix some bugs. * Update fast tests to use test checkpoints stored on the hub and to better match the reference UniDiffuser implementation. * Fix code with make style. * Revert "Fix code style with make style." This reverts commit 10a174a12c82e6abd3d5a57665719a03dbb85ca7. * Add self.image_encoder, self.text_decoder to list of models to offload to CPU in the enable_sequential_cpu_offload(...)/enable_model_cpu_offload(...) methods to make test_cpu_offload_forward_pass pass. * Fix code quality with make style. * Support using a data type embedding for UniDiffuser-v1. * Add fast test for checking UniDiffuser-v1 sampling. * Make changes so that the repository consistency tests pass. * Add UniDiffuser dummy objects via make fix-copies. * Fix bugs and make improvements to the UniDiffuser pipeline: - Improve batch size inference and fix bugs when num_images_per_prompt or num_prompts_per_image > 1 - Add tests for num_images_per_prompt, num_prompts_per_image > 1 - Improve check_inputs, especially regarding checking supplied latents - Add reset_mode method so that mode inference can be re-enabled after mode is set manually - Fix some warnings related to accessing class members directly instead of through their config - Small amount of refactoring in pipeline_unidiffuser.py * Fix code style with make style. 
* Add/edit docstrings for added classes and public pipeline methods. Also do some light refactoring. * Add documentation for UniDiffuser and fix some typos/formatting in docstrings. * Fix code with make style. * Refactor and improve the UniDiffuser convert_from_ckpt.py script. * Move the UniDiffusers convert_from_ckpy.py script to diffusers/scripts/convert_unidiffuser_to_diffusers.py * Fix code quality via make style. * Improve UniDiffuser slow tests. * make style * Fix some typos in the UniDiffuser docs. * Remove outdated logic based on transformers version in UniDiffuser pipeline __init__.py * Remove dependency on einops by refactoring einops operations to pure torch operations. * make style * Add slow test on full checkpoint for joint mode and correct expected image slices/text prefixes. * make style * Fix mixed precision issue by wrapping the offending code with the torch.autocast context manager. * Revert "Fix mixed precision issue by wrapping the offending code with the torch.autocast context manager." This reverts commit 1a58958ab4f024dbc4c90a6404c2e66210db6d00. * Add fast test for CUDA/fp16 model behavior (currently failing). * Fix the mixed precision issue and add additional tests of the pipeline cuda/fp16 functionality. * make style * Use a CLIPVisionModelWithProjection instead of CLIPVisionModel for image_encoder to better match the original UniDiffuser implementation. * Make style and remove some testing code. * Fix shape errors for the 'joint' and 'img2text' modes. * Fix tests and remove some testing code. * Add option to use fixed latents for UniDiffuserPipelineSlowTests and fix issue in modeling_text_decoder.py. * Improve UniDiffuser docs, particularly the usage examples, and improve slow tests with new expected outputs. * make style * Fix examples to load model in float16. * In image-to-text mode, sample from the autoencoder moment distribution instead of always getting its mode. * make style * When encoding the image using the VAE, scale the image latents by the VAE's scaling factor. * make style * Clean up code and make slow tests pass. * make fix-copies * [docs] Fix docstring (#3334) fix docstring Co-authored-by: Patrick von Platen * if dreambooth lora (#3360) * update IF stage I pipelines add fixed variance schedulers and lora loading * added kv lora attn processor * allow loading into alternative lora attn processor * make vae optional * throw away predicted variance * allow loading into added kv lora layer * allow load T5 * allow pre compute text embeddings * set new variance type in schedulers * fix copies * refactor all prompt embedding code class prompts are now included in pre-encoding code max tokenizer length is now configurable embedding attention mask is now configurable * fix for when variance type is not defined on scheduler * do not pre compute validation prompt if not present * add example test for if lora dreambooth * add check for train text encoder and pre compute text embeddings * Postprocessing refactor all others (#3337) * add text2img * fix-copies * add * add all other pipelines * add * add * add * add * add * make style * style + fix copies --------- Co-authored-by: yiyixuxu * [docs] Improve safetensors docstring (#3368) * clarify safetensor docstring * fix typo * apply feedback * add: a warning message when using xformers in a PT 2.0 env. (#3365) * add: a warning message when using xformers in a PT 2.0 env. 
* Apply suggestions from code review Co-authored-by: Patrick von Platen --------- Co-authored-by: Patrick von Platen * StableDiffusionInpaintingPipeline - resize image w.r.t height and width (#3322) * StableDiffusionInpaintingPipeline now resizes input images and masks w.r.t to passed input height and width. Default is already set to 512. This addresses the common tensor mismatch error. Also moved type check into relevant funciton to keep main pipeline body tidy. * Fixed StableDiffusionInpaintingPrepareMaskAndMaskedImageTests Due to previous commit these tests were failing as height and width need to be passed into the prepare_mask_and_masked_image function, I have updated the code and added a height/width variable per unit test as it seemed more appropriate than the current hard coded solution * Added a resolution test to StableDiffusionInpaintPipelineSlowTests this unit test simply gets the input and resizes it into some that would fail (e.g. would throw a tensor mismatch error/not a mult of 8). Then passes it through the pipeline and verifies it produces output with correct dims w.r.t the passed height and width --------- Co-authored-by: Patrick von Platen * make style * [docs] Adapt a model (#3326) * first draft * apply feedback * conv_in.weight thrown away * [docs] Load safetensors (#3333) * safetensors * apply feedback * apply feedback * Apply suggestions from code review --------- Co-authored-by: Patrick von Platen * make style * [Docs] Fix stable_diffusion.mdx typo (#3398) Fix typo in last code block. Correct "prommpts" to "prompt" * Support ControlNet v1.1 shuffle properly (#3340) * add inferring_controlnet_cond_batch * Revert "add inferring_controlnet_cond_batch" This reverts commit abe8d6311d4b7f5b9409ca709c7fabf80d06c1a9. * set guess_mode to True whenever global_pool_conditions is True Co-authored-by: Patrick von Platen * nit * add integration test --------- Co-authored-by: Patrick von Platen * [Tests] better determinism (#3374) * enable deterministic pytorch and cuda operations. * disable manual seeding. * make style && make quality for unet_2d tests. * enable determinism for the unet2dconditional model. * add CUBLAS_WORKSPACE_CONFIG for better reproducibility. * relax tolerance (very weird issue, though). * revert to torch manual_seed() where needed. * relax more tolerance. * better placement of the cuda variable and relax more tolerance. * enable determinism for 3d condition model. * relax tolerance. * add: determinism to alt_diffusion. * relax tolerance for alt diffusion. * dance diffusion. * dance diffusion is flaky. * test_dict_tuple_outputs_equivalent edit. * fix two more tests. * fix more ddim tests. * fix: argument. * change to diff in place of difference. * fix: test_save_load call. * test_save_load_float16 call. * fix: expected_max_diff * fix: paint by example. * relax tolerance. * add determinism to 1d unet model. * torch 2.0 regressions seem to be brutal * determinism to vae. * add reason to skipping. * up tolerance. * determinism to vq. * determinism to cuda. * determinism to the generic test pipeline file. * refactor general pipelines testing a bit. * determinism to alt diffusion i2i * up tolerance for alt diff i2i and audio diff * up tolerance. * determinism to audioldm * increase tolerance for audioldm lms. * increase tolerance for paint by paint. * increase tolerance for repaint. * determinism to cycle diffusion and sd 1. * relax tol for cycle diffusion 🚲 * relax tol for sd 1.0 * relax tol for controlnet. * determinism to img var. 
* relax tol for img variation. * tolerance to i2i sd * make style * determinism to inpaint. * relax tolerance for inpaiting. * determinism for inpainting legacy * relax tolerance. * determinism to instruct pix2pix * determinism to model editing. * model editing tolerance. * panorama determinism * determinism to pix2pix zero. * determinism to sag. * sd 2. determinism * sd. tolerance * disallow tf32 matmul. * relax tolerance is all you need. * make style and determinism to sd 2 depth * relax tolerance for depth. * tolerance to diffedit. * tolerance to sd 2 inpaint. * up tolerance. * determinism in upscaling. * tolerance in upscaler. * more tolerance relaxation. * determinism to v pred. * up tol for v_pred * unclip determinism * determinism to unclip img2img * determinism to text to video. * determinism to last set of tests * up tol. * vq cumsum doesn't have a deterministic kernel * relax tol * relax tol * [docs] Add transformers to install (#3388) add transformers to install * [deepspeed] partial ZeRO-3 support (#3076) * [deepspeed] partial ZeRO-3 support * cleanup * improve deepspeed fixes * Improve * make style --------- Co-authored-by: Patrick von Platen * Add omegaconf for tests (#3400) Add omegaconfg * Fix various bugs with LoRA Dreambooth and Dreambooth script (#3353) * Improve checkpointing lora * fix more * Improve doc string * Update src/diffusers/loaders.py * make stytle * Apply suggestions from code review * Update src/diffusers/loaders.py * Apply suggestions from code review * Apply suggestions from code review * better * Fix all * Fix multi-GPU dreambooth * Apply suggestions from code review Co-authored-by: Pedro Cuenca * Fix all * make style * make style --------- Co-authored-by: Pedro Cuenca * Fix docker file (#3402) * up * up * fix: deepseepd_plugin retrieval from accelerate state (#3410) * [Docs] Add `sigmoid` beta_scheduler to docstrings of relevant Schedulers (#3399) * Add `sigmoid` beta scheduler to `DDPMScheduler` docstring * Add `sigmoid` beta scheduler to `RePaintScheduler` docstring --------- Co-authored-by: Patrick von Platen * Don't install accelerate and transformers from source (#3415) * Don't install transformers and accelerate from source (#3414) * Improve fast tests (#3416) Update pr_tests.yml * attention refactor: the trilogy (#3387) * Replace `AttentionBlock` with `Attention` * use _from_deprecated_attn_block check re: @patrickvonplaten * [Docs] update the PT 2.0 optimization doc with latest findings (#3370) * add: benchmarking stats for A100 and V100. * Apply suggestions from code review Co-authored-by: Patrick von Platen * address patrick's comments. * add: rtx 4090 stats * ⚔ benchmark reports done * Apply suggestions from code review Co-authored-by: Pedro Cuenca * 3313 pr link. * add: plots. Co-authored-by: Pedro * fix formattimg * update number percent. --------- Co-authored-by: Patrick von Platen Co-authored-by: Pedro Cuenca * Fix style rendering (#3433) * Fix style rendering. 
* Fix typo * unCLIP scheduler do not use note (#3417) * Replace deprecated command with environment file (#3409) Co-authored-by: Patrick von Platen * fix warning message pipeline loading (#3446) * add stable diffusion tensorrt img2img pipeline (#3419) * add stable diffusion tensorrt img2img pipeline Signed-off-by: Asfiya Baig * update docstrings Signed-off-by: Asfiya Baig --------- Signed-off-by: Asfiya Baig * Refactor controlnet and add img2img and inpaint (#3386) * refactor controlnet and add img2img and inpaint * First draft to get pipelines to work * make style * Fix more * Fix more * More tests * Fix more * Make inpainting work * make style and more tests * Apply suggestions from code review * up * make style * Fix imports * Fix more * Fix more * Improve examples * add test * Make sure import is correctly deprecated * Make sure everything works in compile mode * make sure authorship is correctly attributed * [Scheduler] DPM-Solver (++) Inverse Scheduler (#3335) * Add DPM-Solver Multistep Inverse Scheduler * Add draft tests for DiffEdit * Add inverse sde-dpmsolver steps to tune image diversity from inverted latents * Fix tests --------- Co-authored-by: Patrick von Platen * [Docs] Fix incomplete docstring for resnet.py (#3438) Fix incomplete docstrings for resnet.py * fix tiled vae blend extent range (#3384) fix tiled vae bleand extent range * Small update to "Next steps" section (#3443) Small update to "Next steps" section: - PyTorch 2 is recommended. - Updated improvement figures. * Allow arbitrary aspect ratio in IFSuperResolutionPipeline (#3298) * Update pipeline_if_superresolution.py Allow arbitrary aspect ratio in IFSuperResolutionPipeline by using the input image shape * IFSuperResolutionPipeline: allow the user to override the height and width through the arguments * update IFSuperResolutionPipeline width/height doc string to match StableDiffusionInpaintPipeline conventions --------- Co-authored-by: Patrick von Platen * Adding 'strength' parameter to StableDiffusionInpaintingPipeline (#3424) * Added explanation of 'strength' parameter * Added get_timesteps function which relies on new strength parameter * Added `strength` parameter which defaults to 1. * Swapped ordering so `noise_timestep` can be calculated before masking the image this is required when you aren't applying 100% noise to the masked region, e.g. strength < 1. * Added strength to check_inputs, throws error if out of range * Changed `prepare_latents` to initialise latents w.r.t strength inspired from the stable diffusion img2img pipeline, init latents are initialised by converting the init image into a VAE latent and adding noise (based upon the strength parameter passed in), e.g. random when strength = 1, or the init image at strength = 0. 
* WIP: Added a unit test for the new strength parameter in the StableDiffusionInpaintingPipeline still need to add correct regression values * Created a is_strength_max to initialise from pure random noise * Updated unit tests w.r.t new strength parameter + fixed new strength unit test * renamed parameter to avoid confusion with variable of same name * Updated regression values for new strength test - now passes * removed 'copied from' comment as this method is now different and divergent from the cpy * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py Co-authored-by: Patrick von Platen * Ensure backwards compatibility for prepare_mask_and_masked_image created a return_image boolean and initialised to false * Ensure backwards compatibility for prepare_latents * Fixed copy check typo * Fixes w.r.t backward compibility changes * make style * keep function argument ordering same for backwards compatibility in callees with copied from statements * make fix-copies --------- Co-authored-by: Patrick von Platen Co-authored-by: William Berman * [WIP] Bugfix - Pipeline.from_pretrained is broken when the pipeline is partially downloaded (#3448) Added bugfix using f strings. * Fix gradient checkpointing bugs in freezing part of models (requires_grad=False) (#3404) * gradient checkpointing bug fix * bug fix; changes for reviews * reformat * reformat --------- Co-authored-by: Patrick von Platen * Make dreambooth lora more robust to orig unet (#3462) * Make dreambooth lora more robust to orig unet * up * Reduce peak VRAM by releasing large attention tensors (as soon as they're unnecessary) (#3463) Release large tensors in attention (as soon as they're no longer required). Reduces peak VRAM by nearly 2 GB for 1024x1024 (even after slicing), and the savings scale up with image size. * Add min snr to text2img lora training script (#3459) add min snr to text2img lora training script * Add inpaint lora scale support (#3460) * add inpaint lora scale support * add inpaint lora scale test --------- Co-authored-by: yueyang.hyy * [From ckpt] Fix from_ckpt (#3466) * Correct from_ckpt * make style * Update full dreambooth script to work with IF (#3425) * Add IF dreambooth docs (#3470) * parameterize pass single args through tuple (#3477) * attend and excite tests disable determinism on the class level (#3478) * dreambooth docs torch.compile note (#3471) * dreambooth docs torch.compile note * Update examples/dreambooth/README.md Co-authored-by: Sayak Paul * Update examples/dreambooth/README.md Co-authored-by: Pedro Cuenca --------- Co-authored-by: Sayak Paul Co-authored-by: Pedro Cuenca * add: if entry in the dreambooth training docs. (#3472) * [docs] Textual inversion inference (#3473) * add textual inversion inference to docs * add to toctree --------- Co-authored-by: Sayak Paul * [docs] Distributed inference (#3376) * distributed inference * move to inference section * apply feedback * update with split_between_processes * apply feedback * [{Up,Down}sample1d] explicit view kernel size as number elements in flattened indices (#3479) explicit view kernel size as number elements in flattened indices * mps & onnx tests rework (#3449) * Remove ONNX tests from PR. They are already a part of push_tests.yml. * Remove mps tests from PRs. They are already performed on push. * Fix workflow name for fast push tests. * Extract mps tests to a workflow. For better control/filtering. 
* Remove --extra-index-url from mps tests * Increase tolerance of mps test This test passes in my Mac (Ventura 13.3) but fails in the CI hardware (Ventura 13.2). I ran the local tests following the same steps that exist in the CI workflow. * Temporarily run mps tests on pr So we can test. * Revert "Temporarily run mps tests on pr" Tests passed, go back to running on push. * [Attention processor] Better warning message when shifting to `AttnProcessor2_0` (#3457) * add: debugging to enabling memory efficient processing * add: better warning message. * [Docs] add note on local directory path. (#3397) add note on local directory path. Co-authored-by: Patrick von Platen * Refactor full determinism (#3485) * up * fix more * Apply suggestions from code review * fix more * fix more * Check it * Remove 16:8 * fix more * fix more * fix more * up * up * Test only stable diffusion * Test only two files * up * Try out spinning up processes that can be killed * up * Apply suggestions from code review * up * up * Fix DPM single (#3413) * Fix DPM single * add test * fix one more bug * Apply suggestions from code review Co-authored-by: StAlKeR7779 --------- Co-authored-by: StAlKeR7779 * Add `use_Karras_sigmas` to DPMSolverSinglestepScheduler (#3476) * add use_karras_sigmas * add karras test * add doc * Adds local_files_only bool to prevent forced online connection (#3486) * make style * [Docs] Korean translation (optimization, training) (#3488) * feat) optimization kr translation * fix) typo, italic setting * feat) dreambooth, text2image kr * feat) lora kr * fix) LoRA * fix) fp16 fix * fix) doc-builder style * fix) fp16 일부 단어 수정 * fix) fp16 style fix * fix) opt, training docs update * feat) toctree update * feat) toctree update --------- Co-authored-by: Chanran Kim * DataLoader respecting EXIF data in Training Images (#3465) * DataLoader will now bake in any transforms or image manipulations contained in the EXIF Images may have rotations stored in EXIF. Training using such images will cause those transforms to be ignored while training and thus produce unexpected results * Fixed the Dataloading EXIF issue in main DreamBooth training as well * Run make style (black & isort) * make style * feat: allow disk offload for diffuser models (#3285) * allow disk offload for diffuser models * sort import * add max_memory argument * Changed sample[0] to images[0] (#3304) A pipeline object stores the results in `images` not in `sample`. Current code blocks don't work. * Typo in tutorial (#3295) * Torch compile graph fix (#3286) * fix more * Fix more * fix more * Apply suggestions from code review * fix * make style * make fix-copies * fix * make sure torch compile * Clean * fix test * Postprocessing refactor img2img (#3268) * refactor img2img VaeImageProcessor.postprocess * remove copy from for init, run_safety_checker, decode_latents Co-authored-by: Sayak Paul --------- Co-authored-by: yiyixuxu Co-authored-by: Sayak Paul * [Torch 2.0 compile] Fix more torch compile breaks (#3313) * Fix more torch compile breaks * add tests * Fix all * fix controlnet * fix more * Add Horace He as co-author. > > Co-authored-by: Horace He * Add Horace He as co-author. Co-authored-by: Horace He --------- Co-authored-by: Horace He * fix: scale_lr and sync example readme and docs. (#3299) * fix: scale_lr and sync example readme and docs. * fix doc link. 
* Update stable_diffusion.mdx (#3310) fixed import statement * Fix missing variable assign in DeepFloyd-IF-II (#3315) Fix missing variable assign lol * Correct doc build for patch releases (#3316) Update build_documentation.yml * Add Stable Diffusion RePaint to community pipelines (#3320) * Add Stable Diffsuion RePaint to community pipelines - Adds Stable Diffsuion RePaint to community pipelines - Add Readme enty for pipeline * Fix: Remove wrong import - Remove wrong import - Minor change in comments * Fix: Code formatting of stable_diffusion_repaint * Fix: ruff errors in stable_diffusion_repaint * Fix multistep dpmsolver for cosine schedule (suitable for deepfloyd-if) (#3314) * fix multistep dpmsolver for cosine schedule (deepfloy-if) * fix a typo * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * Update src/diffusers/schedulers/scheduling_dpmsolver_multistep.py Co-authored-by: Patrick von Platen * update all dpmsolver (singlestep, multistep, dpm, dpm++) for cosine noise schedule * add test, fix style --------- Co-authored-by: Patrick von Platen * [docs] Improve LoRA docs (#3311) * update docs * add to toctree * apply feedback * Added input pretubation (#3292) * Added input pretubation * Fixed spelling * Update write_own_pipeline.mdx (#3323) * update controlling generation doc with latest goodies. (#3321) * [Quality] Make style (#3341) * Fix config dpm (#3343) * Add the SDE variant of DPM-Solver and DPM-Solver++ (#3344) * add SDE variant of DPM-Solver and DPM-Solver++ * add test * fix typo * fix typo * Add upsample_size to AttnUpBlock2D, AttnDownBlock2D (#3275) The argument `upsample_size` needs to be added to these modules to allow compatibility with other blocks that require this argument. * Rename --only_save_embeds to --save_as_full_pipeline (#3206) * Set --only_save_embeds to False by default Due to how the option is named, it makes more sense to behave like this. * Refactor only_save_embeds to save_as_full_pipeline * [AudioLDM] Generalise conversion script (#3328) Co-authored-by: Patrick von Platen * Fix TypeError when using prompt_embeds and negative_prompt (#2982) * test: Added test case * fix: fixed type checking issue on _encode_prompt * fix: fixed copies consistency * fix: one copy was not sufficient * Fix pipeline class on README (#3345) Update README.md * Inpainting: typo in docs (#3331) Typo in docs Co-authored-by: Patrick von Platen * Add `use_Karras_sigmas` to LMSDiscreteScheduler (#3351) * add karras sigma to lms discrete scheduler * add test for lms_scheduler karras * reformat test lms * Batched load of textual inversions (#3277) * Batched load of textual inversions - Only call resize_token_embeddings once per batch as it is the most expensive operation - Allow pretrained_model_name_or_path and token to be an optional list - Remove Dict from type annotation pretrained_model_name_or_path as it was not supported in this function - Add comment that single files (e.g. 
.pt/.safetensors) are supported - Add comment for token parameter - Convert token override log message from warning to info * Update src/diffusers/loaders.py Check for duplicate tokens Co-authored-by: Patrick von Platen * Update condition for None tokens --------- Co-authored-by: Patrick von Platen * make fix-copies * [docs] Fix docstring (#3334) fix docstring Co-authored-by: Patrick von Platen * if dreambooth lora (#3360) * update IF stage I pipelines add fixed variance schedulers and lora loading * added kv lora attn processor * allow loading into alternative lora attn processor * make vae optional * throw away predicted variance * allow loading into added kv lora layer * allow load T5 * allow pre compute text embeddings * set new variance type in schedulers * fix copies * refactor all prompt embedding code class prompts are now included in pre-encoding code max tokenizer length is now configurable embedding attention mask is now configurable * fix for when variance type is not defined on scheduler * do not pre compute validation prompt if not present * add example test for if lora dreambooth * add check for train text encoder and pre compute text embeddings * Postprocessing refactor all others (#3337) * add text2img * fix-copies * add * add all other pipelines * add * add * add * add * add * make style * style + fix copies --------- Co-authored-by: yiyixuxu * [docs] Improve safetensors docstring (#3368) * clarify safetensor docstring * fix typo * apply feedback * add: a warning message when using xformers in a PT 2.0 env. (#3365) * add: a warning message when using xformers in a PT 2.0 env. * Apply suggestions from code review Co-authored-by: Patrick von Platen --------- Co-authored-by: Patrick von Platen * StableDiffusionInpaintingPipeline - resize image w.r.t height and width (#3322) * StableDiffusionInpaintingPipeline now resizes input images and masks w.r.t to passed input height and width. Default is already set to 512. This addresses the common tensor mismatch error. Also moved type check into relevant funciton to keep main pipeline body tidy. * Fixed StableDiffusionInpaintingPrepareMaskAndMaskedImageTests Due to previous commit these tests were failing as height and width need to be passed into the prepare_mask_and_masked_image function, I have updated the code and added a height/width variable per unit test as it seemed more appropriate than the current hard coded solution * Added a resolution test to StableDiffusionInpaintPipelineSlowTests this unit test simply gets the input and resizes it into some that would fail (e.g. would throw a tensor mismatch error/not a mult of 8). Then passes it through the pipeline and verifies it produces output with correct dims w.r.t the passed height and width --------- Co-authored-by: Patrick von Platen * make style * [docs] Adapt a model (#3326) * first draft * apply feedback * conv_in.weight thrown away * [docs] Load safetensors (#3333) * safetensors * apply feedback * apply feedback * Apply suggestions from code review --------- Co-authored-by: Patrick von Platen * make style * [Docs] Fix stable_diffusion.mdx typo (#3398) Fix typo in last code block. Correct "prommpts" to "prompt" * Support ControlNet v1.1 shuffle properly (#3340) * add inferring_controlnet_cond_batch * Revert "add inferring_controlnet_cond_batch" This reverts commit abe8d6311d4b7f5b9409ca709c7fabf80d06c1a9. 
* set guess_mode to True whenever global_pool_conditions is True Co-authored-by: Patrick von Platen * nit * add integration test --------- Co-authored-by: Patrick von Platen * [Tests] better determinism (#3374) * enable deterministic pytorch and cuda operations. * disable manual seeding. * make style && make quality for unet_2d tests. * enable determinism for the unet2dconditional model. * add CUBLAS_WORKSPACE_CONFIG for better reproducibility. * relax tolerance (very weird issue, though). * revert to torch manual_seed() where needed. * relax more tolerance. * better placement of the cuda variable and relax more tolerance. * enable determinism for 3d condition model. * relax tolerance. * add: determinism to alt_diffusion. * relax tolerance for alt diffusion. * dance diffusion. * dance diffusion is flaky. * test_dict_tuple_outputs_equivalent edit. * fix two more tests. * fix more ddim tests. * fix: argument. * change to diff in place of difference. * fix: test_save_load call. * test_save_load_float16 call. * fix: expected_max_diff * fix: paint by example. * relax tolerance. * add determinism to 1d unet model. * torch 2.0 regressions seem to be brutal * determinism to vae. * add reason to skipping. * up tolerance. * determinism to vq. * determinism to cuda. * determinism to the generic test pipeline file. * refactor general pipelines testing a bit. * determinism to alt diffusion i2i * up tolerance for alt diff i2i and audio diff * up tolerance. * determinism to audioldm * increase tolerance for audioldm lms. * increase tolerance for paint by paint. * increase tolerance for repaint. * determinism to cycle diffusion and sd 1. * relax tol for cycle diffusion 🚲 * relax tol for sd 1.0 * relax tol for controlnet. * determinism to img var. * relax tol for img variation. * tolerance to i2i sd * make style * determinism to inpaint. * relax tolerance for inpaiting. * determinism for inpainting legacy * relax tolerance. * determinism to instruct pix2pix * determinism to model editing. * model editing tolerance. * panorama determinism * determinism to pix2pix zero. * determinism to sag. * sd 2. determinism * sd. tolerance * disallow tf32 matmul. * relax tolerance is all you need. * make style and determinism to sd 2 depth * relax tolerance for depth. * tolerance to diffedit. * tolerance to sd 2 inpaint. * up tolerance. * determinism in upscaling. * tolerance in upscaler. * more tolerance relaxation. * determinism to v pred. * up tol for v_pred * unclip determinism * determinism to unclip img2img * determinism to text to video. * determinism to last set of tests * up tol. 
* vq cumsum doesn't have a deterministic kernel * relax tol * relax tol * [docs] Add transformers to install (#3388) add transformers to install * [deepspeed] partial ZeRO-3 support (#3076) * [deepspeed] partial ZeRO-3 support * cleanup * improve deepspeed fixes * Improve * make style --------- Co-authored-by: Patrick von Platen * Add omegaconf for tests (#3400) Add omegaconfg * Fix various bugs with LoRA Dreambooth and Dreambooth script (#3353) * Improve checkpointing lora * fix more * Improve doc string * Update src/diffusers/loaders.py * make stytle * Apply suggestions from code review * Update src/diffusers/loaders.py * Apply suggestions from code review * Apply suggestions from code review * better * Fix all * Fix multi-GPU dreambooth * Apply suggestions from code review Co-authored-by: Pedro Cuenca * Fix all * make style * make style --------- Co-authored-by: Pedro Cuenca * Fix docker file (#3402) * up * up * fix: deepseepd_plugin retrieval from accelerate state (#3410) * [Docs] Add `sigmoid` beta_scheduler to docstrings of relevant Schedulers (#3399) * Add `sigmoid` beta scheduler to `DDPMScheduler` docstring * Add `sigmoid` beta scheduler to `RePaintScheduler` docstring --------- Co-authored-by: Patrick von Platen * Don't install accelerate and transformers from source (#3415) * Don't install transformers and accelerate from source (#3414) * Improve fast tests (#3416) Update pr_tests.yml * attention refactor: the trilogy (#3387) * Replace `AttentionBlock` with `Attention` * use _from_deprecated_attn_block check re: @patrickvonplaten * [Docs] update the PT 2.0 optimization doc with latest findings (#3370) * add: benchmarking stats for A100 and V100. * Apply suggestions from code review Co-authored-by: Patrick von Platen * address patrick's comments. * add: rtx 4090 stats * ⚔ benchmark reports done * Apply suggestions from code review Co-authored-by: Pedro Cuenca * 3313 pr link. * add: plots. Co-authored-by: Pedro * fix formattimg * update number percent. --------- Co-authored-by: Patrick von Platen Co-authored-by: Pedro Cuenca * Fix style rendering (#3433) * Fix style rendering. 
* Fix typo * unCLIP scheduler do not use note (#3417) * Replace deprecated command with environment file (#3409) Co-authored-by: Patrick von Platen * fix warning message pipeline loading (#3446) * add stable diffusion tensorrt img2img pipeline (#3419) * add stable diffusion tensorrt img2img pipeline Signed-off-by: Asfiya Baig * update docstrings Signed-off-by: Asfiya Baig --------- Signed-off-by: Asfiya Baig * Refactor controlnet and add img2img and inpaint (#3386) * refactor controlnet and add img2img and inpaint * First draft to get pipelines to work * make style * Fix more * Fix more * More tests * Fix more * Make inpainting work * make style and more tests * Apply suggestions from code review * up * make style * Fix imports * Fix more * Fix more * Improve examples * add test * Make sure import is correctly deprecated * Make sure everything works in compile mode * make sure authorship is correctly attributed * [Scheduler] DPM-Solver (++) Inverse Scheduler (#3335) * Add DPM-Solver Multistep Inverse Scheduler * Add draft tests for DiffEdit * Add inverse sde-dpmsolver steps to tune image diversity from inverted latents * Fix tests --------- Co-authored-by: Patrick von Platen * [Docs] Fix incomplete docstring for resnet.py (#3438) Fix incomplete docstrings for resnet.py * fix tiled vae blend extent range (#3384) fix tiled vae bleand extent range * Small update to "Next steps" section (#3443) Small update to "Next steps" section: - PyTorch 2 is recommended. - Updated improvement figures. * Allow arbitrary aspect ratio in IFSuperResolutionPipeline (#3298) * Update pipeline_if_superresolution.py Allow arbitrary aspect ratio in IFSuperResolutionPipeline by using the input image shape * IFSuperResolutionPipeline: allow the user to override the height and width through the arguments * update IFSuperResolutionPipeline width/height doc string to match StableDiffusionInpaintPipeline conventions --------- Co-authored-by: Patrick von Platen * Adding 'strength' parameter to StableDiffusionInpaintingPipeline (#3424) * Added explanation of 'strength' parameter * Added get_timesteps function which relies on new strength parameter * Added `strength` parameter which defaults to 1. * Swapped ordering so `noise_timestep` can be calculated before masking the image this is required when you aren't applying 100% noise to the masked region, e.g. strength < 1. * Added strength to check_inputs, throws error if out of range * Changed `prepare_latents` to initialise latents w.r.t strength inspired from the stable diffusion img2img pipeline, init latents are initialised by converting the init image into a VAE latent and adding noise (based upon the strength parameter passed in), e.g. random when strength = 1, or the init image at strength = 0. 
* WIP: Added a unit test for the new strength parameter in the StableDiffusionInpaintingPipeline still need to add correct regression values * Created a is_strength_max to initialise from pure random noise * Updated unit tests w.r.t new strength parameter + fixed new strength unit test * renamed parameter to avoid confusion with variable of same name * Updated regression values for new strength test - now passes * removed 'copied from' comment as this method is now different and divergent from the cpy * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py Co-authored-by: Patrick von Platen * Ensure backwards compatibility for prepare_mask_and_masked_image created a return_image boolean and initialised to false * Ensure backwards compatibility for prepare_latents * Fixed copy check typo * Fixes w.r.t backward compibility changes * make style * keep function argument ordering same for backwards compatibility in callees with copied from statements * make fix-copies --------- Co-authored-by: Patrick von Platen Co-authored-by: William Berman * [WIP] Bugfix - Pipeline.from_pretrained is broken when the pipeline is partially downloaded (#3448) Added bugfix using f strings. * Fix gradient checkpointing bugs in freezing part of models (requires_grad=False) (#3404) * gradient checkpointing bug fix * bug fix; changes for reviews * reformat * reformat --------- Co-authored-by: Patrick von Platen * Make dreambooth lora more robust to orig unet (#3462) * Make dreambooth lora more robust to orig unet * up * Reduce peak VRAM by releasing large attention tensors (as soon as they're unnecessary) (#3463) Release large tensors in attention (as soon as they're no longer required). Reduces peak VRAM by nearly 2 GB for 1024x1024 (even after slicing), and the savings scale up with image size. * Add min snr to text2img lora training script (#3459) add min snr to text2img lora training script * Add inpaint lora scale support (#3460) * add inpaint lora scale support * add inpaint lora scale test --------- Co-authored-by: yueyang.hyy * [From ckpt] Fix from_ckpt (#3466) * Correct from_ckpt * make style * Update full dreambooth script to work with IF (#3425) * Add IF dreambooth docs (#3470) * parameterize pass single args through tuple (#3477) * attend and excite tests disable determinism on the class level (#3478) * dreambooth docs torch.compile note (#3471) * dreambooth docs torch.compile note * Update examples/dreambooth/README.md Co-authored-by: Sayak Paul * Update examples/dreambooth/README.md Co-authored-by: Pedro Cuenca --------- Co-authored-by: Sayak Paul Co-authored-by: Pedro Cuenca * add: if entry in the dreambooth training docs. (#3472) * [docs] Textual inversion inference (#3473) * add textual inversion inference to docs * add to toctree --------- Co-authored-by: Sayak Paul * [docs] Distributed inference (#3376) * distributed inference * move to inference section * apply feedback * update with split_between_processes * apply feedback * [{Up,Down}sample1d] explicit view kernel size as number elements in flattened indices (#3479) explicit view kernel size as number elements in flattened indices * mps & onnx tests rework (#3449) * Remove ONNX tests from PR. They are already a part of push_tests.yml. * Remove mps tests from PRs. They are already performed on push. * Fix workflow name for fast push tests. * Extract mps tests to a workflow. For better control/filtering. 
* Remove --extra-index-url from mps tests * Increase tolerance of mps test This test passes in my Mac (Ventura 13.3) but fails in the CI hardware (Ventura 13.2). I ran the local tests following the same steps that exist in the CI workflow. * Temporarily run mps tests on pr So we can test. * Revert "Temporarily run mps tests on pr" Tests passed, go back to running on push. --------- Signed-off-by: Asfiya Baig Co-authored-by: Ilia Larchenko <41329713+IliaLarchenko@users.noreply.github.com> Co-authored-by: Patrick von Platen Co-authored-by: YiYi Xu Co-authored-by: yiyixuxu Co-authored-by: Sayak Paul Co-authored-by: Horace He Co-authored-by: Umar <55330742+mu94-csl@users.noreply.github.com> Co-authored-by: Mylo <36931363+gitmylo@users.noreply.github.com> Co-authored-by: Markus Pobitzer Co-authored-by: Cheng Lu Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Isamu Isozaki Co-authored-by: Cesar Aybar Co-authored-by: Will Rice Co-authored-by: Adrià Arrufat <1671644+arrufat@users.noreply.github.com> Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Co-authored-by: At-sushi Co-authored-by: Lucca Zenóbio Co-authored-by: Lysandre Debut Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Co-authored-by: pdoane Co-authored-by: Will Berman Co-authored-by: yiyixuxu Co-authored-by: Rupert Menneer <71332436+rupertmenneer@users.noreply.github.com> Co-authored-by: sudowind Co-authored-by: Takuma Mori Co-authored-by: Stas Bekman Co-authored-by: Pedro Cuenca Co-authored-by: Laureηt Co-authored-by: Jongwoo Han Co-authored-by: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Co-authored-by: clarencechen Co-authored-by: Laureηt Co-authored-by: superlabs-dev <133080491+superlabs-dev@users.noreply.github.com> Co-authored-by: Dev Aggarwal Co-authored-by: Vimarsh Chaturvedi Co-authored-by: 7eu7d7 <31194890+7eu7d7@users.noreply.github.com> Co-authored-by: cmdr2 Co-authored-by: wfng92 <43742196+wfng92@users.noreply.github.com> Co-authored-by: Glaceon-Hyy Co-authored-by: yueyang.hyy * [Community] reference only control (#3435) * add reference only control * add reference only control * add reference only control * fix lint * fix lint * reference adain * bugfix EulerAncestralDiscreteScheduler * fix style fidelity rule * fix default output size * del unused line * fix deterministic * Support for cross-attention bias / mask (#2634) * Cross-attention masks prefer qualified symbol, fix accidental Optional prefer qualified symbol in AttentionProcessor prefer qualified symbol in embeddings.py qualified symbol in transformed_2d qualify FloatTensor in unet_2d_blocks move new transformer_2d params attention_mask, encoder_attention_mask to the end of the section which is assumed (e.g. by functions such as checkpoint()) to have a stable positional param interface. regard return_dict as a special-case which is assumed to be injected separately from positional params (e.g. by create_custom_forward()). move new encoder_attention_mask param to end of CrossAttn block interfaces and Unet2DCondition interface, to maintain positional param interface. 
regenerate modeling_text_unet.py remove unused import unet_2d_condition encoder_attention_mask docs Co-authored-by: Pedro Cuenca versatile_diffusion/modeling_text_unet.py encoder_attention_mask docs Co-authored-by: Pedro Cuenca transformer_2d encoder_attention_mask docs Co-authored-by: Pedro Cuenca unet_2d_blocks.py: add parameter name comments Co-authored-by: Pedro Cuenca revert description. bool-to-bias treatment happens in unet_2d_condition only. comment parameter names fix copies, style * encoder_attention_mask for SimpleCrossAttnDownBlock2D, SimpleCrossAttnUpBlock2D * encoder_attention_mask for UNetMidBlock2DSimpleCrossAttn * support attention_mask, encoder_attention_mask in KCrossAttnDownBlock2D, KCrossAttnUpBlock2D, KAttentionBlock. fix binding of attention_mask, cross_attention_kwargs params in KCrossAttnDownBlock2D, KCrossAttnUpBlock2D checkpoint invocations. * fix mistake made during merge conflict resolution * regenerate versatile_diffusion * pass time embedding into checkpointed attention invocation * always assume encoder_attention_mask is a mask (i.e. not a bias). * style, fix-copies * add tests for cross-attention masks * add test for padding of attention mask * explain mask's query_tokens dim. fix explanation about broadcasting over channels; we actually broadcast over query tokens * support both masks and biases in Transformer2DModel#forward. document behaviour * fix-copies * delete attention_mask docs on the basis I never tested self-attention masking myself. not comfortable explaining it, since I don't actually understand how a self-attn mask can work in its current form: the key length will be different in every ResBlock (we don't downsample the mask when we downsample the image). * review feedback: the standard Unet blocks shouldn't pass temb to attn (only to resnet). remove from KCrossAttnDownBlock2D,KCrossAttnUpBlock2D#forward. * remove encoder_attention_mask param from SimpleCrossAttn{Up,Down}Block2D,UNetMidBlock2DSimpleCrossAttn, and mask-choice in those blocks' #forward, on the basis that they only do one type of attention, so the consumer can pass whichever type of attention_mask is appropriate. * put attention mask padding back to how it was (since the SD use-case it enabled wasn't important, and it breaks the original unclip use-case). disable the test which was added. * fix-copies * style * fix-copies * put encoder_attention_mask param back into Simple block forward interfaces, to ensure consistency of forward interface. * restore passing of emb to KAttentionBlock#forward, on the basis that removal caused test failures. restore also the passing of emb to checkpointed calls to KAttentionBlock#forward. * make simple unet2d blocks use encoder_attention_mask, but only when attention_mask is None. this should fix UnCLIP compatibility. * fix copies * do not scale the initial global step by gradient accumulation steps when loading from checkpoint (#3506) * Remove CPU latents logic for UniDiffuserPipelineFastTests. * make style * Revert "Clean up code and make slow tests pass." This reverts commit ec7fb8735bfdb051de7110cbe678327b461aa88e. * Revert bad commit and clean up code. * add: contributor note. 
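The cross-attention mask commit (#2634) above repeatedly refers to a "bool-to-bias treatment": boolean attention masks are converted into additive biases before they reach the attention softmax. The snippet below is only a generic illustration of that idea under stated assumptions, not the library's exact code path, and the helper name is made up for the example:

```python
import torch

def bool_mask_to_bias(mask: torch.Tensor, dtype: torch.dtype = torch.float16) -> torch.Tensor:
    """Turn a boolean keep/ignore mask into an additive attention bias.

    True (or 1) means "attend", False (or 0) means "ignore"; ignored positions
    receive a large negative value so they vanish after the softmax.
    """
    bias = torch.zeros(mask.shape, dtype=dtype)
    bias.masked_fill_(~mask.bool(), torch.finfo(dtype).min)
    return bias

# Example: mask out the last two of eight encoder tokens for a batch of one.
mask = torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0]])
print(bool_mask_to_bias(mask))
```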
* Batched load of textual inversions (#3277) * Batched load of textual inversions - Only call resize_token_embeddings once per batch as it is the most expensive operation - Allow pretrained_model_name_or_path and token to be an optional list - Remove Dict from type annotation pretrained_model_name_or_path as it was not supported in this function - Add comment that single files (e.g. .pt/.safetensors) are supported - Add comment for token parameter - Convert token override log message from warning to info * Update src/diffusers/loaders.py Check for duplicate tokens Co-authored-by: Patrick von Platen * Update condition for None tokens --------- Co-authored-by: Patrick von Platen * Revert "add: contributor note." This reverts commit 302fde940901093be9188553ec27ffc02c3256f2. * Re-add contributor note and refactored fast tests fixed latents code to remove CPU specific logic. * make style * Refactored the code: - Updated the checkpoint ids to the new ids where appropriate - Refactored the UniDiffuserTextDecoder methods to return only tensors (and made other changes to support this) - Cleaned up the code following suggestions by patrickvonplaten * make style * Remove padding logic from UniDiffuserTextDecoder.generate_beam since the inputs are already padded to a consistent length. * Update checkpoint id for small test v1 checkpoint to hf-internal-testing/unidiffuser-test-v1. * make style * Make improvements to the documentation. * Move ImageTextPipelineOutput documentation from /api/pipelines/unidiffuser.mdx to /api/diffusion_pipeline.mdx. * Change order of arguments for UniDiffuserTextDecoder.generate_beam. * make style * Update docs/source/en/api/pipelines/unidiffuser.mdx --------- Signed-off-by: Asfiya Baig Signed-off-by: Ye, Xinyu Co-authored-by: Ernie Chu <51432514+ernestchu@users.noreply.github.com> Co-authored-by: Sayak Paul Co-authored-by: Andranik Movsisyan <48154088+19and99@users.noreply.github.com> Co-authored-by: Patrick von Platen Co-authored-by: Andreas Steiner Co-authored-by: YiYi Xu Co-authored-by: Pedro Cuenca Co-authored-by: Joseph Coffland Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Takuma Mori Co-authored-by: Will Berman Co-authored-by: Tommaso De Rossi Co-authored-by: Cristian Garcia Co-authored-by: cmdr2 Co-authored-by: 1lint <105617163+1lint@users.noreply.github.com> Co-authored-by: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Co-authored-by: Chanchana Sornsoontorn Co-authored-by: hwuebben Co-authored-by: superhero-7 <57797766+superhero-7@users.noreply.github.com> Co-authored-by: root Co-authored-by: nupurkmr9 Co-authored-by: Nupur Kumari Co-authored-by: Nupur Kumari Co-authored-by: Mishig Co-authored-by: XinyuYe-Intel Co-authored-by: clarencechen Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> Co-authored-by: Suraj Patil Co-authored-by: Youssef Adarrab <104783077+youssefadr@users.noreply.github.com> Co-authored-by: Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com> Co-authored-by: Chengrui Wang <80876977+crywang@users.noreply.github.com> Co-authored-by: SkyTNT Co-authored-by: Lucca Zenóbio Co-authored-by: Isaac <34376531+init-22@users.noreply.github.com> Co-authored-by: pdoane Co-authored-by: Yuchen Fan Co-authored-by: Nipun Jindal Co-authored-by: njindal Co-authored-by: apolinário Co-authored-by: multimodalart Co-authored-by: Xie Zejian Co-authored-by: Jair Trejo Co-authored-by: Robert Dargavel Smith Co-authored-by: yiyixuxu Co-authored-by: Joqsan 
<6027118+Joqsan@users.noreply.github.com> Co-authored-by: NimenDavid <312648004@qq.com> Co-authored-by: M. Tolga Cangöz <46008593+standardAI@users.noreply.github.com> Co-authored-by: timegate Co-authored-by: Jason Kuan Co-authored-by: Ilia Larchenko <41329713+IliaLarchenko@users.noreply.github.com> Co-authored-by: Horace He Co-authored-by: Umar <55330742+mu94-csl@users.noreply.github.com> Co-authored-by: Mylo <36931363+gitmylo@users.noreply.github.com> Co-authored-by: Markus Pobitzer Co-authored-by: Cheng Lu Co-authored-by: Isamu Isozaki Co-authored-by: Cesar Aybar Co-authored-by: Will Rice Co-authored-by: yiyixuxu Co-authored-by: Rupert Menneer <71332436+rupertmenneer@users.noreply.github.com> Co-authored-by: sudowind Co-authored-by: Stas Bekman Co-authored-by: Laureηt Co-authored-by: Jongwoo Han Co-authored-by: Laureηt Co-authored-by: superlabs-dev <133080491+superlabs-dev@users.noreply.github.com> Co-authored-by: Dev Aggarwal Co-authored-by: Vimarsh Chaturvedi Co-authored-by: 7eu7d7 <31194890+7eu7d7@users.noreply.github.com> Co-authored-by: cmdr2 Co-authored-by: wfng92 <43742196+wfng92@users.noreply.github.com> Co-authored-by: Glaceon-Hyy Co-authored-by: yueyang.hyy Co-authored-by: StAlKeR7779 Co-authored-by: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Co-authored-by: w4ffl35 Co-authored-by: Seongsu Park Co-authored-by: Chanran Kim Co-authored-by: Ambrosiussen Co-authored-by: Hari Krishna <37787894+hari10599@users.noreply.github.com> Co-authored-by: Adrià Arrufat <1671644+arrufat@users.noreply.github.com> Co-authored-by: At-sushi Co-authored-by: Lysandre Debut Co-authored-by: takuoko Co-authored-by: Birch-san --- docs/source/en/_toctree.yml | 2 + docs/source/en/api/diffusion_pipeline.mdx | 5 + docs/source/en/api/pipelines/unidiffuser.mdx | 204 +++ scripts/convert_unidiffuser_to_diffusers.py | 776 +++++++++ src/diffusers/__init__.py | 4 + src/diffusers/pipelines/__init__.py | 1 + .../pipelines/unidiffuser/__init__.py | 20 + .../unidiffuser/modeling_text_decoder.py | 294 ++++ .../pipelines/unidiffuser/modeling_uvit.py | 1196 ++++++++++++++ .../unidiffuser/pipeline_unidiffuser.py | 1422 +++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 60 + tests/pipelines/unidiffuser/__init__.py | 0 .../pipelines/unidiffuser/test_unidiffuser.py | 670 ++++++++ 13 files changed, 4654 insertions(+) create mode 100644 docs/source/en/api/pipelines/unidiffuser.mdx create mode 100644 scripts/convert_unidiffuser_to_diffusers.py create mode 100644 src/diffusers/pipelines/unidiffuser/__init__.py create mode 100644 src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py create mode 100644 src/diffusers/pipelines/unidiffuser/modeling_uvit.py create mode 100644 src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py create mode 100644 tests/pipelines/unidiffuser/__init__.py create mode 100644 tests/pipelines/unidiffuser/test_unidiffuser.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 704fb4d5290d..86b0da3de303 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -232,6 +232,8 @@ title: UnCLIP - local: api/pipelines/latent_diffusion_uncond title: Unconditional Latent Diffusion + - local: api/pipelines/unidiffuser + title: UniDiffuser - local: api/pipelines/versatile_diffusion title: Versatile Diffusion - local: api/pipelines/vq_diffusion diff --git a/docs/source/en/api/diffusion_pipeline.mdx b/docs/source/en/api/diffusion_pipeline.mdx index 280802d6a89a..66e5b7b23bbb 100644 --- a/docs/source/en/api/diffusion_pipeline.mdx +++ 
b/docs/source/en/api/diffusion_pipeline.mdx @@ -45,3 +45,8 @@ By default diffusion pipelines return an object of class By default diffusion pipelines return an object of class [[autodoc]] pipelines.AudioPipelineOutput + +## ImageTextPipelineOutput +By default diffusion pipelines return an object of class + +[[autodoc]] ImageTextPipelineOutput diff --git a/docs/source/en/api/pipelines/unidiffuser.mdx b/docs/source/en/api/pipelines/unidiffuser.mdx new file mode 100644 index 000000000000..10290e263e6d --- /dev/null +++ b/docs/source/en/api/pipelines/unidiffuser.mdx @@ -0,0 +1,204 @@ + + +# UniDiffuser + +The UniDiffuser model was proposed in [One Transformer Fits All Distributions in Multi-Modal Diffusion at Scale](https://arxiv.org/abs/2303.06555) by Fan Bao, Shen Nie, Kaiwen Xue, Chongxuan Li, Shi Pu, Yaole Wang, Gang Yue, Yue Cao, Hang Su, Jun Zhu. + +The abstract of the [paper](https://arxiv.org/abs/2303.06555) is the following: + +*This paper proposes a unified diffusion framework (dubbed UniDiffuser) to fit all distributions relevant to a set of multi-modal data in one model. Our key insight is -- learning diffusion models for marginal, conditional, and joint distributions can be unified as predicting the noise in the perturbed data, where the perturbation levels (i.e. timesteps) can be different for different modalities. Inspired by the unified view, UniDiffuser learns all distributions simultaneously with a minimal modification to the original diffusion model -- perturbs data in all modalities instead of a single modality, inputs individual timesteps in different modalities, and predicts the noise of all modalities instead of a single modality. UniDiffuser is parameterized by a transformer for diffusion models to handle input types of different modalities. Implemented on large-scale paired image-text data, UniDiffuser is able to perform image, text, text-to-image, image-to-text, and image-text pair generation by setting proper timesteps without additional overhead. In particular, UniDiffuser is able to produce perceptually realistic samples in all tasks and its quantitative results (e.g., the FID and CLIP score) are not only superior to existing general-purpose models but also comparable to the bespoken models (e.g., Stable Diffusion and DALL-E 2) in representative tasks (e.g., text-to-image generation).* + +Resources: + +* [Paper](https://arxiv.org/abs/2303.06555). +* [Original Code](https://github.com/thu-ml/unidiffuser). + +Available Checkpoints are: +- *UniDiffuser-v0 (512x512 resolution)* [thu-ml/unidiffuser-v0](https://huggingface.co/thu-ml/unidiffuser-v0) +- *UniDiffuser-v1 (512x512 resolution)* [thu-ml/unidiffuser-v1](https://huggingface.co/thu-ml/unidiffuser-v1) + +This pipeline was contributed by our community member [dg845](https://github.com/dg845). + +## Available Pipelines: + +| Pipeline | Tasks | Demo | Colab | +|:---:|:---:|:---:|:---:| +| [UniDiffuserPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_unidiffuser.py) | *Joint Image-Text Gen*, *Text-to-Image*, *Image-to-Text*,
*Image Gen*, *Text Gen*, *Image Variation*, *Text Variation* | [🤗 Spaces](https://huggingface.co/spaces/thu-ml/unidiffuser) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/unidiffuser.ipynb) |
+
+## Usage Examples
+
+Because the UniDiffuser model is trained to model the joint distribution of (image, text) pairs, it is capable of performing a diverse range of generation tasks.
+
+### Unconditional Image and Text Generation
+
+Unconditional generation (where we start from only latents sampled from a standard Gaussian prior) from a [`UniDiffuserPipeline`] will produce an (image, text) pair:
+
+```python
+import torch
+
+from diffusers import UniDiffuserPipeline
+
+device = "cuda"
+model_id_or_path = "thu-ml/unidiffuser-v1"
+pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+pipe.to(device)
+
+# Unconditional image and text generation. The generation task is automatically inferred.
+sample = pipe(num_inference_steps=20, guidance_scale=8.0)
+image = sample.images[0]
+text = sample.text[0]
+image.save("unidiffuser_joint_sample_image.png")
+print(text)
+```
+
+This is also called "joint" generation in the UniDiffuser paper, since we are sampling from the joint image-text distribution.
+
+Note that the generation task is inferred from the inputs used when calling the pipeline.
+It is also possible to specify the unconditional generation task ("mode") manually with [`UniDiffuserPipeline.set_joint_mode`]:
+
+```python
+# Equivalent to the above.
+pipe.set_joint_mode()
+sample = pipe(num_inference_steps=20, guidance_scale=8.0)
+```
+
+When the mode is set manually, subsequent calls to the pipeline will use the set mode without attempting to infer the mode.
+You can reset the mode with [`UniDiffuserPipeline.reset_mode`], after which the pipeline will once again infer the mode.
+
+You can also generate only an image or only text (which the UniDiffuser paper calls "marginal" generation since we sample from the marginal distribution of images and text, respectively):
+
+```python
+# Unlike other generation tasks, image-only and text-only generation don't use classifier-free guidance
+# Image-only generation
+pipe.set_image_mode()
+sample_image = pipe(num_inference_steps=20).images[0]
+# Text-only generation
+pipe.set_text_mode()
+sample_text = pipe(num_inference_steps=20).text[0]
+```
+
+### Text-to-Image Generation
+
+UniDiffuser is also capable of sampling from conditional distributions; that is, the distribution of images conditioned on a text prompt or the distribution of texts conditioned on an image.
+Here is an example of sampling from the conditional image distribution (text-to-image generation or text-conditioned image generation):
+
+```python
+import torch
+
+from diffusers import UniDiffuserPipeline
+
+device = "cuda"
+model_id_or_path = "thu-ml/unidiffuser-v1"
+pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+pipe.to(device)
+
+# Text-to-image generation
+prompt = "an elephant under the sea"
+
+sample = pipe(prompt=prompt, num_inference_steps=20, guidance_scale=8.0)
+t2i_image = sample.images[0]
+t2i_image.save("unidiffuser_text2img_sample_image.png")
+```
+
+The `text2img` mode requires that either an input `prompt` or `prompt_embeds` be supplied. You can set the `text2img` mode manually with [`UniDiffuserPipeline.set_text_to_image_mode`].
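+For example, the following minimal sketch mirrors the `set_joint_mode` snippet above; once a mode is set manually, later calls reuse it instead of inferring it from the inputs:
+
+```python
+# Equivalent to passing `prompt` and letting the pipeline infer the text-to-image mode.
+pipe.set_text_to_image_mode()
+sample = pipe(prompt=prompt, num_inference_steps=20, guidance_scale=8.0)
+t2i_image = sample.images[0]
+```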
+
+### Image-to-Text Generation
+
+Similarly, UniDiffuser can also produce text samples given an image (image-to-text or image-conditioned text generation):
+
+```python
+import torch
+
+from diffusers import UniDiffuserPipeline
+from diffusers.utils import load_image
+
+device = "cuda"
+model_id_or_path = "thu-ml/unidiffuser-v1"
+pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+pipe.to(device)
+
+# Image-to-text generation
+image_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg"
+init_image = load_image(image_url).resize((512, 512))
+
+sample = pipe(image=init_image, num_inference_steps=20, guidance_scale=8.0)
+i2t_text = sample.text[0]
+print(i2t_text)
+```
+
+The `img2text` mode requires that an input `image` be supplied. You can set the `img2text` mode manually with [`UniDiffuserPipeline.set_image_to_text_mode`].
+
+### Image Variation
+
+The UniDiffuser authors suggest performing image variation through a "round-trip" generation method, where given an input image, we first perform an image-to-text generation, and then perform a text-to-image generation on the outputs of the first generation.
+This produces a new image which is semantically similar to the input image:
+
+```python
+import torch
+
+from diffusers import UniDiffuserPipeline
+from diffusers.utils import load_image
+
+device = "cuda"
+model_id_or_path = "thu-ml/unidiffuser-v1"
+pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+pipe.to(device)
+
+# Image variation can be performed with an image-to-text generation followed by a text-to-image generation:
+# 1. Image-to-text generation
+image_url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg"
+init_image = load_image(image_url).resize((512, 512))
+
+sample = pipe(image=init_image, num_inference_steps=20, guidance_scale=8.0)
+i2t_text = sample.text[0]
+print(i2t_text)
+
+# 2. Text-to-image generation
+sample = pipe(prompt=i2t_text, num_inference_steps=20, guidance_scale=8.0)
+final_image = sample.images[0]
+final_image.save("unidiffuser_image_variation_sample.png")
+```
+
+### Text Variation
+
+Similarly, text variation can be performed on an input prompt with a text-to-image generation followed by an image-to-text generation:
+
+```python
+import torch
+
+from diffusers import UniDiffuserPipeline
+
+device = "cuda"
+model_id_or_path = "thu-ml/unidiffuser-v1"
+pipe = UniDiffuserPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
+pipe.to(device)
+
+# Text variation can be performed with a text-to-image generation followed by an image-to-text generation:
+# 1. Text-to-image generation
+prompt = "an elephant under the sea"
+
+sample = pipe(prompt=prompt, num_inference_steps=20, guidance_scale=8.0)
+t2i_image = sample.images[0]
+t2i_image.save("unidiffuser_text2img_sample_image.png")
+
+# 2.
Image-to-text generation +sample = pipe(image=t2i_image, num_inference_steps=20, guidance_scale=8.0) +final_prompt = sample.text[0] +print(final_prompt) +``` + +## UniDiffuserPipeline +[[autodoc]] UniDiffuserPipeline + - all + - __call__ diff --git a/scripts/convert_unidiffuser_to_diffusers.py b/scripts/convert_unidiffuser_to_diffusers.py new file mode 100644 index 000000000000..891d289d8c76 --- /dev/null +++ b/scripts/convert_unidiffuser_to_diffusers.py @@ -0,0 +1,776 @@ +# Convert the original UniDiffuser checkpoints into diffusers equivalents. + +import argparse +from argparse import Namespace + +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, + GPT2Tokenizer, +) + +from diffusers import ( + AutoencoderKL, + DPMSolverMultistepScheduler, + UniDiffuserModel, + UniDiffuserPipeline, + UniDiffuserTextDecoder, +) + + +SCHEDULER_CONFIG = Namespace( + **{ + "beta_start": 0.00085, + "beta_end": 0.012, + "beta_schedule": "scaled_linear", + "solver_order": 3, + } +) + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments +def shave_segments(path, n_shave_prefix_segments=1): + """ + Removes segments. Positive values shave the first segments, negative shave the last segments. + """ + if n_shave_prefix_segments >= 0: + return ".".join(path.split(".")[n_shave_prefix_segments:]) + else: + return ".".join(path.split(".")[:n_shave_prefix_segments]) + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_resnet_paths +def renew_vae_resnet_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside resnets to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace("nin_shortcut", "conv_shortcut") + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.renew_vae_attention_paths +def renew_vae_attention_paths(old_list, n_shave_prefix_segments=0): + """ + Updates paths inside attentions to the new naming scheme (local renaming) + """ + mapping = [] + for old_item in old_list: + new_item = old_item + + new_item = new_item.replace("norm.weight", "group_norm.weight") + new_item = new_item.replace("norm.bias", "group_norm.bias") + + new_item = new_item.replace("q.weight", "query.weight") + new_item = new_item.replace("q.bias", "query.bias") + + new_item = new_item.replace("k.weight", "key.weight") + new_item = new_item.replace("k.bias", "key.bias") + + new_item = new_item.replace("v.weight", "value.weight") + new_item = new_item.replace("v.bias", "value.bias") + + new_item = new_item.replace("proj_out.weight", "proj_attn.weight") + new_item = new_item.replace("proj_out.bias", "proj_attn.bias") + + new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments) + + mapping.append({"old": old_item, "new": new_item}) + + return mapping + + +# Modified from diffusers.pipelines.stable_diffusion.convert_from_ckpt.assign_to_checkpoint +# config.num_head_channels => num_head_channels +def assign_to_checkpoint( + paths, + checkpoint, + old_checkpoint, + attention_paths_to_split=None, + additional_replacements=None, + num_head_channels=1, +): + """ + This does the final conversion step: take locally converted weights and apply a global renaming to 
them. It splits + attention layers, and takes into account additional replacements that may arise. Assigns the weights to the new + checkpoint. + """ + assert isinstance(paths, list), "Paths should be a list of dicts containing 'old' and 'new' keys." + + # Splits the attention layers into three variables. + if attention_paths_to_split is not None: + for path, path_map in attention_paths_to_split.items(): + old_tensor = old_checkpoint[path] + channels = old_tensor.shape[0] // 3 + + target_shape = (-1, channels) if len(old_tensor.shape) == 3 else (-1) + + num_heads = old_tensor.shape[0] // num_head_channels // 3 + + old_tensor = old_tensor.reshape((num_heads, 3 * channels // num_heads) + old_tensor.shape[1:]) + query, key, value = old_tensor.split(channels // num_heads, dim=1) + + checkpoint[path_map["query"]] = query.reshape(target_shape) + checkpoint[path_map["key"]] = key.reshape(target_shape) + checkpoint[path_map["value"]] = value.reshape(target_shape) + + for path in paths: + new_path = path["new"] + + # These have already been assigned + if attention_paths_to_split is not None and new_path in attention_paths_to_split: + continue + + # Global renaming happens here + new_path = new_path.replace("middle_block.0", "mid_block.resnets.0") + new_path = new_path.replace("middle_block.1", "mid_block.attentions.0") + new_path = new_path.replace("middle_block.2", "mid_block.resnets.1") + + if additional_replacements is not None: + for replacement in additional_replacements: + new_path = new_path.replace(replacement["old"], replacement["new"]) + + # proj_attn.weight has to be converted from conv 1D to linear + if "proj_attn.weight" in new_path: + checkpoint[new_path] = old_checkpoint[path["old"]][:, :, 0] + else: + checkpoint[new_path] = old_checkpoint[path["old"]] + + +# Copied from diffusers.pipelines.stable_diffusion.convert_from_ckpt.conv_attn_to_linear +def conv_attn_to_linear(checkpoint): + keys = list(checkpoint.keys()) + attn_keys = ["query.weight", "key.weight", "value.weight"] + for key in keys: + if ".".join(key.split(".")[-2:]) in attn_keys: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0, 0] + elif "proj_attn.weight" in key: + if checkpoint[key].ndim > 2: + checkpoint[key] = checkpoint[key][:, :, 0] + + +def create_vae_diffusers_config(config_type): + # Hardcoded for now + if args.config_type == "test": + vae_config = create_vae_diffusers_config_test() + elif args.config_type == "big": + vae_config = create_vae_diffusers_config_big() + else: + raise NotImplementedError( + f"Config type {config_type} is not implemented, currently only config types" + " 'test' and 'big' are available." + ) + return vae_config + + +def create_unidiffuser_unet_config(config_type, version): + # Hardcoded for now + if args.config_type == "test": + unet_config = create_unidiffuser_unet_config_test() + elif args.config_type == "big": + unet_config = create_unidiffuser_unet_config_big() + else: + raise NotImplementedError( + f"Config type {config_type} is not implemented, currently only config types" + " 'test' and 'big' are available." 
+ ) + # Unidiffuser-v1 uses data type embeddings + if version == 1: + unet_config["use_data_type_embedding"] = True + return unet_config + + +def create_text_decoder_config(config_type): + # Hardcoded for now + if args.config_type == "test": + text_decoder_config = create_text_decoder_config_test() + elif args.config_type == "big": + text_decoder_config = create_text_decoder_config_big() + else: + raise NotImplementedError( + f"Config type {config_type} is not implemented, currently only config types" + " 'test' and 'big' are available." + ) + return text_decoder_config + + +# Hardcoded configs for test versions of the UniDiffuser models, corresponding to those in the fast default tests. +def create_vae_diffusers_config_test(): + vae_config = { + "sample_size": 32, + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D"], + "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D"], + "block_out_channels": [32, 64], + "latent_channels": 4, + "layers_per_block": 1, + } + return vae_config + + +def create_unidiffuser_unet_config_test(): + unet_config = { + "text_dim": 32, + "clip_img_dim": 32, + "num_text_tokens": 77, + "num_attention_heads": 2, + "attention_head_dim": 8, + "in_channels": 4, + "out_channels": 4, + "num_layers": 2, + "dropout": 0.0, + "norm_num_groups": 32, + "attention_bias": False, + "sample_size": 16, + "patch_size": 2, + "activation_fn": "gelu", + "num_embeds_ada_norm": 1000, + "norm_type": "layer_norm", + "block_type": "unidiffuser", + "pre_layer_norm": False, + "use_timestep_embedding": False, + "norm_elementwise_affine": True, + "use_patch_pos_embed": False, + "ff_final_dropout": True, + "use_data_type_embedding": False, + } + return unet_config + + +def create_text_decoder_config_test(): + text_decoder_config = { + "prefix_length": 77, + "prefix_inner_dim": 32, + "prefix_hidden_dim": 32, + "vocab_size": 1025, # 1024 + 1 for new EOS token + "n_positions": 1024, + "n_embd": 32, + "n_layer": 5, + "n_head": 4, + "n_inner": 37, + "activation_function": "gelu", + "resid_pdrop": 0.1, + "embd_pdrop": 0.1, + "attn_pdrop": 0.1, + "layer_norm_epsilon": 1e-5, + "initializer_range": 0.02, + } + return text_decoder_config + + +# Hardcoded configs for the UniDiffuser V1 model at https://huggingface.co/thu-ml/unidiffuser-v1 +# See also https://github.com/thu-ml/unidiffuser/blob/main/configs/sample_unidiffuser_v1.py +def create_vae_diffusers_config_big(): + vae_config = { + "sample_size": 256, + "in_channels": 3, + "out_channels": 3, + "down_block_types": ["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], + "up_block_types": ["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + "block_out_channels": [128, 256, 512, 512], + "latent_channels": 4, + "layers_per_block": 2, + } + return vae_config + + +def create_unidiffuser_unet_config_big(): + unet_config = { + "text_dim": 64, + "clip_img_dim": 512, + "num_text_tokens": 77, + "num_attention_heads": 24, + "attention_head_dim": 64, + "in_channels": 4, + "out_channels": 4, + "num_layers": 30, + "dropout": 0.0, + "norm_num_groups": 32, + "attention_bias": False, + "sample_size": 64, + "patch_size": 2, + "activation_fn": "gelu", + "num_embeds_ada_norm": 1000, + "norm_type": "layer_norm", + "block_type": "unidiffuser", + "pre_layer_norm": False, + "use_timestep_embedding": False, + "norm_elementwise_affine": True, + "use_patch_pos_embed": False, + "ff_final_dropout": True, + "use_data_type_embedding": False, + } + return 
unet_config + + +# From https://huggingface.co/gpt2/blob/main/config.json, the GPT2 checkpoint used by UniDiffuser +def create_text_decoder_config_big(): + text_decoder_config = { + "prefix_length": 77, + "prefix_inner_dim": 768, + "prefix_hidden_dim": 64, + "vocab_size": 50258, # 50257 + 1 for new EOS token + "n_positions": 1024, + "n_embd": 768, + "n_layer": 12, + "n_head": 12, + "n_inner": 3072, + "activation_function": "gelu", + "resid_pdrop": 0.1, + "embd_pdrop": 0.1, + "attn_pdrop": 0.1, + "layer_norm_epsilon": 1e-5, + "initializer_range": 0.02, + } + return text_decoder_config + + +# Based on diffusers.pipelines.stable_diffusion.convert_from_ckpt.shave_segments.convert_ldm_vae_checkpoint +def convert_vae_to_diffusers(ckpt, diffusers_model, num_head_channels=1): + """ + Converts a UniDiffuser autoencoder_kl.pth checkpoint to a diffusers AutoencoderKL. + """ + # autoencoder_kl.pth ckpt is a torch state dict + vae_state_dict = torch.load(ckpt, map_location="cpu") + + new_checkpoint = {} + + new_checkpoint["encoder.conv_in.weight"] = vae_state_dict["encoder.conv_in.weight"] + new_checkpoint["encoder.conv_in.bias"] = vae_state_dict["encoder.conv_in.bias"] + new_checkpoint["encoder.conv_out.weight"] = vae_state_dict["encoder.conv_out.weight"] + new_checkpoint["encoder.conv_out.bias"] = vae_state_dict["encoder.conv_out.bias"] + new_checkpoint["encoder.conv_norm_out.weight"] = vae_state_dict["encoder.norm_out.weight"] + new_checkpoint["encoder.conv_norm_out.bias"] = vae_state_dict["encoder.norm_out.bias"] + + new_checkpoint["decoder.conv_in.weight"] = vae_state_dict["decoder.conv_in.weight"] + new_checkpoint["decoder.conv_in.bias"] = vae_state_dict["decoder.conv_in.bias"] + new_checkpoint["decoder.conv_out.weight"] = vae_state_dict["decoder.conv_out.weight"] + new_checkpoint["decoder.conv_out.bias"] = vae_state_dict["decoder.conv_out.bias"] + new_checkpoint["decoder.conv_norm_out.weight"] = vae_state_dict["decoder.norm_out.weight"] + new_checkpoint["decoder.conv_norm_out.bias"] = vae_state_dict["decoder.norm_out.bias"] + + new_checkpoint["quant_conv.weight"] = vae_state_dict["quant_conv.weight"] + new_checkpoint["quant_conv.bias"] = vae_state_dict["quant_conv.bias"] + new_checkpoint["post_quant_conv.weight"] = vae_state_dict["post_quant_conv.weight"] + new_checkpoint["post_quant_conv.bias"] = vae_state_dict["post_quant_conv.bias"] + + # Retrieves the keys for the encoder down blocks only + num_down_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "encoder.down" in layer}) + down_blocks = { + layer_id: [key for key in vae_state_dict if f"down.{layer_id}" in key] for layer_id in range(num_down_blocks) + } + + # Retrieves the keys for the decoder up blocks only + num_up_blocks = len({".".join(layer.split(".")[:3]) for layer in vae_state_dict if "decoder.up" in layer}) + up_blocks = { + layer_id: [key for key in vae_state_dict if f"up.{layer_id}" in key] for layer_id in range(num_up_blocks) + } + + for i in range(num_down_blocks): + resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key] + + if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict: + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.weight" + ) + new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.bias"] = vae_state_dict.pop( + f"encoder.down.{i}.downsample.conv.bias" + ) + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"down.{i}.block", "new": 
f"down_blocks.{i}.resnets"} + assign_to_checkpoint( + paths, + new_checkpoint, + vae_state_dict, + additional_replacements=[meta_path], + num_head_channels=num_head_channels, # not used in vae + ) + + mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"encoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint( + paths, + new_checkpoint, + vae_state_dict, + additional_replacements=[meta_path], + num_head_channels=num_head_channels, # not used in vae + ) + + mid_attentions = [key for key in vae_state_dict if "encoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + paths, + new_checkpoint, + vae_state_dict, + additional_replacements=[meta_path], + num_head_channels=num_head_channels, # not used in vae + ) + conv_attn_to_linear(new_checkpoint) + + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i + resnets = [ + key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key + ] + + if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict: + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.weight" + ] + new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.bias"] = vae_state_dict[ + f"decoder.up.{block_id}.upsample.conv.bias" + ] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"} + assign_to_checkpoint( + paths, + new_checkpoint, + vae_state_dict, + additional_replacements=[meta_path], + num_head_channels=num_head_channels, # not used in vae + ) + + mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key] + num_mid_res_blocks = 2 + for i in range(1, num_mid_res_blocks + 1): + resnets = [key for key in mid_resnets if f"decoder.mid.block_{i}" in key] + + paths = renew_vae_resnet_paths(resnets) + meta_path = {"old": f"mid.block_{i}", "new": f"mid_block.resnets.{i - 1}"} + assign_to_checkpoint( + paths, + new_checkpoint, + vae_state_dict, + additional_replacements=[meta_path], + num_head_channels=num_head_channels, # not used in vae + ) + + mid_attentions = [key for key in vae_state_dict if "decoder.mid.attn" in key] + paths = renew_vae_attention_paths(mid_attentions) + meta_path = {"old": "mid.attn_1", "new": "mid_block.attentions.0"} + assign_to_checkpoint( + paths, + new_checkpoint, + vae_state_dict, + additional_replacements=[meta_path], + num_head_channels=num_head_channels, # not used in vae + ) + conv_attn_to_linear(new_checkpoint) + + missing_keys, unexpected_keys = diffusers_model.load_state_dict(new_checkpoint) + for missing_key in missing_keys: + print(f"Missing key: {missing_key}") + for unexpected_key in unexpected_keys: + print(f"Unexpected key: {unexpected_key}") + + return diffusers_model + + +def convert_uvit_block_to_diffusers_block( + uvit_state_dict, + new_state_dict, + block_prefix, + new_prefix="transformer.transformer_", + skip_connection=False, +): + """ + Maps the keys in a UniDiffuser transformer block (`Block`) to the keys in a diffusers transformer block + (`UTransformerBlock`/`UniDiffuserBlock`). 
+ """ + prefix = new_prefix + block_prefix + if skip_connection: + new_state_dict[prefix + ".skip.skip_linear.weight"] = uvit_state_dict[block_prefix + ".skip_linear.weight"] + new_state_dict[prefix + ".skip.skip_linear.bias"] = uvit_state_dict[block_prefix + ".skip_linear.bias"] + new_state_dict[prefix + ".skip.norm.weight"] = uvit_state_dict[block_prefix + ".norm1.weight"] + new_state_dict[prefix + ".skip.norm.bias"] = uvit_state_dict[block_prefix + ".norm1.bias"] + + # Create the prefix string for out_blocks. + prefix += ".block" + + # Split up attention qkv.weight into to_q.weight, to_k.weight, to_v.weight + qkv = uvit_state_dict[block_prefix + ".attn.qkv.weight"] + new_attn_keys = [".attn1.to_q.weight", ".attn1.to_k.weight", ".attn1.to_v.weight"] + new_attn_keys = [prefix + key for key in new_attn_keys] + shape = qkv.shape[0] // len(new_attn_keys) + for i, attn_key in enumerate(new_attn_keys): + new_state_dict[attn_key] = qkv[i * shape : (i + 1) * shape] + + new_state_dict[prefix + ".attn1.to_out.0.weight"] = uvit_state_dict[block_prefix + ".attn.proj.weight"] + new_state_dict[prefix + ".attn1.to_out.0.bias"] = uvit_state_dict[block_prefix + ".attn.proj.bias"] + new_state_dict[prefix + ".norm1.weight"] = uvit_state_dict[block_prefix + ".norm2.weight"] + new_state_dict[prefix + ".norm1.bias"] = uvit_state_dict[block_prefix + ".norm2.bias"] + new_state_dict[prefix + ".ff.net.0.proj.weight"] = uvit_state_dict[block_prefix + ".mlp.fc1.weight"] + new_state_dict[prefix + ".ff.net.0.proj.bias"] = uvit_state_dict[block_prefix + ".mlp.fc1.bias"] + new_state_dict[prefix + ".ff.net.2.weight"] = uvit_state_dict[block_prefix + ".mlp.fc2.weight"] + new_state_dict[prefix + ".ff.net.2.bias"] = uvit_state_dict[block_prefix + ".mlp.fc2.bias"] + new_state_dict[prefix + ".norm3.weight"] = uvit_state_dict[block_prefix + ".norm3.weight"] + new_state_dict[prefix + ".norm3.bias"] = uvit_state_dict[block_prefix + ".norm3.bias"] + + return uvit_state_dict, new_state_dict + + +def convert_uvit_to_diffusers(ckpt, diffusers_model): + """ + Converts a UniDiffuser uvit_v*.pth checkpoint to a diffusers UniDiffusersModel. + """ + # uvit_v*.pth ckpt is a torch state dict + uvit_state_dict = torch.load(ckpt, map_location="cpu") + + new_state_dict = {} + + # Input layers + new_state_dict["vae_img_in.proj.weight"] = uvit_state_dict["patch_embed.proj.weight"] + new_state_dict["vae_img_in.proj.bias"] = uvit_state_dict["patch_embed.proj.bias"] + new_state_dict["clip_img_in.weight"] = uvit_state_dict["clip_img_embed.weight"] + new_state_dict["clip_img_in.bias"] = uvit_state_dict["clip_img_embed.bias"] + new_state_dict["text_in.weight"] = uvit_state_dict["text_embed.weight"] + new_state_dict["text_in.bias"] = uvit_state_dict["text_embed.bias"] + + new_state_dict["pos_embed"] = uvit_state_dict["pos_embed"] + + # Handle data type token embeddings for UniDiffuser-v1 + if "token_embedding.weight" in uvit_state_dict and diffusers_model.use_data_type_embedding: + new_state_dict["data_type_pos_embed_token"] = uvit_state_dict["pos_embed_token"] + new_state_dict["data_type_token_embedding.weight"] = uvit_state_dict["token_embedding.weight"] + + # Also initialize the PatchEmbedding in UTransformer2DModel with the PatchEmbedding from the checkpoint. + # This isn't used in the current implementation, so might want to remove. 
+ new_state_dict["transformer.pos_embed.proj.weight"] = uvit_state_dict["patch_embed.proj.weight"] + new_state_dict["transformer.pos_embed.proj.bias"] = uvit_state_dict["patch_embed.proj.bias"] + + # Output layers + new_state_dict["transformer.norm_out.weight"] = uvit_state_dict["norm.weight"] + new_state_dict["transformer.norm_out.bias"] = uvit_state_dict["norm.bias"] + + new_state_dict["vae_img_out.weight"] = uvit_state_dict["decoder_pred.weight"] + new_state_dict["vae_img_out.bias"] = uvit_state_dict["decoder_pred.bias"] + new_state_dict["clip_img_out.weight"] = uvit_state_dict["clip_img_out.weight"] + new_state_dict["clip_img_out.bias"] = uvit_state_dict["clip_img_out.bias"] + new_state_dict["text_out.weight"] = uvit_state_dict["text_out.weight"] + new_state_dict["text_out.bias"] = uvit_state_dict["text_out.bias"] + + # in_blocks + in_blocks_prefixes = {".".join(layer.split(".")[:2]) for layer in uvit_state_dict if "in_blocks" in layer} + for in_block_prefix in list(in_blocks_prefixes): + convert_uvit_block_to_diffusers_block(uvit_state_dict, new_state_dict, in_block_prefix) + + # mid_block + # Assume there's only one mid block + convert_uvit_block_to_diffusers_block(uvit_state_dict, new_state_dict, "mid_block") + + # out_blocks + out_blocks_prefixes = {".".join(layer.split(".")[:2]) for layer in uvit_state_dict if "out_blocks" in layer} + for out_block_prefix in list(out_blocks_prefixes): + convert_uvit_block_to_diffusers_block(uvit_state_dict, new_state_dict, out_block_prefix, skip_connection=True) + + missing_keys, unexpected_keys = diffusers_model.load_state_dict(new_state_dict) + for missing_key in missing_keys: + print(f"Missing key: {missing_key}") + for unexpected_key in unexpected_keys: + print(f"Unexpected key: {unexpected_key}") + + return diffusers_model + + +def convert_caption_decoder_to_diffusers(ckpt, diffusers_model): + """ + Converts a UniDiffuser caption_decoder.pth checkpoint to a diffusers UniDiffuserTextDecoder. + """ + # caption_decoder.pth ckpt is a torch state dict + checkpoint_state_dict = torch.load(ckpt, map_location="cpu") + decoder_state_dict = {} + # Remove the "module." prefix, if necessary + caption_decoder_key = "module." 
+ for key in checkpoint_state_dict: + if key.startswith(caption_decoder_key): + decoder_state_dict[key.replace(caption_decoder_key, "")] = checkpoint_state_dict.get(key) + else: + decoder_state_dict[key] = checkpoint_state_dict.get(key) + + new_state_dict = {} + + # Encoder and Decoder + new_state_dict["encode_prefix.weight"] = decoder_state_dict["encode_prefix.weight"] + new_state_dict["encode_prefix.bias"] = decoder_state_dict["encode_prefix.bias"] + new_state_dict["decode_prefix.weight"] = decoder_state_dict["decode_prefix.weight"] + new_state_dict["decode_prefix.bias"] = decoder_state_dict["decode_prefix.bias"] + + # Internal GPT2LMHeadModel transformer model + for key, val in decoder_state_dict.items(): + if key.startswith("gpt"): + suffix = key[len("gpt") :] + new_state_dict["transformer" + suffix] = val + + missing_keys, unexpected_keys = diffusers_model.load_state_dict(new_state_dict) + for missing_key in missing_keys: + print(f"Missing key: {missing_key}") + for unexpected_key in unexpected_keys: + print(f"Unexpected key: {unexpected_key}") + + return diffusers_model + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--caption_decoder_checkpoint_path", + default=None, + type=str, + required=False, + help="Path to caption decoder checkpoint to convert.", + ) + parser.add_argument( + "--uvit_checkpoint_path", default=None, type=str, required=False, help="Path to U-ViT checkpoint to convert." + ) + parser.add_argument( + "--vae_checkpoint_path", + default=None, + type=str, + required=False, + help="Path to VAE checkpoint to convert.", + ) + parser.add_argument( + "--pipeline_output_path", + default=None, + type=str, + required=True, + help="Path to save the output pipeline to.", + ) + parser.add_argument( + "--config_type", + default="test", + type=str, + help=( + "Config type to use. Should be 'test' to create small models for testing or 'big' to convert a full" + " checkpoint." + ), + ) + parser.add_argument( + "--version", + default=0, + type=int, + help="The UniDiffuser model type to convert to. Should be 0 for UniDiffuser-v0 and 1 for UniDiffuser-v1.", + ) + + args = parser.parse_args() + + # Convert the VAE model. + if args.vae_checkpoint_path is not None: + vae_config = create_vae_diffusers_config(args.config_type) + vae = AutoencoderKL(**vae_config) + vae = convert_vae_to_diffusers(args.vae_checkpoint_path, vae) + + # Convert the U-ViT ("unet") model. + if args.uvit_checkpoint_path is not None: + unet_config = create_unidiffuser_unet_config(args.config_type, args.version) + unet = UniDiffuserModel(**unet_config) + unet = convert_uvit_to_diffusers(args.uvit_checkpoint_path, unet) + + # Convert the caption decoder ("text_decoder") model. + if args.caption_decoder_checkpoint_path is not None: + text_decoder_config = create_text_decoder_config(args.config_type) + text_decoder = UniDiffuserTextDecoder(**text_decoder_config) + text_decoder = convert_caption_decoder_to_diffusers(args.caption_decoder_checkpoint_path, text_decoder) + + # Scheduler is the same for both the test and big models. 
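+    # (Note: the beta_start/beta_end values with the "scaled_linear" schedule below are the noise-schedule
+    # settings commonly used by Stable Diffusion-style latent models; solver_order=3 selects third-order
+    # multistep DPM-Solver updates.)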
+ scheduler_config = SCHEDULER_CONFIG + scheduler = DPMSolverMultistepScheduler( + beta_start=scheduler_config.beta_start, + beta_end=scheduler_config.beta_end, + beta_schedule=scheduler_config.beta_schedule, + solver_order=scheduler_config.solver_order, + ) + + if args.config_type == "test": + # Make a small random CLIPTextModel + torch.manual_seed(0) + clip_text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(clip_text_encoder_config) + clip_tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + # Make a small random CLIPVisionModel and accompanying CLIPImageProcessor + torch.manual_seed(0) + clip_image_encoder_config = CLIPVisionConfig( + image_size=32, + patch_size=2, + num_channels=3, + hidden_size=32, + projection_dim=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + ) + image_encoder = CLIPVisionModelWithProjection(clip_image_encoder_config) + image_processor = CLIPImageProcessor(crop_size=32, size=32) + + # Note that the text_decoder should already have its token embeddings resized. + text_tokenizer = GPT2Tokenizer.from_pretrained("hf-internal-testing/tiny-random-GPT2Model") + eos = "<|EOS|>" + special_tokens_dict = {"eos_token": eos} + text_tokenizer.add_special_tokens(special_tokens_dict) + elif args.config_type == "big": + text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14") + clip_tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + + image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32") + image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-base-patch32") + + # Note that the text_decoder should already have its token embeddings resized. + text_tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + eos = "<|EOS|>" + special_tokens_dict = {"eos_token": eos} + text_tokenizer.add_special_tokens(special_tokens_dict) + else: + raise NotImplementedError( + f"Config type {args.config_type} is not implemented, currently only config types" + " 'test' and 'big' are available." 
+ ) + + pipeline = UniDiffuserPipeline( + vae=vae, + text_encoder=text_encoder, + image_encoder=image_encoder, + image_processor=image_processor, + clip_tokenizer=clip_tokenizer, + text_decoder=text_decoder, + text_tokenizer=text_tokenizer, + unet=unet, + scheduler=scheduler, + ) + pipeline.save_pretrained(args.pipeline_output_path) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f6d8c254d157..402f6eaa749a 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -129,6 +129,7 @@ IFInpaintingSuperResolutionPipeline, IFPipeline, IFSuperResolutionPipeline, + ImageTextPipelineOutput, KandinskyImg2ImgPipeline, KandinskyInpaintPipeline, KandinskyPipeline, @@ -161,6 +162,9 @@ TextToVideoZeroPipeline, UnCLIPImageVariationPipeline, UnCLIPPipeline, + UniDiffuserModel, + UniDiffuserPipeline, + UniDiffuserTextDecoder, VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index bb3fc5d04cb6..9e68538f233c 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -89,6 +89,7 @@ from .stable_diffusion_safe import StableDiffusionPipelineSafe from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline + from .unidiffuser import ImageTextPipelineOutput, UniDiffuserModel, UniDiffuserPipeline, UniDiffuserTextDecoder from .versatile_diffusion import ( VersatileDiffusionDualGuidedPipeline, VersatileDiffusionImageVariationPipeline, diff --git a/src/diffusers/pipelines/unidiffuser/__init__.py b/src/diffusers/pipelines/unidiffuser/__init__.py new file mode 100644 index 000000000000..a774e3274030 --- /dev/null +++ b/src/diffusers/pipelines/unidiffuser/__init__.py @@ -0,0 +1,20 @@ +from ...utils import ( + OptionalDependencyNotAvailable, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ( + ImageTextPipelineOutput, + UniDiffuserPipeline, + ) +else: + from .modeling_text_decoder import UniDiffuserTextDecoder + from .modeling_uvit import UniDiffuserModel, UTransformer2DModel + from .pipeline_unidiffuser import ImageTextPipelineOutput, UniDiffuserPipeline diff --git a/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py new file mode 100644 index 000000000000..febc8e09e6ab --- /dev/null +++ b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py @@ -0,0 +1,294 @@ +from typing import Optional + +import numpy as np +import torch +from torch import nn +from transformers import GPT2Config, GPT2LMHeadModel +from transformers.modeling_utils import ModuleUtilsMixin + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin + + +# Modified from ClipCaptionModel in https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py +class UniDiffuserTextDecoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): + """ + Text decoder model for a image-text [UniDiffuser](https://arxiv.org/pdf/2303.06555.pdf) model. This is used to + generate text from the UniDiffuser image-text embedding. 
+ + Parameters: + prefix_length (`int`): + Max number of prefix tokens that will be supplied to the model. + prefix_inner_dim (`int`): + The hidden size of the the incoming prefix embeddings. For UniDiffuser, this would be the hidden dim of the + CLIP text encoder. + prefix_hidden_dim (`int`, *optional*): + Hidden dim of the MLP if we encode the prefix. + vocab_size (`int`, *optional*, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`GPT2Model`] or [`TFGPT2Model`]. + n_positions (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_embd (`int`, *optional*, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (`int`, *optional*, defaults to None): + Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd + activation_function (`str`, *optional*, defaults to `"gelu"`): + Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + scale_attn_weights (`bool`, *optional*, defaults to `True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): + Whether to additionally scale attention weights by `1 / layer_idx + 1`. + reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): + Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention + dot-product/softmax to float() when training with mixed precision. 
+ """ + + @register_to_config + def __init__( + self, + prefix_length: int, + prefix_inner_dim: int, + prefix_hidden_dim: Optional[int] = None, + vocab_size: int = 50257, # Start of GPT2 config args + n_positions: int = 1024, + n_embd: int = 768, + n_layer: int = 12, + n_head: int = 12, + n_inner: Optional[int] = None, + activation_function: str = "gelu_new", + resid_pdrop: float = 0.1, + embd_pdrop: float = 0.1, + attn_pdrop: float = 0.1, + layer_norm_epsilon: float = 1e-5, + initializer_range: float = 0.02, + scale_attn_weights: bool = True, + use_cache: bool = True, + scale_attn_by_inverse_layer_idx: bool = False, + reorder_and_upcast_attn: bool = False, + ): + super().__init__() + + self.prefix_length = prefix_length + + if prefix_inner_dim != n_embd and prefix_hidden_dim is None: + raise ValueError( + f"`prefix_hidden_dim` cannot be `None` when `prefix_inner_dim`: {prefix_hidden_dim} and" + f" `n_embd`: {n_embd} are not equal." + ) + + self.prefix_inner_dim = prefix_inner_dim + self.prefix_hidden_dim = prefix_hidden_dim + + self.encode_prefix = ( + nn.Linear(self.prefix_inner_dim, self.prefix_hidden_dim) + if self.prefix_hidden_dim is not None + else nn.Identity() + ) + self.decode_prefix = ( + nn.Linear(self.prefix_hidden_dim, n_embd) if self.prefix_hidden_dim is not None else nn.Identity() + ) + + gpt_config = GPT2Config( + vocab_size=vocab_size, + n_positions=n_positions, + n_embd=n_embd, + n_layer=n_layer, + n_head=n_head, + n_inner=n_inner, + activation_function=activation_function, + resid_pdrop=resid_pdrop, + embd_pdrop=embd_pdrop, + attn_pdrop=attn_pdrop, + layer_norm_epsilon=layer_norm_epsilon, + initializer_range=initializer_range, + scale_attn_weights=scale_attn_weights, + use_cache=use_cache, + scale_attn_by_inverse_layer_idx=scale_attn_by_inverse_layer_idx, + reorder_and_upcast_attn=reorder_and_upcast_attn, + ) + self.transformer = GPT2LMHeadModel(gpt_config) + + def forward( + self, + input_ids: torch.Tensor, + prefix_embeds: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + ): + """ + Args: + input_ids (`torch.Tensor` of shape `(N, max_seq_len)`): + Text tokens to use for inference. + prefix_embeds (`torch.Tensor` of shape `(N, prefix_length, 768)`): + Prefix embedding to preprend to the embedded tokens. + attention_mask (`torch.Tensor` of shape `(N, prefix_length + max_seq_len, 768)`, *optional*): + Attention mask for the prefix embedding. + labels (`torch.Tensor`, *optional*): + Labels to use for language modeling. + """ + embedding_text = self.transformer.transformer.wte(input_ids) + hidden = self.encode_prefix(prefix_embeds) + prefix_embeds = self.decode_prefix(hidden) + embedding_cat = torch.cat((prefix_embeds, embedding_text), dim=1) + + if labels is not None: + dummy_token = self.get_dummy_token(input_ids.shape[0], input_ids.device) + labels = torch.cat((dummy_token, input_ids), dim=1) + out = self.transformer(inputs_embeds=embedding_cat, labels=labels, attention_mask=attention_mask) + if self.prefix_hidden_dim is not None: + return out, hidden + else: + return out + + def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor: + return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device) + + def encode(self, prefix): + return self.encode_prefix(prefix) + + @torch.no_grad() + def generate_captions(self, features, eos_token_id, device): + """ + Generate captions given text embedding features. Returns list[L]. 
+ + Args: + features (`torch.Tensor` of shape `(B, L, D)`): + Text embedding features to generate captions from. + eos_token_id (`int`): + The token ID of the EOS token for the text decoder model. + device: + Device to perform text generation on. + + Returns: + `List[str]`: A list of strings generated from the decoder model. + """ + + features = torch.split(features, 1, dim=0) + generated_tokens = [] + generated_seq_lengths = [] + for feature in features: + feature = self.decode_prefix(feature.to(device)) # back to the clip feature + # Only support beam search for now + output_tokens, seq_lengths = self.generate_beam( + input_embeds=feature, device=device, eos_token_id=eos_token_id + ) + generated_tokens.append(output_tokens[0]) + generated_seq_lengths.append(seq_lengths[0]) + generated_tokens = torch.stack(generated_tokens) + generated_seq_lengths = torch.stack(generated_seq_lengths) + return generated_tokens, generated_seq_lengths + + @torch.no_grad() + def generate_beam( + self, + input_ids=None, + input_embeds=None, + device=None, + beam_size: int = 5, + entry_length: int = 67, + temperature: float = 1.0, + eos_token_id: Optional[int] = None, + ): + """ + Generates text using the given tokenizer and text prompt or token embedding via beam search. This + implementation is based on the beam search implementation from the [original UniDiffuser + code](https://github.com/thu-ml/unidiffuser/blob/main/libs/caption_decoder.py#L89). + + Args: + eos_token_id (`int`, *optional*): + The token ID of the EOS token for the text decoder model. + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*): + Tokenizer indices of input sequence tokens in the vocabulary. One of `input_ids` and `input_embeds` + must be supplied. + input_embeds (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + An embedded representation to directly pass to the transformer as a prefix for beam search. One of + `input_ids` and `input_embeds` must be supplied. + device: + The device to perform beam search on. + beam_size (`int`, *optional*, defaults to `5`): + The number of best states to store during beam search. + entry_length (`int`, *optional*, defaults to `67`): + The number of iterations to run beam search. + temperature (`float`, *optional*, defaults to 1.0): + The temperature to use when performing the softmax over logits from the decoding model. + + Returns: + `Tuple(torch.Tensor, torch.Tensor)`: A tuple of tensors where the first element is a tensor of generated + token sequences sorted by score in descending order, and the second element is the sequence lengths + corresponding to those sequences. + """ + # Generates text until stop_token is reached using beam search with the desired beam size. 
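+        # Beam-search bookkeeping used by the loop below:
+        #   tokens      - token ids generated so far for each beam
+        #   scores      - cumulative log-probability of each beam (length-normalized when ranking candidates)
+        #   seq_lengths - number of generated tokens per beam
+        #   is_stopped  - marks beams that have already emitted the EOS token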
+ stop_token_index = eos_token_id + tokens = None + scores = None + seq_lengths = torch.ones(beam_size, device=device, dtype=torch.int) + is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool) + + if input_embeds is not None: + generated = input_embeds + else: + generated = self.transformer.transformer.wte(input_ids) + + for i in range(entry_length): + outputs = self.transformer(inputs_embeds=generated) + logits = outputs.logits + logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0) + logits = logits.softmax(-1).log() + + if scores is None: + scores, next_tokens = logits.topk(beam_size, -1) + generated = generated.expand(beam_size, *generated.shape[1:]) + next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0) + if tokens is None: + tokens = next_tokens + else: + tokens = tokens.expand(beam_size, *tokens.shape[1:]) + tokens = torch.cat((tokens, next_tokens), dim=1) + else: + logits[is_stopped] = -float(np.inf) + logits[is_stopped, 0] = 0 + scores_sum = scores[:, None] + logits + seq_lengths[~is_stopped] += 1 + scores_sum_average = scores_sum / seq_lengths[:, None] + scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(beam_size, -1) + next_tokens_source = next_tokens // scores_sum.shape[1] + seq_lengths = seq_lengths[next_tokens_source] + next_tokens = next_tokens % scores_sum.shape[1] + next_tokens = next_tokens.unsqueeze(1) + tokens = tokens[next_tokens_source] + tokens = torch.cat((tokens, next_tokens), dim=1) + generated = generated[next_tokens_source] + scores = scores_sum_average * seq_lengths + is_stopped = is_stopped[next_tokens_source] + + next_token_embed = self.transformer.transformer.wte(next_tokens.squeeze()).view(generated.shape[0], 1, -1) + generated = torch.cat((generated, next_token_embed), dim=1) + is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze() + if is_stopped.all(): + break + + scores = scores / seq_lengths + order = scores.argsort(descending=True) + # tokens tensors are already padded to max_seq_length + output_texts = [tokens[i] for i in order] + output_texts = torch.stack(output_texts, dim=0) + seq_lengths = torch.tensor([seq_lengths[i] for i in order], dtype=seq_lengths.dtype) + return output_texts, seq_lengths diff --git a/src/diffusers/pipelines/unidiffuser/modeling_uvit.py b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py new file mode 100644 index 000000000000..b7829f76ec12 --- /dev/null +++ b/src/diffusers/pipelines/unidiffuser/modeling_uvit.py @@ -0,0 +1,1196 @@ +import math +from typing import Optional, Union + +import torch +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin +from ...models.attention import AdaLayerNorm, FeedForward +from ...models.attention_processor import Attention +from ...models.embeddings import TimestepEmbedding, Timesteps, get_2d_sincos_pos_embed +from ...models.transformer_2d import Transformer2DModelOutput +from ...utils import logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + logger.warning( + "mean is more than 2 std from [a, 
b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect." + ) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + # type: (torch.Tensor, float, float, float, float) -> torch.Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the normal distribution :math:`\mathcal{N}(\text{mean}, + \text{std}^2)` with values outside :math:`[a, b]` redrawn until they are within the bounds. The method used for + generating the random values works best when :math:`a \leq \text{mean} \leq b`. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +class PatchEmbed(nn.Module): + """2D Image to Patch Embedding""" + + def __init__( + self, + height=224, + width=224, + patch_size=16, + in_channels=3, + embed_dim=768, + layer_norm=False, + flatten=True, + bias=True, + use_pos_embed=True, + ): + super().__init__() + + num_patches = (height // patch_size) * (width // patch_size) + self.flatten = flatten + self.layer_norm = layer_norm + + self.proj = nn.Conv2d( + in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias + ) + if layer_norm: + self.norm = nn.LayerNorm(embed_dim, elementwise_affine=False, eps=1e-6) + else: + self.norm = None + + self.use_pos_embed = use_pos_embed + if self.use_pos_embed: + pos_embed = get_2d_sincos_pos_embed(embed_dim, int(num_patches**0.5)) + self.register_buffer("pos_embed", torch.from_numpy(pos_embed).float().unsqueeze(0), persistent=False) + + def forward(self, latent): + latent = self.proj(latent) + if self.flatten: + latent = latent.flatten(2).transpose(1, 2) # BCHW -> BNC + if self.layer_norm: + latent = self.norm(latent) + if self.use_pos_embed: + return latent + self.pos_embed + else: + return latent + + +class SkipBlock(nn.Module): + def __init__(self, dim: int): + super().__init__() + + self.skip_linear = nn.Linear(2 * dim, dim) + + # Use torch.nn.LayerNorm for now, following the original code + self.norm = nn.LayerNorm(dim) + + def forward(self, x, skip): + x = self.skip_linear(torch.cat([x, skip], dim=-1)) + x = self.norm(x) + + return x + + +# Modified to support both pre-LayerNorm and post-LayerNorm configurations +# Don't support AdaLayerNormZero for now +# Modified from diffusers.models.attention.BasicTransformerBlock +class UTransformerBlock(nn.Module): + r""" + A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations. + + Parameters: + dim (`int`): The number of channels in the input and output. 
+ num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + activation_fn (`str`, *optional*, defaults to `"geglu"`): + Activation function to be used in feed-forward. + num_embeds_ada_norm (:obj: `int`, *optional*): + The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (:obj: `bool`, *optional*, defaults to `False`): + Configure if the attentions should contain a bias parameter. + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + upcast_attention (`bool`, *optional*): + Whether to upcast the query and key to float32 when performing the attention calculation. + norm_elementwise_affine (`bool`, *optional*): + Whether to use learnable per-element affine parameters during layer normalization. + norm_type (`str`, defaults to `"layer_norm"`): + The layer norm implementation to use. + pre_layer_norm (`bool`, *optional*): + Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"), + as opposed to after ("post-LayerNorm"). Note that `BasicTransformerBlock` uses pre-LayerNorm, e.g. + `pre_layer_norm = True`. + final_dropout (`bool`, *optional*): + Whether to use a final Dropout layer after the feedforward network. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", + pre_layer_norm: bool = True, + final_dropout: bool = False, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + + self.pre_layer_norm = pre_layer_norm + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." + ) + + # 1. Self-Attn + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) + + # 2. 
Cross-Attn + if cross_attention_dim is not None or double_self_attention: + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) # is self-attn if encoder_hidden_states is none + else: + self.attn2 = None + + if self.use_ada_layer_norm: + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + ) + else: + self.norm2 = None + + # 3. Feed-forward + self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + timestep=None, + cross_attention_kwargs=None, + class_labels=None, + ): + # Pre-LayerNorm + if self.pre_layer_norm: + if self.use_ada_layer_norm: + norm_hidden_states = self.norm1(hidden_states, timestep) + else: + norm_hidden_states = self.norm1(hidden_states) + else: + norm_hidden_states = hidden_states + + # 1. Self-Attention + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + # Post-LayerNorm + if not self.pre_layer_norm: + if self.use_ada_layer_norm: + attn_output = self.norm1(attn_output, timestep) + else: + attn_output = self.norm1(attn_output) + + hidden_states = attn_output + hidden_states + + if self.attn2 is not None: + # Pre-LayerNorm + if self.pre_layer_norm: + norm_hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) + else: + norm_hidden_states = hidden_states + # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly + # prepare attention mask here + + # 2. Cross-Attention + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + + # Post-LayerNorm + if not self.pre_layer_norm: + attn_output = self.norm2(attn_output, timestep) if self.use_ada_layer_norm else self.norm2(attn_output) + + hidden_states = attn_output + hidden_states + + # 3. 
Feed-forward + # Pre-LayerNorm + if self.pre_layer_norm: + norm_hidden_states = self.norm3(hidden_states) + else: + norm_hidden_states = hidden_states + + ff_output = self.ff(norm_hidden_states) + + # Post-LayerNorm + if not self.pre_layer_norm: + ff_output = self.norm3(ff_output) + + hidden_states = ff_output + hidden_states + + return hidden_states + + +# Like UTransformerBlock except with LayerNorms on the residual backbone of the block +# Modified from diffusers.models.attention.BasicTransformerBlock +class UniDiffuserBlock(nn.Module): + r""" + A modification of BasicTransformerBlock which supports pre-LayerNorm and post-LayerNorm configurations and puts the + LayerNorms on the residual backbone of the block. This matches the transformer block in the [original UniDiffuser + implementation](https://github.com/thu-ml/unidiffuser/blob/main/libs/uvit_multi_post_ln_v1.py#L104). + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + activation_fn (`str`, *optional*, defaults to `"geglu"`): + Activation function to be used in feed-forward. + num_embeds_ada_norm (:obj: `int`, *optional*): + The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (:obj: `bool`, *optional*, defaults to `False`): + Configure if the attentions should contain a bias parameter. + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + upcast_attention (`bool`, *optional*): + Whether to upcast the query and key to float() when performing the attention calculation. + norm_elementwise_affine (`bool`, *optional*): + Whether to use learnable per-element affine parameters during layer normalization. + norm_type (`str`, defaults to `"layer_norm"`): + The layer norm implementation to use. + pre_layer_norm (`bool`, *optional*): + Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"), + as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm + (`pre_layer_norm = False`). + final_dropout (`bool`, *optional*): + Whether to use a final Dropout layer after the feedforward network. 
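+
+    A minimal construction sketch (the sizes below are illustrative assumptions; in this codebase the block is
+    created internally by `UTransformer2DModel` rather than instantiated directly):
+
+        >>> import torch
+        >>> # hypothetical sizes, chosen so that dim == num_attention_heads * attention_head_dim
+        >>> block = UniDiffuserBlock(dim=1536, num_attention_heads=24, attention_head_dim=64)
+        >>> out = block(torch.randn(2, 16, 1536))  # (batch, num_tokens, dim) -> same shape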
+ """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", + pre_layer_norm: bool = False, + final_dropout: bool = True, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + + self.pre_layer_norm = pre_layer_norm + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." + ) + + # 1. Self-Attn + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None or double_self_attention: + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) # is self-attn if encoder_hidden_states is none + else: + self.attn2 = None + + if self.use_ada_layer_norm: + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + ) + else: + self.norm2 = None + + # 3. Feed-forward + self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) + self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) + + def forward( + self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + timestep=None, + cross_attention_kwargs=None, + class_labels=None, + ): + # Following the diffusers transformer block implementation, put the LayerNorm on the + # residual backbone + # Pre-LayerNorm + if self.pre_layer_norm: + if self.use_ada_layer_norm: + hidden_states = self.norm1(hidden_states, timestep) + else: + hidden_states = self.norm1(hidden_states) + + # 1. 
Self-Attention + cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + attn_output = self.attn1( + hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + hidden_states = attn_output + hidden_states + + # Following the diffusers transformer block implementation, put the LayerNorm on the + # residual backbone + # Post-LayerNorm + if not self.pre_layer_norm: + if self.use_ada_layer_norm: + hidden_states = self.norm1(hidden_states, timestep) + else: + hidden_states = self.norm1(hidden_states) + + if self.attn2 is not None: + # Pre-LayerNorm + if self.pre_layer_norm: + hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) + # TODO (Birch-San): Here we should prepare the encoder_attention mask correctly + # prepare attention mask here + + # 2. Cross-Attention + attn_output = self.attn2( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + + hidden_states = attn_output + hidden_states + + # Post-LayerNorm + if not self.pre_layer_norm: + hidden_states = ( + self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) + ) + + # 3. Feed-forward + # Pre-LayerNorm + if self.pre_layer_norm: + hidden_states = self.norm3(hidden_states) + + ff_output = self.ff(hidden_states) + + hidden_states = ff_output + hidden_states + + # Post-LayerNorm + if not self.pre_layer_norm: + hidden_states = self.norm3(hidden_states) + + return hidden_states + + +# Modified from diffusers.models.transformer_2d.Transformer2DModel +# Modify the transformer block structure to be U-Net like following U-ViT +# Only supports patch-style input and torch.nn.LayerNorm currently +# https://github.com/baofff/U-ViT +class UTransformer2DModel(ModelMixin, ConfigMixin): + """ + Transformer model based on the [U-ViT](https://github.com/baofff/U-ViT) architecture for image-like data. Compared + to [`Transformer2DModel`], this model has skip connections between transformer blocks in a "U"-shaped fashion, + similar to a U-Net. Supports only continuous (actual embeddings) inputs, which are embedded via a [`PatchEmbed`] + layer and then reshaped to (b, t, d). + + Parameters: + num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. + attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. + in_channels (`int`, *optional*): + Pass if the input is continuous. The number of channels in the input. + out_channels (`int`, *optional*): + The number of output channels; if `None`, defaults to `in_channels`. + num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + norm_num_groups (`int`, *optional*, defaults to `32`): + The number of groups to use when performing Group Normalization. + cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. + attention_bias (`bool`, *optional*): + Configure if the TransformerBlocks' attention should contain a bias parameter. + sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. + Note that this is fixed at training time as it is used for learning a number of position embeddings. 
See + `ImagePositionalEmbeddings`. + num_vector_embeds (`int`, *optional*): + Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. + Includes the class for the masked latent pixel. + patch_size (`int`, *optional*, defaults to 2): + The patch size to use in the patch embedding. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. + The number of diffusion steps used during training. Note that this is fixed at training time as it is used + to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for + up to but not more than steps than `num_embeds_ada_norm`. + use_linear_projection (int, *optional*): TODO: Not used + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used in each + transformer block. + upcast_attention (`bool`, *optional*): + Whether to upcast the query and key to float() when performing the attention calculation. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`. + block_type (`str`, *optional*, defaults to `"unidiffuser"`): + The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual + backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard + behavior in `diffusers`.) + pre_layer_norm (`bool`, *optional*): + Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"), + as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm + (`pre_layer_norm = False`). + norm_elementwise_affine (`bool`, *optional*): + Whether to use learnable per-element affine parameters during layer normalization. + use_patch_pos_embed (`bool`, *optional*): + Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`). + final_dropout (`bool`, *optional*): + Whether to use a final Dropout layer after the feedforward network. + """ + + @register_to_config + def __init__( + self, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = 2, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", + block_type: str = "unidiffuser", + pre_layer_norm: bool = False, + norm_elementwise_affine: bool = True, + use_patch_pos_embed=False, + ff_final_dropout: bool = False, + ): + super().__init__() + self.use_linear_projection = use_linear_projection + self.num_attention_heads = num_attention_heads + self.attention_head_dim = attention_head_dim + inner_dim = num_attention_heads * attention_head_dim + + # 1. 
Input + # Only support patch input of shape (batch_size, num_channels, height, width) for now + assert in_channels is not None and patch_size is not None, "Patch input requires in_channels and patch_size." + + assert sample_size is not None, "UTransformer2DModel over patched input must provide sample_size" + + # 2. Define input layers + self.height = sample_size + self.width = sample_size + + self.patch_size = patch_size + self.pos_embed = PatchEmbed( + height=sample_size, + width=sample_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=inner_dim, + use_pos_embed=use_patch_pos_embed, + ) + + # 3. Define transformers blocks + # Modify this to have in_blocks ("downsample" blocks, even though we don't actually downsample), a mid_block, + # and out_blocks ("upsample" blocks). Like a U-Net, there are skip connections from in_blocks to out_blocks in + # a "U"-shaped fashion (e.g. first in_block to last out_block, etc.). + # Quick hack to make the transformer block type configurable + if block_type == "unidiffuser": + block_cls = UniDiffuserBlock + else: + block_cls = UTransformerBlock + self.transformer_in_blocks = nn.ModuleList( + [ + block_cls( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + pre_layer_norm=pre_layer_norm, + norm_elementwise_affine=norm_elementwise_affine, + final_dropout=ff_final_dropout, + ) + for d in range(num_layers // 2) + ] + ) + + self.transformer_mid_block = block_cls( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + pre_layer_norm=pre_layer_norm, + norm_elementwise_affine=norm_elementwise_affine, + final_dropout=ff_final_dropout, + ) + + # For each skip connection, we use a SkipBlock (concatenation + Linear + LayerNorm) to process the inputs + # before each transformer out_block. + self.transformer_out_blocks = nn.ModuleList( + [ + nn.ModuleDict( + { + "skip": SkipBlock( + inner_dim, + ), + "block": block_cls( + inner_dim, + num_attention_heads, + attention_head_dim, + dropout=dropout, + cross_attention_dim=cross_attention_dim, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + attention_bias=attention_bias, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + pre_layer_norm=pre_layer_norm, + norm_elementwise_affine=norm_elementwise_affine, + final_dropout=ff_final_dropout, + ), + } + ) + for d in range(num_layers // 2) + ] + ) + + # 4. 
Define output layers + self.out_channels = in_channels if out_channels is None else out_channels + + # Following the UniDiffuser U-ViT implementation, we process the transformer output with + # a LayerNorm layer with per-element affine params + self.norm_out = nn.LayerNorm(inner_dim) + + def forward( + self, + hidden_states, + encoder_hidden_states=None, + timestep=None, + class_labels=None, + cross_attention_kwargs=None, + return_dict: bool = True, + hidden_states_is_embedding: bool = False, + unpatchify: bool = True, + ): + """ + Args: + hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. + When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input + hidden_states + encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + timestep ( `torch.long`, *optional*): + Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. + class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*): + Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels + conditioning. + cross_attention_kwargs (*optional*): + Keyword arguments to supply to the cross attention layers, if used. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + hidden_states_is_embedding (`bool`, *optional*, defaults to `False`): + Whether or not hidden_states is an embedding directly usable by the transformer. In this case we will + ignore input handling (e.g. continuous, vectorized, etc.) and directly feed hidden_states into the + transformer blocks. + unpatchify (`bool`, *optional*, defaults to `True`): + Whether to unpatchify the transformer output. + + Returns: + [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`: + [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is the sample tensor. + """ + # 0. Check inputs + + if not unpatchify and return_dict: + raise ValueError( + f"Cannot both define `unpatchify`: {unpatchify} and `return_dict`: {return_dict} since when" + f" `unpatchify` is {unpatchify} the returned output is of shape (batch_size, seq_len, hidden_dim)" + " rather than (batch_size, num_channels, height, width)." + ) + + # 1. Input + if not hidden_states_is_embedding: + hidden_states = self.pos_embed(hidden_states) + + # 2. Blocks + + # In ("downsample") blocks + skips = [] + for in_block in self.transformer_in_blocks: + hidden_states = in_block( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + class_labels=class_labels, + ) + skips.append(hidden_states) + + # Mid block + hidden_states = self.transformer_mid_block(hidden_states) + + # Out ("upsample") blocks + for out_block in self.transformer_out_blocks: + hidden_states = out_block["skip"](hidden_states, skips.pop()) + hidden_states = out_block["block"]( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=timestep, + cross_attention_kwargs=cross_attention_kwargs, + class_labels=class_labels, + ) + + # 3. 
Output
+        # Don't support AdaLayerNorm for now, so no conditioning/scale/shift logic
+        hidden_states = self.norm_out(hidden_states)
+        # hidden_states = self.proj_out(hidden_states)
+
+        if unpatchify:
+            # unpatchify
+            height = width = int(hidden_states.shape[1] ** 0.5)
+            hidden_states = hidden_states.reshape(
+                shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
+            )
+            hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
+            output = hidden_states.reshape(
+                shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
+            )
+        else:
+            output = hidden_states
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
+
+
+class UniDiffuserModel(ModelMixin, ConfigMixin):
+    """
+    Transformer model for an image-text [UniDiffuser](https://arxiv.org/pdf/2303.06555.pdf) model. This is a
+    modification of [`UTransformer2DModel`] with input and output heads for the VAE-embedded latent image, the
+    CLIP-embedded image, and the CLIP-embedded prompt (see paper for more details).
+
+    Parameters:
+        text_dim (`int`): The hidden dimension of the CLIP text model used to embed prompts.
+        clip_img_dim (`int`): The hidden dimension of the CLIP vision model used to embed images.
+        num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
+        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
+        in_channels (`int`, *optional*):
+            Pass if the input is continuous. The number of channels in the input.
+        out_channels (`int`, *optional*):
+            The number of output channels; if `None`, defaults to `in_channels`.
+        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        norm_num_groups (`int`, *optional*, defaults to `32`):
+            The number of groups to use when performing Group Normalization.
+        cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use.
+        attention_bias (`bool`, *optional*):
+            Configure if the TransformerBlocks' attention should contain a bias parameter.
+        sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images.
+            Note that this is fixed at training time as it is used for learning a number of position embeddings. See
+            `ImagePositionalEmbeddings`.
+        num_vector_embeds (`int`, *optional*):
+            Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels.
+            Includes the class for the masked latent pixel.
+        patch_size (`int`, *optional*, defaults to 2):
+            The patch size to use in the patch embedding.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        num_embeds_ada_norm (`int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`.
+            The number of diffusion steps used during training. Note that this is fixed at training time as it is used
+            to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for
+            up to, but not more than, `num_embeds_ada_norm` steps.
+        use_linear_projection (int, *optional*): TODO: Not used
+        only_cross_attention (`bool`, *optional*):
+            Whether to use only cross-attention layers. In this case two cross attention layers are used in each
+            transformer block.
+ upcast_attention (`bool`, *optional*): + Whether to upcast the query and key to float32 when performing the attention calculation. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The Layer Normalization implementation to use. Defaults to `torch.nn.LayerNorm`. + block_type (`str`, *optional*, defaults to `"unidiffuser"`): + The transformer block implementation to use. If `"unidiffuser"`, has the LayerNorms on the residual + backbone of each transformer block; otherwise has them in the attention/feedforward branches (the standard + behavior in `diffusers`.) + pre_layer_norm (`bool`, *optional*): + Whether to perform layer normalization before the attention and feedforward operations ("pre-LayerNorm"), + as opposed to after ("post-LayerNorm"). The original UniDiffuser implementation is post-LayerNorm + (`pre_layer_norm = False`). + norm_elementwise_affine (`bool`, *optional*): + Whether to use learnable per-element affine parameters during layer normalization. + use_patch_pos_embed (`bool`, *optional*): + Whether to use position embeddings inside the patch embedding layer (`PatchEmbed`). + ff_final_dropout (`bool`, *optional*): + Whether to use a final Dropout layer after the feedforward network. + use_data_type_embedding (`bool`, *optional*): + Whether to use a data type embedding. This is only relevant for UniDiffuser-v1 style models; UniDiffuser-v1 + is continue-trained from UniDiffuser-v0 on non-publically-available data and accepts a `data_type` + argument, which can either be `1` to use the weights trained on non-publically-available data or `0` + otherwise. This argument is subsequently embedded by the data type embedding, if used. + """ + + @register_to_config + def __init__( + self, + text_dim: int = 768, + clip_img_dim: int = 512, + num_text_tokens: int = 77, + num_attention_heads: int = 16, + attention_head_dim: int = 88, + in_channels: Optional[int] = None, + out_channels: Optional[int] = None, + num_layers: int = 1, + dropout: float = 0.0, + norm_num_groups: int = 32, + cross_attention_dim: Optional[int] = None, + attention_bias: bool = False, + sample_size: Optional[int] = None, + num_vector_embeds: Optional[int] = None, + patch_size: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + use_linear_projection: bool = False, + only_cross_attention: bool = False, + upcast_attention: bool = False, + norm_type: str = "layer_norm", + block_type: str = "unidiffuser", + pre_layer_norm: bool = False, + use_timestep_embedding=False, + norm_elementwise_affine: bool = True, + use_patch_pos_embed=False, + ff_final_dropout: bool = True, + use_data_type_embedding: bool = False, + ): + super().__init__() + + # 0. Handle dimensions + self.inner_dim = num_attention_heads * attention_head_dim + + assert sample_size is not None, "UniDiffuserModel over patched input must provide sample_size" + self.sample_size = sample_size + self.in_channels = in_channels + self.out_channels = in_channels if out_channels is None else out_channels + + self.patch_size = patch_size + # Assume image is square... + self.num_patches = (self.sample_size // patch_size) * (self.sample_size // patch_size) + + # 1. 
Define input layers + # 1.1 Input layers for text and image input + # For now, only support patch input for VAE latent image input + self.vae_img_in = PatchEmbed( + height=sample_size, + width=sample_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=self.inner_dim, + use_pos_embed=use_patch_pos_embed, + ) + self.clip_img_in = nn.Linear(clip_img_dim, self.inner_dim) + self.text_in = nn.Linear(text_dim, self.inner_dim) + + # 1.2. Timestep embeddings for t_img, t_text + self.timestep_img_proj = Timesteps( + self.inner_dim, + flip_sin_to_cos=True, + downscale_freq_shift=0, + ) + self.timestep_img_embed = ( + TimestepEmbedding( + self.inner_dim, + 4 * self.inner_dim, + out_dim=self.inner_dim, + ) + if use_timestep_embedding + else nn.Identity() + ) + + self.timestep_text_proj = Timesteps( + self.inner_dim, + flip_sin_to_cos=True, + downscale_freq_shift=0, + ) + self.timestep_text_embed = ( + TimestepEmbedding( + self.inner_dim, + 4 * self.inner_dim, + out_dim=self.inner_dim, + ) + if use_timestep_embedding + else nn.Identity() + ) + + # 1.3. Positional embedding + self.num_text_tokens = num_text_tokens + self.num_tokens = 1 + 1 + num_text_tokens + 1 + self.num_patches + self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, self.inner_dim)) + self.pos_embed_drop = nn.Dropout(p=dropout) + trunc_normal_(self.pos_embed, std=0.02) + + # 1.4. Handle data type token embeddings for UniDiffuser-V1, if necessary + self.use_data_type_embedding = use_data_type_embedding + if self.use_data_type_embedding: + self.data_type_token_embedding = nn.Embedding(2, self.inner_dim) + self.data_type_pos_embed_token = nn.Parameter(torch.zeros(1, 1, self.inner_dim)) + + # 2. Define transformer blocks + self.transformer = UTransformer2DModel( + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + in_channels=in_channels, + out_channels=out_channels, + num_layers=num_layers, + dropout=dropout, + norm_num_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + attention_bias=attention_bias, + sample_size=sample_size, + num_vector_embeds=num_vector_embeds, + patch_size=patch_size, + activation_fn=activation_fn, + num_embeds_ada_norm=num_embeds_ada_norm, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention, + upcast_attention=upcast_attention, + norm_type=norm_type, + block_type=block_type, + pre_layer_norm=pre_layer_norm, + norm_elementwise_affine=norm_elementwise_affine, + use_patch_pos_embed=use_patch_pos_embed, + ff_final_dropout=ff_final_dropout, + ) + + # 3. Define output layers + patch_dim = (patch_size**2) * out_channels + self.vae_img_out = nn.Linear(self.inner_dim, patch_dim) + self.clip_img_out = nn.Linear(self.inner_dim, clip_img_dim) + self.text_out = nn.Linear(self.inner_dim, text_dim) + + @torch.jit.ignore + def no_weight_decay(self): + return {"pos_embed"} + + def forward( + self, + latent_image_embeds: torch.FloatTensor, + image_embeds: torch.FloatTensor, + prompt_embeds: torch.FloatTensor, + timestep_img: Union[torch.Tensor, float, int], + timestep_text: Union[torch.Tensor, float, int], + data_type: Optional[Union[torch.Tensor, float, int]] = 1, + encoder_hidden_states=None, + cross_attention_kwargs=None, + ): + """ + Args: + latent_image_embeds (`torch.FloatTensor` of shape `(batch size, latent channels, height, width)`): + Latent image representation from the VAE encoder. 
+ image_embeds (`torch.FloatTensor` of shape `(batch size, 1, clip_img_dim)`): + CLIP-embedded image representation (unsqueezed in the first dimension). + prompt_embeds (`torch.FloatTensor` of shape `(batch size, seq_len, text_dim)`): + CLIP-embedded text representation. + timestep_img (`torch.long` or `float` or `int`): + Current denoising step for the image. + timestep_text (`torch.long` or `float` or `int`): + Current denoising step for the text. + data_type: (`torch.int` or `float` or `int`, *optional*, defaults to `1`): + Only used in UniDiffuser-v1-style models. Can be either `1`, to use weights trained on nonpublic data, + or `0` otherwise. + encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): + Conditional embeddings for cross attention layer. If not given, cross-attention defaults to + self-attention. + cross_attention_kwargs (*optional*): + Keyword arguments to supply to the cross attention layers, if used. + + + Returns: + `tuple`: Returns relevant parts of the model's noise prediction: the first element of the tuple is tbe VAE + image embedding, the second element is the CLIP image embedding, and the third element is the CLIP text + embedding. + """ + batch_size = latent_image_embeds.shape[0] + + # 1. Input + # 1.1. Map inputs to shape (B, N, inner_dim) + vae_hidden_states = self.vae_img_in(latent_image_embeds) + clip_hidden_states = self.clip_img_in(image_embeds) + text_hidden_states = self.text_in(prompt_embeds) + + num_text_tokens, num_img_tokens = text_hidden_states.size(1), vae_hidden_states.size(1) + + # 1.2. Encode image timesteps to single token (B, 1, inner_dim) + if not torch.is_tensor(timestep_img): + timestep_img = torch.tensor([timestep_img], dtype=torch.long, device=vae_hidden_states.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep_img = timestep_img * torch.ones(batch_size, dtype=timestep_img.dtype, device=timestep_img.device) + + timestep_img_token = self.timestep_img_proj(timestep_img) + # t_img_token does not contain any weights and will always return f32 tensors + # but time_embedding might be fp16, so we need to cast here. + timestep_img_token = timestep_img_token.to(dtype=self.dtype) + timestep_img_token = self.timestep_img_embed(timestep_img_token) + timestep_img_token = timestep_img_token.unsqueeze(dim=1) + + # 1.3. Encode text timesteps to single token (B, 1, inner_dim) + if not torch.is_tensor(timestep_text): + timestep_text = torch.tensor([timestep_text], dtype=torch.long, device=vae_hidden_states.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep_text = timestep_text * torch.ones(batch_size, dtype=timestep_text.dtype, device=timestep_text.device) + + timestep_text_token = self.timestep_text_proj(timestep_text) + # t_text_token does not contain any weights and will always return f32 tensors + # but time_embedding might be fp16, so we need to cast here. + timestep_text_token = timestep_text_token.to(dtype=self.dtype) + timestep_text_token = self.timestep_text_embed(timestep_text_token) + timestep_text_token = timestep_text_token.unsqueeze(dim=1) + + # 1.4. Concatenate all of the embeddings together. 
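+        # The joint token sequence handed to the transformer is ordered as
+        #   [t_img token, t_text token, (optional data_type token), text tokens, CLIP image token, VAE image patches];
+        # the split in step 3 below relies on exactly this ordering.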
+ if self.use_data_type_embedding: + assert data_type is not None, "data_type must be supplied if the model uses a data type embedding" + if not torch.is_tensor(data_type): + data_type = torch.tensor([data_type], dtype=torch.int, device=vae_hidden_states.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + data_type = data_type * torch.ones(batch_size, dtype=data_type.dtype, device=data_type.device) + + data_type_token = self.data_type_token_embedding(data_type).unsqueeze(dim=1) + hidden_states = torch.cat( + [ + timestep_img_token, + timestep_text_token, + data_type_token, + text_hidden_states, + clip_hidden_states, + vae_hidden_states, + ], + dim=1, + ) + else: + hidden_states = torch.cat( + [timestep_img_token, timestep_text_token, text_hidden_states, clip_hidden_states, vae_hidden_states], + dim=1, + ) + + # 1.5. Prepare the positional embeddings and add to hidden states + # Note: I think img_vae should always have the proper shape, so there's no need to interpolate + # the position embeddings. + if self.use_data_type_embedding: + pos_embed = torch.cat( + [self.pos_embed[:, : 1 + 1, :], self.data_type_pos_embed_token, self.pos_embed[:, 1 + 1 :, :]], dim=1 + ) + else: + pos_embed = self.pos_embed + hidden_states = hidden_states + pos_embed + hidden_states = self.pos_embed_drop(hidden_states) + + # 2. Blocks + hidden_states = self.transformer( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + timestep=None, + class_labels=None, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + hidden_states_is_embedding=True, + unpatchify=False, + )[0] + + # 3. Output + # Split out the predicted noise representation. + if self.use_data_type_embedding: + ( + t_img_token_out, + t_text_token_out, + data_type_token_out, + text_out, + img_clip_out, + img_vae_out, + ) = hidden_states.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1) + else: + t_img_token_out, t_text_token_out, text_out, img_clip_out, img_vae_out = hidden_states.split( + (1, 1, num_text_tokens, 1, num_img_tokens), dim=1 + ) + + img_vae_out = self.vae_img_out(img_vae_out) + + # unpatchify + height = width = int(img_vae_out.shape[1] ** 0.5) + img_vae_out = img_vae_out.reshape( + shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels) + ) + img_vae_out = torch.einsum("nhwpqc->nchpwq", img_vae_out) + img_vae_out = img_vae_out.reshape( + shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size) + ) + + img_clip_out = self.clip_img_out(img_clip_out) + + text_out = self.text_out(text_out) + + return img_vae_out, img_clip_out, text_out diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py new file mode 100644 index 000000000000..36e5411b4215 --- /dev/null +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -0,0 +1,1422 @@ +import inspect +from dataclasses import dataclass +from typing import Callable, List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, + GPT2Tokenizer, +) + +from ...models import AutoencoderKL +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + PIL_INTERPOLATION, + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, +) +from ...utils.outputs import BaseOutput +from ..pipeline_utils import DiffusionPipeline +from 
.modeling_text_decoder import UniDiffuserTextDecoder +from .modeling_uvit import UniDiffuserModel + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess +def preprocess(image): + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): + image = [image] + + if isinstance(image[0], PIL.Image.Image): + w, h = image[0].size + w, h = (x - x % 8 for x in (w, h)) # resize to integer multiple of 8 + + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +# New BaseOutput child class for joint image-text output +@dataclass +class ImageTextPipelineOutput(BaseOutput): + """ + Output class for joint image-text pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + text (`List[str]` or `List[List[str]]`) + List of generated text strings of length `batch_size` or a list of list of strings whose outer list has + length `batch_size`. Text generated by the diffusion pipeline. + """ + + images: Optional[Union[List[PIL.Image.Image], np.ndarray]] + text: Optional[Union[List[str], List[List[str]]]] + + +class UniDiffuserPipeline(DiffusionPipeline): + r""" + Pipeline for a bimodal image-text [UniDiffuser](https://arxiv.org/pdf/2303.06555.pdf) model, which supports + unconditional text and image generation, text-conditioned image generation, image-conditioned text generation, and + joint image-text generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. This + is part of the UniDiffuser image representation, along with the CLIP vision encoding. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Similar to Stable Diffusion, UniDiffuser uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel) to encode text + prompts. + image_encoder ([`CLIPVisionModel`]): + UniDiffuser uses the vision portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPVisionModel) to encode + images as part of its image representation, along with the VAE latent representation. + image_processor ([`CLIPImageProcessor`]): + CLIP image processor of class + [CLIPImageProcessor](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPImageProcessor), + used to preprocess the image before CLIP encoding it with `image_encoder`. + clip_tokenizer ([`CLIPTokenizer`]): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTokenizer) which + is used to tokenizer a prompt before encoding it with `text_encoder`. 
+ text_decoder ([`UniDiffuserTextDecoder`]): + Frozen text decoder. This is a GPT-style model which is used to generate text from the UniDiffuser + embedding. + text_tokenizer ([`GPT2Tokenizer`]): + Tokenizer of class + [GPT2Tokenizer](https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Tokenizer) which + is used along with the `text_decoder` to decode text for text generation. + unet ([`UniDiffuserModel`]): + UniDiffuser uses a [U-ViT](https://github.com/baofff/U-ViT) model architecture, which is similar to a + [`Transformer2DModel`] with U-Net-style skip connections between transformer layers. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image and/or text latents. The + original UniDiffuser paper uses the [`DPMSolverMultistepScheduler`] scheduler. + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + image_encoder: CLIPVisionModelWithProjection, + image_processor: CLIPImageProcessor, + clip_tokenizer: CLIPTokenizer, + text_decoder: UniDiffuserTextDecoder, + text_tokenizer: GPT2Tokenizer, + unet: UniDiffuserModel, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__() + + if text_encoder.config.hidden_size != text_decoder.prefix_inner_dim: + raise ValueError( + f"The text encoder hidden size and text decoder prefix inner dim must be the same, but" + f" `text_encoder.config.hidden_size`: {text_encoder.config.hidden_size} and `text_decoder.prefix_inner_dim`: {text_decoder.prefix_inner_dim}" + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + image_encoder=image_encoder, + image_processor=image_processor, + clip_tokenizer=clip_tokenizer, + text_decoder=text_decoder, + text_tokenizer=text_tokenizer, + unet=unet, + scheduler=scheduler, + ) + + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + self.num_channels_latents = vae.config.latent_channels + self.text_encoder_seq_len = text_encoder.config.max_position_embeddings + self.text_encoder_hidden_size = text_encoder.config.hidden_size + self.image_encoder_projection_dim = image_encoder.config.projection_dim + self.unet_resolution = unet.config.sample_size + + self.text_intermediate_dim = self.text_encoder_hidden_size + if self.text_decoder.prefix_hidden_dim is not None: + self.text_intermediate_dim = self.text_decoder.prefix_hidden_dim + + self.mode = None + + # TODO: handle safety checking? + self.safety_checker = None + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload + # Add self.image_encoder, self.text_decoder to cpu_offloaded_models list + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. 
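+
+        Usage sketch (assuming a loaded pipeline named `pipe`): call `pipe.enable_sequential_cpu_offload()` once
+        after loading and before the first generation call; subsequent pipeline calls then run with the offload
+        hooks in place.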
+ """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.image_encoder, self.text_decoder]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload + # Add self.image_encoder, self.text_decoder to cpu_offloaded_models list + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae, self.image_encoder, self.text_decoder]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def _infer_mode(self, prompt, prompt_embeds, image, latents, prompt_latents, vae_latents, clip_latents): + r""" + Infer the generation task ('mode') from the inputs to `__call__`. If the mode has been manually set, the set + mode will be used. + """ + prompt_available = (prompt is not None) or (prompt_embeds is not None) + image_available = image is not None + input_available = prompt_available or image_available + + prompt_latents_available = prompt_latents is not None + vae_latents_available = vae_latents is not None + clip_latents_available = clip_latents is not None + full_latents_available = latents is not None + image_latents_available = vae_latents_available and clip_latents_available + all_indv_latents_available = prompt_latents_available and image_latents_available + + if self.mode is not None: + # Preferentially use the mode set by the user + mode = self.mode + elif prompt_available: + mode = "text2img" + elif image_available: + mode = "img2text" + else: + # Neither prompt nor image supplied, infer based on availability of latents + if full_latents_available or all_indv_latents_available: + mode = "joint" + elif prompt_latents_available: + mode = "text" + elif image_latents_available: + mode = "img" + else: + # No inputs or latents available + mode = "joint" + + # Give warnings for ambiguous cases + if self.mode is None and prompt_available and image_available: + logger.warning( + f"You have supplied both a text prompt and image to the pipeline and mode has not been set manually," + f" defaulting to mode '{mode}'." + ) + + if self.mode is None and not input_available: + if vae_latents_available != clip_latents_available: + # Exactly one of vae_latents and clip_latents is supplied + logger.warning( + f"You have supplied exactly one of `vae_latents` and `clip_latents`, whereas either both or none" + f" are expected to be supplied. Defaulting to mode '{mode}'." + ) + elif not prompt_latents_available and not vae_latents_available and not clip_latents_available: + # No inputs or latents supplied + logger.warning( + f"No inputs or latents have been supplied, and mode has not been manually set," + f" defaulting to mode '{mode}'." 
+ ) + + return mode + + # Functions to manually set the mode + def set_text_mode(self): + r"""Manually set the generation mode to unconditional ("marginal") text generation.""" + self.mode = "text" + + def set_image_mode(self): + r"""Manually set the generation mode to unconditional ("marginal") image generation.""" + self.mode = "img" + + def set_text_to_image_mode(self): + r"""Manually set the generation mode to text-conditioned image generation.""" + self.mode = "text2img" + + def set_image_to_text_mode(self): + r"""Manually set the generation mode to image-conditioned text generation.""" + self.mode = "img2text" + + def set_joint_mode(self): + r"""Manually set the generation mode to unconditional joint image-text generation.""" + self.mode = "joint" + + def reset_mode(self): + r"""Removes a manually set mode; after calling this, the pipeline will infer the mode from inputs.""" + self.mode = None + + def _infer_batch_size( + self, + mode, + prompt, + prompt_embeds, + image, + num_images_per_prompt, + num_prompts_per_image, + latents, + prompt_latents, + vae_latents, + clip_latents, + ): + r"""Infers the batch size and multiplier depending on mode and supplied arguments to `__call__`.""" + if num_images_per_prompt is None: + num_images_per_prompt = 1 + if num_prompts_per_image is None: + num_prompts_per_image = 1 + + assert num_images_per_prompt > 0, "num_images_per_prompt must be a positive integer" + assert num_prompts_per_image > 0, "num_prompts_per_image must be a positive integer" + + if mode in ["text2img"]: + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + # Either prompt or prompt_embeds must be present for text2img. + batch_size = prompt_embeds.shape[0] + multiplier = num_images_per_prompt + elif mode in ["img2text"]: + if isinstance(image, PIL.Image.Image): + batch_size = 1 + else: + # Image must be available and type either PIL.Image.Image or torch.FloatTensor. + # Not currently supporting something like image_embeds. + batch_size = image.shape[0] + multiplier = num_prompts_per_image + elif mode in ["img"]: + if vae_latents is not None: + batch_size = vae_latents.shape[0] + elif clip_latents is not None: + batch_size = clip_latents.shape[0] + else: + batch_size = 1 + multiplier = num_images_per_prompt + elif mode in ["text"]: + if prompt_latents is not None: + batch_size = prompt_latents.shape[0] + else: + batch_size = 1 + multiplier = num_prompts_per_image + elif mode in ["joint"]: + if latents is not None: + batch_size = latents.shape[0] + elif prompt_latents is not None: + batch_size = prompt_latents.shape[0] + elif vae_latents is not None: + batch_size = vae_latents.shape[0] + elif clip_latents is not None: + batch_size = clip_latents.shape[0] + else: + batch_size = 1 + + if num_images_per_prompt == num_prompts_per_image: + multiplier = num_images_per_prompt + else: + multiplier = min(num_images_per_prompt, num_prompts_per_image) + logger.warning( + f"You are using mode `{mode}` and `num_images_per_prompt`: {num_images_per_prompt} and" + f" num_prompts_per_image: {num_prompts_per_image} are not equal. Using batch size equal to" + f" `min(num_images_per_prompt, num_prompts_per_image) = {batch_size}." 
+ ) + return batch_size, multiplier + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + # self.tokenizer => self.clip_tokenizer + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + """ + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + text_inputs = self.clip_tokenizer( + prompt, + padding="max_length", + max_length=self.clip_tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.clip_tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.clip_tokenizer.batch_decode( + untruncated_ids[:, self.clip_tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.clip_tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * 
batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + max_length = prompt_embeds.shape[1] + uncond_input = self.clip_tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix.StableDiffusionInstructPix2PixPipeline.prepare_image_latents + # Add num_prompts_per_image argument, sample from autoencoder moment distribution + def encode_image_vae_latents( + self, + image, + batch_size, + num_prompts_per_image, + dtype, + device, + do_classifier_free_guidance, + generator=None, + ): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_prompts_per_image + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if isinstance(generator, list): + image_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator=generator[i]) + * self.vae.config.scaling_factor + for i in range(batch_size) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.vae.encode(image).latent_dist.sample(generator=generator) + # Scale image_latents by the VAE's scaling factor + image_latents = image_latents * self.vae.config.scaling_factor + + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) + + if do_classifier_free_guidance: + uncond_image_latents = torch.zeros_like(image_latents) + image_latents = torch.cat([image_latents, image_latents, uncond_image_latents], dim=0) + + return image_latents + + def encode_image_clip_latents( + self, + image, + batch_size, + num_prompts_per_image, + dtype, + device, + generator=None, + ): + # Map image to CLIP embedding. + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + preprocessed_image = self.image_processor.preprocess( + image, + return_tensors="pt", + ) + preprocessed_image = preprocessed_image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_prompts_per_image + if isinstance(generator, list): + image_latents = [ + self.image_encoder(**preprocessed_image[i : i + 1]).image_embeds for i in range(batch_size) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.image_encoder(**preprocessed_image).image_embeds + + if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: + # expand image_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {image_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." 
+ ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // image_latents.shape[0] + image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0) + elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts." + ) + else: + image_latents = torch.cat([image_latents], dim=0) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + return image_latents + + # Note that the CLIP latents are not decoded for image generation. + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + # Rename: decode_latents -> decode_image_latents + def decode_image_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + def prepare_text_latents( + self, batch_size, num_images_per_prompt, seq_len, hidden_size, dtype, device, generator, latents=None + ): + # Prepare latents for the CLIP embedded prompt. + shape = (batch_size * num_images_per_prompt, seq_len, hidden_size) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + # latents is assumed to have shace (B, L, D) + latents = latents.repeat(num_images_per_prompt, 1, 1) + latents = latents.to(device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Modified from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + # Rename prepare_latents -> prepare_image_vae_latents and add num_prompts_per_image argument. + def prepare_image_vae_latents( + self, + batch_size, + num_prompts_per_image, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + shape = ( + batch_size * num_prompts_per_image, + num_channels_latents, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + # latents is assumed to have shape (B, C, H, W) + latents = latents.repeat(num_prompts_per_image, 1, 1, 1) + latents = latents.to(device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def prepare_image_clip_latents( + self, batch_size, num_prompts_per_image, clip_img_dim, dtype, device, generator, latents=None + ): + # Prepare latents for the CLIP embedded image. + shape = (batch_size * num_prompts_per_image, 1, clip_img_dim) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + # latents is assumed to have shape (B, L, D) + latents = latents.repeat(num_prompts_per_image, 1, 1) + latents = latents.to(device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def _split(self, x, height, width): + r""" + Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim) into two tensors of shape (B, C, H, W) + and (B, 1, clip_img_dim) + """ + batch_size = x.shape[0] + latent_height = height // self.vae_scale_factor + latent_width = width // self.vae_scale_factor + img_vae_dim = self.num_channels_latents * latent_height * latent_width + + img_vae, img_clip = x.split([img_vae_dim, self.image_encoder_projection_dim], dim=1) + + img_vae = torch.reshape(img_vae, (batch_size, self.num_channels_latents, latent_height, latent_width)) + img_clip = torch.reshape(img_clip, (batch_size, 1, self.image_encoder_projection_dim)) + return img_vae, img_clip + + def _combine(self, img_vae, img_clip): + r""" + Combines a latent iamge img_vae of shape (B, C, H, W) and a CLIP-embedded image img_clip of shape (B, 1, + clip_img_dim) into a single tensor of shape (B, C * H * W + clip_img_dim). + """ + img_vae = torch.reshape(img_vae, (img_vae.shape[0], -1)) + img_clip = torch.reshape(img_clip, (img_clip.shape[0], -1)) + return torch.concat([img_vae, img_clip], dim=-1) + + def _split_joint(self, x, height, width): + r""" + Splits a flattened embedding x of shape (B, C * H * W + clip_img_dim + text_seq_len * text_dim] into (img_vae, + img_clip, text) where img_vae is of shape (B, C, H, W), img_clip is of shape (B, 1, clip_img_dim), and text is + of shape (B, text_seq_len, text_dim). 
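+        For example (illustrative values only): with num_channels_latents=4, a 64x64 latent grid,
+        clip_img_dim=512, text_seq_len=77, and text_dim=64, the flattened embedding has
+        4 * 64 * 64 + 512 + 77 * 64 = 21824 entries per batch element.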
+ """ + batch_size = x.shape[0] + latent_height = height // self.vae_scale_factor + latent_width = width // self.vae_scale_factor + img_vae_dim = self.num_channels_latents * latent_height * latent_width + text_dim = self.text_encoder_seq_len * self.text_intermediate_dim + + img_vae, img_clip, text = x.split([img_vae_dim, self.image_encoder_projection_dim, text_dim], dim=1) + + img_vae = torch.reshape(img_vae, (batch_size, self.num_channels_latents, latent_height, latent_width)) + img_clip = torch.reshape(img_clip, (batch_size, 1, self.image_encoder_projection_dim)) + text = torch.reshape(text, (batch_size, self.text_encoder_seq_len, self.text_intermediate_dim)) + return img_vae, img_clip, text + + def _combine_joint(self, img_vae, img_clip, text): + r""" + Combines a latent image img_vae of shape (B, C, H, W), a CLIP-embedded image img_clip of shape (B, L_img, + clip_img_dim), and a text embedding text of shape (B, L_text, text_dim) into a single embedding x of shape (B, + C * H * W + L_img * clip_img_dim + L_text * text_dim). + """ + img_vae = torch.reshape(img_vae, (img_vae.shape[0], -1)) + img_clip = torch.reshape(img_clip, (img_clip.shape[0], -1)) + text = torch.reshape(text, (text.shape[0], -1)) + return torch.concat([img_vae, img_clip, text], dim=-1) + + def _get_noise_pred( + self, + mode, + latents, + t, + prompt_embeds, + img_vae, + img_clip, + max_timestep, + data_type, + guidance_scale, + generator, + device, + height, + width, + ): + r""" + Gets the noise prediction using the `unet` and performs classifier-free guidance, if necessary. + """ + if mode == "joint": + # Joint text-image generation + img_vae_latents, img_clip_latents, text_latents = self._split_joint(latents, height, width) + + img_vae_out, img_clip_out, text_out = self.unet( + img_vae_latents, img_clip_latents, text_latents, timestep_img=t, timestep_text=t, data_type=data_type + ) + + x_out = self._combine_joint(img_vae_out, img_clip_out, text_out) + + if guidance_scale <= 1.0: + return x_out + + # Classifier-free guidance + img_vae_T = randn_tensor(img_vae.shape, generator=generator, device=device, dtype=img_vae.dtype) + img_clip_T = randn_tensor(img_clip.shape, generator=generator, device=device, dtype=img_clip.dtype) + text_T = randn_tensor(prompt_embeds.shape, generator=generator, device=device, dtype=prompt_embeds.dtype) + + _, _, text_out_uncond = self.unet( + img_vae_T, img_clip_T, text_latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type + ) + + img_vae_out_uncond, img_clip_out_uncond, _ = self.unet( + img_vae_latents, + img_clip_latents, + text_T, + timestep_img=t, + timestep_text=max_timestep, + data_type=data_type, + ) + + x_out_uncond = self._combine_joint(img_vae_out_uncond, img_clip_out_uncond, text_out_uncond) + + return guidance_scale * x_out + (1.0 - guidance_scale) * x_out_uncond + elif mode == "text2img": + # Text-conditioned image generation + img_vae_latents, img_clip_latents = self._split(latents, height, width) + + img_vae_out, img_clip_out, text_out = self.unet( + img_vae_latents, img_clip_latents, prompt_embeds, timestep_img=t, timestep_text=0, data_type=data_type + ) + + img_out = self._combine(img_vae_out, img_clip_out) + + if guidance_scale <= 1.0: + return img_out + + # Classifier-free guidance + text_T = randn_tensor(prompt_embeds.shape, generator=generator, device=device, dtype=prompt_embeds.dtype) + + img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( + img_vae_latents, + img_clip_latents, + text_T, + timestep_img=t, + 
timestep_text=max_timestep, + data_type=data_type, + ) + + img_out_uncond = self._combine(img_vae_out_uncond, img_clip_out_uncond) + + return guidance_scale * img_out + (1.0 - guidance_scale) * img_out_uncond + elif mode == "img2text": + # Image-conditioned text generation + img_vae_out, img_clip_out, text_out = self.unet( + img_vae, img_clip, latents, timestep_img=0, timestep_text=t, data_type=data_type + ) + + if guidance_scale <= 1.0: + return text_out + + # Classifier-free guidance + img_vae_T = randn_tensor(img_vae.shape, generator=generator, device=device, dtype=img_vae.dtype) + img_clip_T = randn_tensor(img_clip.shape, generator=generator, device=device, dtype=img_clip.dtype) + + img_vae_out_uncond, img_clip_out_uncond, text_out_uncond = self.unet( + img_vae_T, img_clip_T, latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type + ) + + return guidance_scale * text_out + (1.0 - guidance_scale) * text_out_uncond + elif mode == "text": + # Unconditional ("marginal") text generation (no CFG) + img_vae_out, img_clip_out, text_out = self.unet( + img_vae, img_clip, latents, timestep_img=max_timestep, timestep_text=t, data_type=data_type + ) + + return text_out + elif mode == "img": + # Unconditional ("marginal") image generation (no CFG) + img_vae_latents, img_clip_latents = self._split(latents, height, width) + + img_vae_out, img_clip_out, text_out = self.unet( + img_vae_latents, + img_clip_latents, + prompt_embeds, + timestep_img=t, + timestep_text=max_timestep, + data_type=data_type, + ) + + img_out = self._combine(img_vae_out, img_clip_out) + return img_out + + def check_latents_shape(self, latents_name, latents, expected_shape): + latents_shape = latents.shape + expected_num_dims = len(expected_shape) + 1 # expected dimensions plus the batch dimension + expected_shape_str = ", ".join(str(dim) for dim in expected_shape) + if len(latents_shape) != expected_num_dims: + raise ValueError( + f"`{latents_name}` should have shape (batch_size, {expected_shape_str}), but the current shape" + f" {latents_shape} has {len(latents_shape)} dimensions." + ) + for i in range(1, expected_num_dims): + if latents_shape[i] != expected_shape[i - 1]: + raise ValueError( + f"`{latents_name}` should have shape (batch_size, {expected_shape_str}), but the current shape" + f" {latents_shape} has {latents_shape[i]} != {expected_shape[i - 1]} at dimension {i}." + ) + + def check_inputs( + self, + mode, + prompt, + image, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + latents=None, + prompt_latents=None, + vae_latents=None, + clip_latents=None, + ): + # Check inputs before running the generative process. + if height % self.vae_scale_factor != 0 or width % self.vae_scale_factor != 0: + raise ValueError( + f"`height` and `width` have to be divisible by {self.vae_scale_factor} but are {height} and {width}." + ) + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if mode == "text2img": + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. 
Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + if mode == "img2text": + if image is None: + raise ValueError("`img2text` mode requires an image to be provided.") + + # Check provided latents + latent_height = height // self.vae_scale_factor + latent_width = width // self.vae_scale_factor + full_latents_available = latents is not None + prompt_latents_available = prompt_latents is not None + vae_latents_available = vae_latents is not None + clip_latents_available = clip_latents is not None + + if full_latents_available: + individual_latents_available = ( + prompt_latents is not None or vae_latents is not None or clip_latents is not None + ) + if individual_latents_available: + logger.warning( + "You have supplied both `latents` and at least one of `prompt_latents`, `vae_latents`, and" + " `clip_latents`. The value of `latents` will override the value of any individually supplied latents." + ) + # Check shape of full latents + img_vae_dim = self.num_channels_latents * latent_height * latent_width + text_dim = self.text_encoder_seq_len * self.text_encoder_hidden_size + latents_dim = img_vae_dim + self.image_encoder_projection_dim + text_dim + latents_expected_shape = (latents_dim,) + self.check_latents_shape("latents", latents, latents_expected_shape) + + # Check individual latent shapes, if present + if prompt_latents_available: + prompt_latents_expected_shape = (self.text_encoder_seq_len, self.text_encoder_hidden_size) + self.check_latents_shape("prompt_latents", prompt_latents, prompt_latents_expected_shape) + + if vae_latents_available: + vae_latents_expected_shape = (self.num_channels_latents, latent_height, latent_width) + self.check_latents_shape("vae_latents", vae_latents, vae_latents_expected_shape) + + if clip_latents_available: + clip_latents_expected_shape = (1, self.image_encoder_projection_dim) + self.check_latents_shape("clip_latents", clip_latents, clip_latents_expected_shape) + + if mode in ["text2img", "img"] and vae_latents_available and clip_latents_available: + if vae_latents.shape[0] != clip_latents.shape[0]: + raise ValueError( + f"Both `vae_latents` and `clip_latents` are supplied, but their batch dimensions are not equal:" + f" {vae_latents.shape[0]} != {clip_latents.shape[0]}." + ) + + if mode == "joint" and prompt_latents_available and vae_latents_available and clip_latents_available: + if prompt_latents.shape[0] != vae_latents.shape[0] or prompt_latents.shape[0] != clip_latents.shape[0]: + raise ValueError( + f"All of `prompt_latents`, `vae_latents`, and `clip_latents` are supplied, but their batch" + f" dimensions are not equal: {prompt_latents.shape[0]} != {vae_latents.shape[0]}" + f" != {clip_latents.shape[0]}." 
+ ) + + @torch.no_grad() + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + data_type: Optional[int] = 1, + num_inference_steps: int = 50, + guidance_scale: float = 8.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + num_prompts_per_image: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_latents: Optional[torch.FloatTensor] = None, + vae_latents: Optional[torch.FloatTensor] = None, + clip_latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds` + instead. Required for text-conditioned image generation (`text2img`) mode. + image (`torch.FloatTensor` or `PIL.Image.Image`, *optional*): + `Image`, or tensor representing an image batch. Required for image-conditioned text generation + (`img2text`) mode. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + data_type (`int`, *optional*, defaults to 1): + The data type (either 0 or 1). Only used if you are loading a checkpoint which supports a data type + embedding; this is added for compatibility with the UniDiffuser-v1 checkpoint. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 8.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. Note that the original [UniDiffuser + paper](https://arxiv.org/pdf/2303.06555.pdf) uses a different definition of the guidance scale `w'`, + which satisfies `w = w' + 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). Used in text-conditioned image generation (`text2img`) mode. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. Used in `text2img` (text-conditioned image generation) and + `img` mode. 
If the mode is joint and both `num_images_per_prompt` and `num_prompts_per_image` are + supplied, `min(num_images_per_prompt, num_prompts_per_image)` samples will be generated. + num_prompts_per_image (`int`, *optional*, defaults to 1): + The number of prompts to generate per image. Used in `img2text` (image-conditioned text generation) and + `text` mode. If the mode is joint and both `num_images_per_prompt` and `num_prompts_per_image` are + supplied, `min(num_images_per_prompt, num_prompts_per_image)` samples will be generated. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for joint + image-text generation. Can be used to tweak the same generation with different prompts. If not + provided, a latents tensor will be generated by sampling using the supplied random `generator`. Note + that this is assumed to be a full set of VAE, CLIP, and text latents, if supplied, this will override + the value of `prompt_latents`, `vae_latents`, and `clip_latents`. + prompt_latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for text + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + vae_latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + clip_latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. Used in text-conditioned + image generation (`text2img`) mode. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. Used in text-conditioned image generation (`text2img`) mode. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.unidiffuser.ImageTextPipelineOutput`] instead of a plain tuple. 
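+                Note that whichever return type is chosen, the output always carries both an `images` and a
+                `text` field; the field for the modality that was not generated in the current mode is `None`.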
+ callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + Returns: + [`~pipelines.unidiffuser.ImageTextPipelineOutput`] or `tuple`: + [`pipelines.unidiffuser.ImageTextPipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When + returning a tuple, the first element is a list with the generated images, and the second element is a list + of generated texts. + """ + + # 0. Default height and width to unet + height = height or self.unet_resolution * self.vae_scale_factor + width = width or self.unet_resolution * self.vae_scale_factor + + # 1. Check inputs + # Recalculate mode for each call to the pipeline. + mode = self._infer_mode(prompt, prompt_embeds, image, latents, prompt_latents, vae_latents, clip_latents) + self.check_inputs( + mode, + prompt, + image, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + latents, + prompt_latents, + vae_latents, + clip_latents, + ) + + # 2. Define call parameters + batch_size, multiplier = self._infer_batch_size( + mode, + prompt, + prompt_embeds, + image, + num_images_per_prompt, + num_prompts_per_image, + latents, + prompt_latents, + vae_latents, + clip_latents, + ) + device = self._execution_device + reduce_text_emb_dim = self.text_intermediate_dim < self.text_encoder_hidden_size or self.mode != "text2img" + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + # Note that this differs from the formulation in the unidiffusers paper! + # do_classifier_free_guidance = guidance_scale > 1.0 + + # check if scheduler is in sigmas space + # scheduler_is_in_sigma_space = hasattr(self.scheduler, "sigmas") + + # 3. Encode input prompt, if available; otherwise prepare text latents + if latents is not None: + # Overwrite individual latents + vae_latents, clip_latents, prompt_latents = self._split_joint(latents, height, width) + + if mode in ["text2img"]: + # 3.1. Encode input prompt, if available + assert prompt is not None or prompt_embeds is not None + prompt_embeds = self._encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=multiplier, + do_classifier_free_guidance=False, # don't support standard classifier-free guidance for now + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + else: + # 3.2. Prepare text latent variables, if input not available + prompt_embeds = self.prepare_text_latents( + batch_size=batch_size, + num_images_per_prompt=multiplier, + seq_len=self.text_encoder_seq_len, + hidden_size=self.text_encoder_hidden_size, + dtype=self.text_encoder.dtype, # Should work with both full precision and mixed precision + device=device, + generator=generator, + latents=prompt_latents, + ) + + if reduce_text_emb_dim: + prompt_embeds = self.text_decoder.encode(prompt_embeds) + + # 4. Encode image, if available; otherwise prepare image latents + if mode in ["img2text"]: + # 4.1. 
Encode images, if available + assert image is not None, "`img2text` requires a conditioning image" + # Encode image using VAE + image_vae = preprocess(image) + height, width = image_vae.shape[-2:] + image_vae_latents = self.encode_image_vae_latents( + image=image_vae, + batch_size=batch_size, + num_prompts_per_image=multiplier, + dtype=prompt_embeds.dtype, + device=device, + do_classifier_free_guidance=False, # Copied from InstructPix2Pix, don't use their version of CFG + generator=generator, + ) + + # Encode image using CLIP + image_clip_latents = self.encode_image_clip_latents( + image=image, + batch_size=batch_size, + num_prompts_per_image=multiplier, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + ) + # (batch_size, clip_hidden_size) => (batch_size, 1, clip_hidden_size) + image_clip_latents = image_clip_latents.unsqueeze(1) + else: + # 4.2. Prepare image latent variables, if input not available + # Prepare image VAE latents in latent space + image_vae_latents = self.prepare_image_vae_latents( + batch_size=batch_size, + num_prompts_per_image=multiplier, + num_channels_latents=self.num_channels_latents, + height=height, + width=width, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=vae_latents, + ) + + # Prepare image CLIP latents + image_clip_latents = self.prepare_image_clip_latents( + batch_size=batch_size, + num_prompts_per_image=multiplier, + clip_img_dim=self.image_encoder_projection_dim, + dtype=prompt_embeds.dtype, + device=device, + generator=generator, + latents=clip_latents, + ) + + # 5. Set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + # max_timestep = timesteps[0] + max_timestep = self.scheduler.config.num_train_timesteps + + # 6. Prepare latent variables + if mode == "joint": + latents = self._combine_joint(image_vae_latents, image_clip_latents, prompt_embeds) + elif mode in ["text2img", "img"]: + latents = self._combine(image_vae_latents, image_clip_latents) + elif mode in ["img2text", "text"]: + latents = prompt_embeds + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + logger.debug(f"Scheduler extra step kwargs: {extra_step_kwargs}") + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # predict the noise residual + # Also applies classifier-free guidance as described in the UniDiffuser paper + noise_pred = self._get_noise_pred( + mode, + latents, + t, + prompt_embeds, + image_vae_latents, + image_clip_latents, + max_timestep, + data_type, + guidance_scale, + generator, + device, + height, + width, + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # 9. 
Post-processing + gen_image = None + gen_text = None + if mode == "joint": + image_vae_latents, image_clip_latents, text_latents = self._split_joint(latents, height, width) + + # Map latent VAE image back to pixel space + gen_image = self.decode_image_latents(image_vae_latents) + + # Generate text using the text decoder + output_token_list, seq_lengths = self.text_decoder.generate_captions( + text_latents, self.text_tokenizer.eos_token_id, device=device + ) + output_list = output_token_list.cpu().numpy() + gen_text = [ + self.text_tokenizer.decode(output[: int(length)], skip_special_tokens=True) + for output, length in zip(output_list, seq_lengths) + ] + elif mode in ["text2img", "img"]: + image_vae_latents, image_clip_latents = self._split(latents, height, width) + gen_image = self.decode_image_latents(image_vae_latents) + elif mode in ["img2text", "text"]: + text_latents = latents + output_token_list, seq_lengths = self.text_decoder.generate_captions( + text_latents, self.text_tokenizer.eos_token_id, device=device + ) + output_list = output_token_list.cpu().numpy() + gen_text = [ + self.text_tokenizer.decode(output[: int(length)], skip_special_tokens=True) + for output, length in zip(output_list, seq_lengths) + ] + + # 10. Convert to PIL + if output_type == "pil" and gen_image is not None: + gen_image = self.numpy_to_pil(gen_image) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (gen_image, gen_text) + + return ImageTextPipelineOutput(images=gen_image, text=gen_text) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index ea6a61cf7587..95d07c081ccd 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -152,6 +152,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class ImageTextPipelineOutput(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class KandinskyImg2ImgPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] @@ -632,6 +647,51 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class UniDiffuserModel(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class UniDiffuserPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class UniDiffuserTextDecoder(metaclass=DummyObject): + 
_backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class VersatileDiffusionDualGuidedPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/unidiffuser/__init__.py b/tests/pipelines/unidiffuser/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py new file mode 100644 index 000000000000..f9f798ebe55d --- /dev/null +++ b/tests/pipelines/unidiffuser/test_unidiffuser.py @@ -0,0 +1,670 @@ +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, + GPT2Tokenizer, +) + +from diffusers import ( + AutoencoderKL, + DPMSolverMultistepScheduler, + UniDiffuserModel, + UniDiffuserPipeline, + UniDiffuserTextDecoder, +) +from diffusers.utils import floats_tensor, load_image, randn_tensor, slow, torch_device +from diffusers.utils.testing_utils import require_torch_gpu + +from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin + + +class UniDiffuserPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = UniDiffuserPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS + + def get_dummy_components(self): + unet = UniDiffuserModel.from_pretrained( + "hf-internal-testing/unidiffuser-diffusers-test", + subfolder="unet", + ) + + scheduler = DPMSolverMultistepScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + solver_order=3, + ) + + vae = AutoencoderKL.from_pretrained( + "hf-internal-testing/unidiffuser-diffusers-test", + subfolder="vae", + ) + + text_encoder = CLIPTextModel.from_pretrained( + "hf-internal-testing/unidiffuser-diffusers-test", + subfolder="text_encoder", + ) + clip_tokenizer = CLIPTokenizer.from_pretrained( + "hf-internal-testing/unidiffuser-diffusers-test", + subfolder="clip_tokenizer", + ) + + image_encoder = CLIPVisionModelWithProjection.from_pretrained( + "hf-internal-testing/unidiffuser-diffusers-test", + subfolder="image_encoder", + ) + # From the Stable Diffusion Image Variation pipeline tests + image_processor = CLIPImageProcessor(crop_size=32, size=32) + # image_processor = CLIPImageProcessor.from_pretrained("hf-internal-testing/tiny-random-clip") + + text_tokenizer = GPT2Tokenizer.from_pretrained( + "hf-internal-testing/unidiffuser-diffusers-test", + subfolder="text_tokenizer", + ) + text_decoder = UniDiffuserTextDecoder.from_pretrained( + "hf-internal-testing/unidiffuser-diffusers-test", + subfolder="text_decoder", + ) + + components = { + "vae": vae, + "text_encoder": text_encoder, + "image_encoder": image_encoder, + "image_processor": image_processor, + "clip_tokenizer": clip_tokenizer, + "text_decoder": text_decoder, + "text_tokenizer": text_tokenizer, + "unet": unet, + "scheduler": scheduler, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image = floats_tensor((1, 3, 32, 32), 
rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + image = Image.fromarray(np.uint8(image)).convert("RGB") + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "an elephant under the sea", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "numpy", + } + return inputs + + def get_fixed_latents(self, device, seed=0): + if type(device) == str: + device = torch.device(device) + generator = torch.Generator(device=device).manual_seed(seed) + # Hardcode the shapes for now. + prompt_latents = randn_tensor((1, 77, 32), generator=generator, device=device, dtype=torch.float32) + vae_latents = randn_tensor((1, 4, 16, 16), generator=generator, device=device, dtype=torch.float32) + clip_latents = randn_tensor((1, 1, 32), generator=generator, device=device, dtype=torch.float32) + + latents = { + "prompt_latents": prompt_latents, + "vae_latents": vae_latents, + "clip_latents": clip_latents, + } + return latents + + def get_dummy_inputs_with_latents(self, device, seed=0): + # image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + # image = image.cpu().permute(0, 2, 3, 1)[0] + # image = Image.fromarray(np.uint8(image)).convert("RGB") + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg", + ) + image = image.resize((32, 32)) + latents = self.get_fixed_latents(device, seed=seed) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + inputs = { + "prompt": "an elephant under the sea", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "numpy", + "prompt_latents": latents.get("prompt_latents"), + "vae_latents": latents.get("vae_latents"), + "clip_latents": latents.get("clip_latents"), + } + return inputs + + def test_unidiffuser_default_joint_v0(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'joint' + unidiffuser_pipe.set_joint_mode() + assert unidiffuser_pipe.mode == "joint" + + # inputs = self.get_dummy_inputs(device) + inputs = self.get_dummy_inputs_with_latents(device) + # Delete prompt and image for joint inference. 
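+        # Since neither a prompt nor an image is passed below, `_infer_mode` would also resolve to
+        # "joint" on its own; a rough end-user sketch of the same flow (assuming the full
+        # "thu-ml/unidiffuser-v1" checkpoint) would be:
+        #   pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1")
+        #   pipe.set_joint_mode()
+        #   sample = pipe(num_inference_steps=20, guidance_scale=8.0)
+        #   image, text = sample.images[0], sample.text[0]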
+ del inputs["prompt"] + del inputs["image"] + sample = unidiffuser_pipe(**inputs) + image = sample.images + text = sample.text + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716]) + assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 + + expected_text_prefix = " no no no " + assert text[0][:10] == expected_text_prefix + + def test_unidiffuser_default_joint_no_cfg_v0(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'joint' + unidiffuser_pipe.set_joint_mode() + assert unidiffuser_pipe.mode == "joint" + + # inputs = self.get_dummy_inputs(device) + inputs = self.get_dummy_inputs_with_latents(device) + # Delete prompt and image for joint inference. + del inputs["prompt"] + del inputs["image"] + # Set guidance scale to 1.0 to turn off CFG + inputs["guidance_scale"] = 1.0 + sample = unidiffuser_pipe(**inputs) + image = sample.images + text = sample.text + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716]) + assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 + + expected_text_prefix = " no no no " + assert text[0][:10] == expected_text_prefix + + def test_unidiffuser_default_text2img_v0(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'text2img' + unidiffuser_pipe.set_text_to_image_mode() + assert unidiffuser_pipe.mode == "text2img" + + inputs = self.get_dummy_inputs_with_latents(device) + # Delete image for text-conditioned image generation + del inputs["image"] + image = unidiffuser_pipe(**inputs).images + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def test_unidiffuser_default_image_0(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'img' + unidiffuser_pipe.set_image_mode() + assert unidiffuser_pipe.mode == "img" + + inputs = self.get_dummy_inputs(device) + # Delete prompt and image for unconditional ("marginal") text generation. 
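+        # (This test covers unconditional image generation; both conditioning inputs are dropped.)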
+ del inputs["prompt"] + del inputs["image"] + image = unidiffuser_pipe(**inputs).images + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array([0.5760, 0.6270, 0.6571, 0.4966, 0.4638, 0.5663, 0.5254, 0.5068, 0.5715]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def test_unidiffuser_default_text_v0(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'img' + unidiffuser_pipe.set_text_mode() + assert unidiffuser_pipe.mode == "text" + + inputs = self.get_dummy_inputs(device) + # Delete prompt and image for unconditional ("marginal") text generation. + del inputs["prompt"] + del inputs["image"] + text = unidiffuser_pipe(**inputs).text + + expected_text_prefix = " no no no " + assert text[0][:10] == expected_text_prefix + + def test_unidiffuser_default_img2text_v0(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'img2text' + unidiffuser_pipe.set_image_to_text_mode() + assert unidiffuser_pipe.mode == "img2text" + + inputs = self.get_dummy_inputs_with_latents(device) + # Delete text for image-conditioned text generation + del inputs["prompt"] + text = unidiffuser_pipe(**inputs).text + + expected_text_prefix = " no no no " + assert text[0][:10] == expected_text_prefix + + def test_unidiffuser_default_joint_v1(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1") + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'joint' + unidiffuser_pipe.set_joint_mode() + assert unidiffuser_pipe.mode == "joint" + + # inputs = self.get_dummy_inputs(device) + inputs = self.get_dummy_inputs_with_latents(device) + # Delete prompt and image for joint inference. 
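+        # The -v1 checkpoint includes the data type embedding, which is why `data_type` is set
+        # explicitly on the inputs below.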
+ del inputs["prompt"] + del inputs["image"] + inputs["data_type"] = 1 + sample = unidiffuser_pipe(**inputs) + image = sample.images + text = sample.text + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_img_slice = np.array([0.5760, 0.6270, 0.6571, 0.4965, 0.4638, 0.5663, 0.5254, 0.5068, 0.5716]) + assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 + + expected_text_prefix = " no no no " + assert text[0][:10] == expected_text_prefix + + def test_unidiffuser_default_text2img_v1(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1") + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'text2img' + unidiffuser_pipe.set_text_to_image_mode() + assert unidiffuser_pipe.mode == "text2img" + + inputs = self.get_dummy_inputs_with_latents(device) + # Delete image for text-conditioned image generation + del inputs["image"] + image = unidiffuser_pipe(**inputs).images + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array([0.5758, 0.6269, 0.6570, 0.4967, 0.4639, 0.5664, 0.5257, 0.5067, 0.5715]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def test_unidiffuser_default_img2text_v1(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + unidiffuser_pipe = UniDiffuserPipeline.from_pretrained("hf-internal-testing/unidiffuser-test-v1") + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'img2text' + unidiffuser_pipe.set_image_to_text_mode() + assert unidiffuser_pipe.mode == "img2text" + + inputs = self.get_dummy_inputs_with_latents(device) + # Delete text for image-conditioned text generation + del inputs["prompt"] + text = unidiffuser_pipe(**inputs).text + + expected_text_prefix = " no no no " + assert text[0][:10] == expected_text_prefix + + def test_unidiffuser_text2img_multiple_images(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'text2img' + unidiffuser_pipe.set_text_to_image_mode() + assert unidiffuser_pipe.mode == "text2img" + + inputs = self.get_dummy_inputs(device) + # Delete image for text-conditioned image generation + del inputs["image"] + inputs["num_images_per_prompt"] = 2 + inputs["num_prompts_per_image"] = 3 + image = unidiffuser_pipe(**inputs).images + assert image.shape == (2, 32, 32, 3) + + def test_unidiffuser_img2text_multiple_prompts(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'img2text' + unidiffuser_pipe.set_image_to_text_mode() + assert unidiffuser_pipe.mode == "img2text" + + inputs = self.get_dummy_inputs(device) + # Delete text for image-conditioned text generation + del inputs["prompt"] + inputs["num_images_per_prompt"] = 2 + inputs["num_prompts_per_image"] = 3 + text = unidiffuser_pipe(**inputs).text + + assert len(text) == 
3 + + def test_unidiffuser_text2img_multiple_images_with_latents(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'text2img' + unidiffuser_pipe.set_text_to_image_mode() + assert unidiffuser_pipe.mode == "text2img" + + inputs = self.get_dummy_inputs_with_latents(device) + # Delete image for text-conditioned image generation + del inputs["image"] + inputs["num_images_per_prompt"] = 2 + inputs["num_prompts_per_image"] = 3 + image = unidiffuser_pipe(**inputs).images + assert image.shape == (2, 32, 32, 3) + + def test_unidiffuser_img2text_multiple_prompts_with_latents(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + unidiffuser_pipe = UniDiffuserPipeline(**components) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'img2text' + unidiffuser_pipe.set_image_to_text_mode() + assert unidiffuser_pipe.mode == "img2text" + + inputs = self.get_dummy_inputs_with_latents(device) + # Delete text for image-conditioned text generation + del inputs["prompt"] + inputs["num_images_per_prompt"] = 2 + inputs["num_prompts_per_image"] = 3 + text = unidiffuser_pipe(**inputs).text + + assert len(text) == 3 + + @require_torch_gpu + def test_unidiffuser_default_joint_v1_cuda_fp16(self): + device = "cuda" + unidiffuser_pipe = UniDiffuserPipeline.from_pretrained( + "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16 + ) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'joint' + unidiffuser_pipe.set_joint_mode() + assert unidiffuser_pipe.mode == "joint" + + inputs = self.get_dummy_inputs_with_latents(device) + # Delete prompt and image for joint inference. + del inputs["prompt"] + del inputs["image"] + inputs["data_type"] = 1 + sample = unidiffuser_pipe(**inputs) + image = sample.images + text = sample.text + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_img_slice = np.array([0.5049, 0.5498, 0.5854, 0.3052, 0.4460, 0.6489, 0.5122, 0.4810, 0.6138]) + assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 + + expected_text_prefix = '" This This' + assert text[0][: len(expected_text_prefix)] == expected_text_prefix + + @require_torch_gpu + def test_unidiffuser_default_text2img_v1_cuda_fp16(self): + device = "cuda" + unidiffuser_pipe = UniDiffuserPipeline.from_pretrained( + "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16 + ) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'text2img' + unidiffuser_pipe.set_text_to_image_mode() + assert unidiffuser_pipe.mode == "text2img" + + inputs = self.get_dummy_inputs_with_latents(device) + # Delete prompt and image for joint inference. 
+ del inputs["image"] + inputs["data_type"] = 1 + sample = unidiffuser_pipe(**inputs) + image = sample.images + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_img_slice = np.array([0.5054, 0.5498, 0.5854, 0.3052, 0.4458, 0.6489, 0.5122, 0.4810, 0.6138]) + assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-3 + + @require_torch_gpu + def test_unidiffuser_default_img2text_v1_cuda_fp16(self): + device = "cuda" + unidiffuser_pipe = UniDiffuserPipeline.from_pretrained( + "hf-internal-testing/unidiffuser-test-v1", torch_dtype=torch.float16 + ) + unidiffuser_pipe = unidiffuser_pipe.to(device) + unidiffuser_pipe.set_progress_bar_config(disable=None) + + # Set mode to 'img2text' + unidiffuser_pipe.set_image_to_text_mode() + assert unidiffuser_pipe.mode == "img2text" + + inputs = self.get_dummy_inputs_with_latents(device) + # Delete prompt and image for joint inference. + del inputs["prompt"] + inputs["data_type"] = 1 + text = unidiffuser_pipe(**inputs).text + + expected_text_prefix = '" This This' + assert text[0][: len(expected_text_prefix)] == expected_text_prefix + + +@slow +@require_torch_gpu +class UniDiffuserPipelineSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, seed=0, generate_latents=False): + generator = torch.manual_seed(seed) + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unidiffuser/unidiffuser_example_image.jpg" + ) + inputs = { + "prompt": "an elephant under the sea", + "image": image, + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 8.0, + "output_type": "numpy", + } + if generate_latents: + latents = self.get_fixed_latents(device, seed=seed) + for latent_name, latent_tensor in latents.items(): + inputs[latent_name] = latent_tensor + return inputs + + def get_fixed_latents(self, device, seed=0): + if type(device) == str: + device = torch.device(device) + latent_device = torch.device("cpu") + generator = torch.Generator(device=latent_device).manual_seed(seed) + # Hardcode the shapes for now. + prompt_latents = randn_tensor((1, 77, 768), generator=generator, device=device, dtype=torch.float32) + vae_latents = randn_tensor((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float32) + clip_latents = randn_tensor((1, 1, 512), generator=generator, device=device, dtype=torch.float32) + + # Move latents onto desired device. + prompt_latents = prompt_latents.to(device) + vae_latents = vae_latents.to(device) + clip_latents = clip_latents.to(device) + + latents = { + "prompt_latents": prompt_latents, + "vae_latents": vae_latents, + "clip_latents": clip_latents, + } + return latents + + def test_unidiffuser_default_joint_v1(self): + pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1") + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + # inputs = self.get_dummy_inputs(device) + inputs = self.get_inputs(device=torch_device, generate_latents=True) + # Delete prompt and image for joint inference. 
+ del inputs["prompt"] + del inputs["image"] + sample = pipe(**inputs) + image = sample.images + text = sample.text + assert image.shape == (1, 512, 512, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520]) + assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-1 + + expected_text_prefix = "A living room" + assert text[0][: len(expected_text_prefix)] == expected_text_prefix + + def test_unidiffuser_default_text2img_v1(self): + pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1") + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs(device=torch_device, generate_latents=True) + del inputs["image"] + sample = pipe(**inputs) + image = sample.images + assert image.shape == (1, 512, 512, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 + + def test_unidiffuser_default_img2text_v1(self): + pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1") + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs(device=torch_device, generate_latents=True) + del inputs["prompt"] + sample = pipe(**inputs) + text = sample.text + + expected_text_prefix = "An astronaut" + assert text[0][: len(expected_text_prefix)] == expected_text_prefix + + def test_unidiffuser_default_joint_v1_fp16(self): + pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + # inputs = self.get_dummy_inputs(device) + inputs = self.get_inputs(device=torch_device, generate_latents=True) + # Delete prompt and image for joint inference. 
+ del inputs["prompt"] + del inputs["image"] + sample = pipe(**inputs) + image = sample.images + text = sample.text + assert image.shape == (1, 512, 512, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520]) + assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-1 + + expected_text_prefix = "A living room" + assert text[0][: len(expected_text_prefix)] == expected_text_prefix + + def test_unidiffuser_default_text2img_v1_fp16(self): + pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs(device=torch_device, generate_latents=True) + del inputs["image"] + sample = pipe(**inputs) + image = sample.images + assert image.shape == (1, 512, 512, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array([0.0242, 0.0103, 0.0022, 0.0129, 0.0000, 0.0090, 0.0376, 0.0508, 0.0005]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-1 + + def test_unidiffuser_default_img2text_v1_fp16(self): + pipe = UniDiffuserPipeline.from_pretrained("thu-ml/unidiffuser-v1", torch_dtype=torch.float16) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + pipe.enable_attention_slicing() + + inputs = self.get_inputs(device=torch_device, generate_latents=True) + del inputs["prompt"] + sample = pipe(**inputs) + text = sample.text + + expected_text_prefix = "An astronaut" + assert text[0][: len(expected_text_prefix)] == expected_text_prefix From 67cf0445ef48b1f913b90ce0025ac0c75673e32e Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 26 May 2023 21:03:25 +0900 Subject: [PATCH 037/199] Fix to apply LoRAXFormersAttnProcessor instead of LoRAAttnProcessor when xFormers is enabled (#3556) * fix to use LoRAXFormersAttnProcessor * add test * using new LoraLoaderMixin.save_lora_weights * add test_lora_save_load_with_xformers --- src/diffusers/loaders.py | 7 ++- tests/models/test_lora_layers.py | 96 +++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 2 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index cea2abe40c3f..3761f0e59d05 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -27,7 +27,9 @@ CustomDiffusionXFormersAttnProcessor, LoRAAttnAddedKVProcessor, LoRAAttnProcessor, + LoRAXFormersAttnProcessor, SlicedAttnAddedKVProcessor, + XFormersAttnProcessor, ) from .utils import ( DIFFUSERS_CACHE, @@ -279,7 +281,10 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict attn_processor_class = LoRAAttnAddedKVProcessor else: cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1] - attn_processor_class = LoRAAttnProcessor + if isinstance(attn_processor, (XFormersAttnProcessor, LoRAXFormersAttnProcessor)): + attn_processor_class = LoRAXFormersAttnProcessor + else: + attn_processor_class = LoRAAttnProcessor attn_processors[key] = attn_processor_class( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 6f1e85e15558..64e30ba4057d 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -22,7 +22,14 @@ from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline, UNet2DConditionModel from diffusers.loaders import AttnProcsLayers, LoraLoaderMixin -from 
diffusers.models.attention_processor import LoRAAttnProcessor +from diffusers.models.attention_processor import ( + Attention, + AttnProcessor, + AttnProcessor2_0, + LoRAAttnProcessor, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, floats_tensor, torch_device @@ -212,3 +219,90 @@ def test_lora_save_load_legacy(self): # Outputs shouldn't match. self.assertFalse(torch.allclose(torch.from_numpy(orig_image_slice), torch.from_numpy(lora_image_slice))) + + def create_lora_weight_file(self, tmpdirname): + _, lora_components = self.get_dummy_components() + LoraLoaderMixin.save_lora_weights( + save_directory=tmpdirname, + unet_lora_layers=lora_components["unet_lora_layers"], + text_encoder_lora_layers=lora_components["text_encoder_lora_layers"], + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + + def test_lora_unet_attn_processors(self): + with tempfile.TemporaryDirectory() as tmpdirname: + self.create_lora_weight_file(tmpdirname) + + pipeline_components, _ = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**pipeline_components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + # check if vanilla attention processors are used + for _, module in sd_pipe.unet.named_modules(): + if isinstance(module, Attention): + self.assertIsInstance(module.processor, (AttnProcessor, AttnProcessor2_0)) + + # load LoRA weight file + sd_pipe.load_lora_weights(tmpdirname) + + # check if lora attention processors are used + for _, module in sd_pipe.unet.named_modules(): + if isinstance(module, Attention): + self.assertIsInstance(module.processor, LoRAAttnProcessor) + + @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU") + def test_lora_unet_attn_processors_with_xformers(self): + with tempfile.TemporaryDirectory() as tmpdirname: + self.create_lora_weight_file(tmpdirname) + + pipeline_components, _ = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**pipeline_components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + # enable XFormers + sd_pipe.enable_xformers_memory_efficient_attention() + + # check if xFormers attention processors are used + for _, module in sd_pipe.unet.named_modules(): + if isinstance(module, Attention): + self.assertIsInstance(module.processor, XFormersAttnProcessor) + + # load LoRA weight file + sd_pipe.load_lora_weights(tmpdirname) + + # check if lora attention processors are used + for _, module in sd_pipe.unet.named_modules(): + if isinstance(module, Attention): + self.assertIsInstance(module.processor, LoRAXFormersAttnProcessor) + + @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU") + def test_lora_save_load_with_xformers(self): + pipeline_components, lora_components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**pipeline_components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + noise, input_ids, pipeline_inputs = self.get_dummy_inputs() + + # enable XFormers + sd_pipe.enable_xformers_memory_efficient_attention() + + original_images = sd_pipe(**pipeline_inputs).images + orig_image_slice = original_images[0, -3:, -3:, -1] + + with tempfile.TemporaryDirectory() as tmpdirname: + LoraLoaderMixin.save_lora_weights( + save_directory=tmpdirname, + unet_lora_layers=lora_components["unet_lora_layers"], + 
text_encoder_lora_layers=lora_components["text_encoder_lora_layers"], + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + sd_pipe.load_lora_weights(tmpdirname) + + lora_images = sd_pipe(**pipeline_inputs).images + lora_image_slice = lora_images[0, -3:, -3:, -1] + + # Outputs shouldn't match. + self.assertFalse(torch.allclose(torch.from_numpy(orig_image_slice), torch.from_numpy(lora_image_slice))) From 1d1f648c6bffa0b15cd102014bf9f7ed6dfc2f7a Mon Sep 17 00:00:00 2001 From: Leon Lin Date: Sat, 27 May 2023 01:58:50 +0800 Subject: [PATCH 038/199] fix dreambooth attention mask (#3541) --- examples/dreambooth/train_dreambooth.py | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 158d03185a54..37b06acb6977 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -681,6 +681,7 @@ def collate_fn(examples, with_prior_preservation=False): } if has_attention_mask: + attention_mask = torch.cat(attention_mask, dim=0) batch["attention_mask"] = attention_mask return batch From bdc75e753ddebdd1922280fea507228fb9f1e1c8 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Fri, 26 May 2023 10:59:44 -0700 Subject: [PATCH 039/199] [IF super res] correctly normalize PIL input (#3536) * [IF super res] correctl normalize PIL input * 175 -> 127.5 --- .../deepfloyd_if/pipeline_if_img2img_superresolution.py | 2 +- .../deepfloyd_if/pipeline_if_inpainting_superresolution.py | 2 +- .../pipelines/deepfloyd_if/pipeline_if_superresolution.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index 94ea1015862d..a49d25137b2a 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -759,7 +759,7 @@ def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device image = [image] if isinstance(image[0], PIL.Image.Image): - image = [np.array(i).astype(np.float32) / 255.0 for i in image] + image = [np.array(i).astype(np.float32) / 127.5 - 1.0 for i in image] image = np.stack(image, axis=0) # to np image = torch.from_numpy(image.transpose(0, 3, 1, 2)) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index 77a3b2594bf1..f255948dc74b 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -795,7 +795,7 @@ def preprocess_image(self, image: PIL.Image.Image, num_images_per_prompt, device image = [image] if isinstance(image[0], PIL.Image.Image): - image = [np.array(i).astype(np.float32) / 255.0 for i in image] + image = [np.array(i).astype(np.float32) / 127.5 - 1.0 for i in image] image = np.stack(image, axis=0) # to np image = torch.from_numpy(image.transpose(0, 3, 1, 2)) diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index 2fe8e6a9d5d5..7a8de51579b7 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -664,7 +664,7 @@ 
def preprocess_image(self, image, num_images_per_prompt, device): image = [image] if isinstance(image[0], PIL.Image.Image): - image = [np.array(i).astype(np.float32) / 255.0 for i in image] + image = [np.array(i).astype(np.float32) / 127.5 - 1.0 for i in image] image = np.stack(image, axis=0) # to np image = torch.from_numpy(image.transpose(0, 3, 1, 2)) From ab986769f1a6401bd1d0a1faf17e85dc67c2e8c4 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 26 May 2023 12:04:15 -0700 Subject: [PATCH 040/199] [docs] Maintenance (#3552) * doc fixes * fix latex * parenthesis on inside --- docs/source/en/api/models.mdx | 2 +- docs/source/en/api/pipelines/overview.mdx | 102 ------------------ docs/source/en/conceptual/contribution.mdx | 2 +- docs/source/en/optimization/fp16.mdx | 14 +-- docs/source/en/optimization/torch2.0.mdx | 2 +- docs/source/en/stable_diffusion.mdx | 2 +- docs/source/en/training/controlnet.mdx | 18 ++-- docs/source/en/training/custom_diffusion.mdx | 12 ++- docs/source/en/training/dreambooth.mdx | 11 +- docs/source/en/training/instructpix2pix.mdx | 9 +- docs/source/en/training/text2image.mdx | 44 +++++--- docs/source/en/training/text_inversion.mdx | 6 +- .../en/training/unconditional_training.mdx | 3 +- .../conditional_image_generation.mdx | 4 +- docs/source/en/using-diffusers/schedulers.mdx | 5 +- .../en/using-diffusers/using_safetensors.mdx | 9 +- .../en/using-diffusers/write_own_pipeline.mdx | 2 +- 17 files changed, 87 insertions(+), 160 deletions(-) diff --git a/docs/source/en/api/models.mdx b/docs/source/en/api/models.mdx index 2361fd4f6597..74291f9173ea 100644 --- a/docs/source/en/api/models.mdx +++ b/docs/source/en/api/models.mdx @@ -13,7 +13,7 @@ specific language governing permissions and limitations under the License. # Models Diffusers contains pretrained models for popular algorithms and modules for creating the next set of diffusion models. -The primary function of these models is to denoise an input sample, by modeling the distribution $p_\theta(\mathbf{x}_{t-1}|\mathbf{x}_t)$. +The primary function of these models is to denoise an input sample, by modeling the distribution \\(p_{\theta}(x_{t-1}|x_{t})\\). The models are built on the base class ['ModelMixin'] that is a `torch.nn.module` with basic functionality for saving and loading models both locally and from the HuggingFace hub. ## ModelMixin diff --git a/docs/source/en/api/pipelines/overview.mdx b/docs/source/en/api/pipelines/overview.mdx index 2b2f95590016..0ae3d897a3b1 100644 --- a/docs/source/en/api/pipelines/overview.mdx +++ b/docs/source/en/api/pipelines/overview.mdx @@ -113,105 +113,3 @@ each pipeline, one should look directly into the respective pipeline. **Note**: All pipelines have PyTorch's autograd disabled by decorating the `__call__` method with a [`torch.no_grad`](https://pytorch.org/docs/stable/generated/torch.no_grad.html) decorator because pipelines should not be used for training. If you want to store the gradients during the forward pass, we recommend writing your own pipeline, see also our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community). - -## Contribution - -We are more than happy about any contribution to the officially supported pipelines 🤗. We aspire -all of our pipelines to be **self-contained**, **easy-to-tweak**, **beginner-friendly** and for **one-purpose-only**. - -- **Self-contained**: A pipeline shall be as self-contained as possible. 
More specifically, this means that all functionality should be either directly defined in the pipeline file itself, should be inherited from (and only from) the [`DiffusionPipeline` class](.../diffusion_pipeline) or be directly attached to the model and scheduler components of the pipeline. -- **Easy-to-use**: Pipelines should be extremely easy to use - one should be able to load the pipeline and -use it for its designated task, *e.g.* text-to-image generation, in just a couple of lines of code. Most -logic including pre-processing, an unrolled diffusion loop, and post-processing should all happen inside the `__call__` method. -- **Easy-to-tweak**: Certain pipelines will not be able to handle all use cases and tasks that you might like them to. If you want to use a certain pipeline for a specific use case that is not yet supported, you might have to copy the pipeline file and tweak the code to your needs. We try to make the pipeline code as readable as possible so that each part –from pre-processing to diffusing to post-processing– can easily be adapted. If you would like the community to benefit from your customized pipeline, we would love to see a contribution to our [community-examples](https://github.com/huggingface/diffusers/tree/main/examples/community). If you feel that an important pipeline should be part of the official pipelines but isn't, a contribution to the [official pipelines](./overview) would be even better. -- **One-purpose-only**: Pipelines should be used for one task and one task only. Even if two tasks are very similar from a modeling point of view, *e.g.* image2image translation and in-painting, pipelines shall be used for one task only to keep them *easy-to-tweak* and *readable*. - -## Examples - -### Text-to-Image generation with Stable Diffusion - -```python -# make sure you're logged in with `huggingface-cli login` -from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler - -pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") -pipe = pipe.to("cuda") - -prompt = "a photo of an astronaut riding a horse on mars" -image = pipe(prompt).images[0] - -image.save("astronaut_rides_horse.png") -``` - -### Image-to-Image text-guided generation with Stable Diffusion - -The `StableDiffusionImg2ImgPipeline` lets you pass a text prompt and an initial image to condition the generation of new images. 
- -```python -import requests -from PIL import Image -from io import BytesIO - -from diffusers import StableDiffusionImg2ImgPipeline - -# load the pipeline -device = "cuda" -pipe = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to( - device -) - -# let's download an initial image -url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" - -response = requests.get(url) -init_image = Image.open(BytesIO(response.content)).convert("RGB") -init_image = init_image.resize((768, 512)) - -prompt = "A fantasy landscape, trending on artstation" - -images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images - -images[0].save("fantasy_landscape.png") -``` -You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) - -### Tweak prompts reusing seeds and latents - -You can generate your own latents to reproduce results, or tweak your prompt on a specific result you liked. [This notebook](https://github.com/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) shows how to do it step by step. You can also run it in Google Colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pcuenca/diffusers-examples/blob/main/notebooks/stable-diffusion-seeds.ipynb) - - -### In-painting using Stable Diffusion - -The `StableDiffusionInpaintPipeline` lets you edit specific parts of an image by providing a mask and text prompt. - -```python -import PIL -import requests -import torch -from io import BytesIO - -from diffusers import StableDiffusionInpaintPipeline - - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - - -img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" -mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" - -init_image = download_image(img_url).resize((512, 512)) -mask_image = download_image(mask_url).resize((512, 512)) - -pipe = StableDiffusionInpaintPipeline.from_pretrained( - "runwayml/stable-diffusion-inpainting", - torch_dtype=torch.float16, -) -pipe = pipe.to("cuda") - -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] -``` - -You can also run this example on colab [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) diff --git a/docs/source/en/conceptual/contribution.mdx b/docs/source/en/conceptual/contribution.mdx index 7b78d318b679..ea1d15f2124c 100644 --- a/docs/source/en/conceptual/contribution.mdx +++ b/docs/source/en/conceptual/contribution.mdx @@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License. We ❤️ contributions from the open-source community! Everyone is welcome, and all types of participation –not just code– are valued and appreciated. 
Answering questions, helping others, reaching out, and improving the documentation are all immensely valuable to the community, so don't be afraid and get involved if you're up for it! -Everyone is encouraged to start by saying 👋 in our public Discord channel. We discuss the latest trends in diffusion models, ask questions, show off personal projects, help each other with contributions, or just hang out ☕. Join us on Discord +Everyone is encouraged to start by saying 👋 in our public Discord channel. We discuss the latest trends in diffusion models, ask questions, show off personal projects, help each other with contributions, or just hang out ☕. Join us on Discord Whichever way you choose to contribute, we strive to be part of an open, welcoming, and kind community. Please, read our [code of conduct](https://github.com/huggingface/diffusers/blob/main/CODE_OF_CONDUCT.md) and be mindful to respect it during your interactions. We also recommend you become familiar with the [ethical guidelines](https://huggingface.co/docs/diffusers/conceptual/ethical_guidelines) that guide our project and ask you to adhere to the same principles of transparency and responsibility. diff --git a/docs/source/en/optimization/fp16.mdx b/docs/source/en/optimization/fp16.mdx index 4081cfc6efd6..8b3a62cba099 100644 --- a/docs/source/en/optimization/fp16.mdx +++ b/docs/source/en/optimization/fp16.mdx @@ -50,7 +50,6 @@ from diffusers import DiffusionPipeline pipe = DiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, ) pipe = pipe.to("cuda") @@ -85,7 +84,6 @@ from diffusers import DiffusionPipeline pipe = DiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, ) pipe = pipe.to("cuda") @@ -112,7 +110,6 @@ from diffusers import StableDiffusionPipeline pipe = StableDiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, ) pipe = pipe.to("cuda") @@ -166,7 +163,6 @@ from diffusers import StableDiffusionPipeline pipe = StableDiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, ) @@ -191,7 +187,6 @@ from diffusers import StableDiffusionPipeline pipe = StableDiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, ) @@ -409,7 +404,14 @@ Here are the speedups we obtain on a few Nvidia GPUs when running the inference | A100-SXM4-40GB | 18.6it/s | 29.it/s | | A100-SXM-80GB | 18.7it/s | 29.5it/s | -To leverage it just make sure you have: +To leverage it just make sure you have: + + + +If you have PyTorch 2.0 installed, you shouldn't use xFormers! + + + - PyTorch > 1.12 - Cuda available - [Installed the xformers library](xformers). diff --git a/docs/source/en/optimization/torch2.0.mdx b/docs/source/en/optimization/torch2.0.mdx index 05a4043d26d1..6e8466fd6ecc 100644 --- a/docs/source/en/optimization/torch2.0.mdx +++ b/docs/source/en/optimization/torch2.0.mdx @@ -23,7 +23,7 @@ To benefit from the accelerated attention implementation and `torch.compile()`, when PyTorch 2.0 is available. ```bash -pip install --upgrade torch torchvision diffusers +pip install --upgrade torch diffusers ``` ## Using accelerated transformers and `torch.compile`. 
diff --git a/docs/source/en/stable_diffusion.mdx b/docs/source/en/stable_diffusion.mdx index 64c90c7f6477..78fa848421d8 100644 --- a/docs/source/en/stable_diffusion.mdx +++ b/docs/source/en/stable_diffusion.mdx @@ -266,6 +266,6 @@ image_grid(images) In this tutorial, you learned how to optimize a [`DiffusionPipeline`] for computational and memory efficiency as well as improving the quality of generated outputs. If you're interested in making your pipeline even faster, take a look at the following resources: -- Learn how [PyTorch 2.0](./optimization/torch2.0) and [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) can yield 5 - 300% faster inference speed. +- Learn how [PyTorch 2.0](./optimization/torch2.0) and [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) can yield 5 - 300% faster inference speed. On an A100 GPU, inference can be up to 50% faster! - If you can't use PyTorch 2, we recommend you install [xFormers](./optimization/xformers). Its memory-efficient attention mechanism works great with PyTorch 1.13.1 for faster speed and reduced memory consumption. - Other optimization techniques, such as model offloading, are covered in [this guide](./optimization/fp16). diff --git a/docs/source/en/training/controlnet.mdx b/docs/source/en/training/controlnet.mdx index 476081c88704..16a9ba95f057 100644 --- a/docs/source/en/training/controlnet.mdx +++ b/docs/source/en/training/controlnet.mdx @@ -97,7 +97,8 @@ accelerate launch train_controlnet.py \ --learning_rate=1e-5 \ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ - --train_batch_size=4 + --train_batch_size=4 \ + --push_to_hub ``` This default configuration requires ~38GB VRAM. @@ -120,7 +121,8 @@ accelerate launch train_controlnet.py \ --validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \ --validation_prompt "red circle with blue background" "cyan circle with brown floral background" \ --train_batch_size=1 \ - --gradient_accumulation_steps=4 + --gradient_accumulation_steps=4 \ + --push_to_hub ``` ## Training with multiple GPUs @@ -143,7 +145,8 @@ accelerate launch --mixed_precision="fp16" --multi_gpu train_controlnet.py \ --train_batch_size=4 \ --mixed_precision="fp16" \ --tracker_project_name="controlnet-demo" \ - --report_to=wandb + --report_to=wandb \ + --push_to_hub ``` ## Example results @@ -191,7 +194,8 @@ accelerate launch train_controlnet.py \ --train_batch_size=1 \ --gradient_accumulation_steps=4 \ --gradient_checkpointing \ - --use_8bit_adam + --use_8bit_adam \ + --push_to_hub ``` ## Training on a 12 GB GPU @@ -219,7 +223,8 @@ accelerate launch train_controlnet.py \ --gradient_checkpointing \ --use_8bit_adam \ --enable_xformers_memory_efficient_attention \ - --set_grads_to_none + --set_grads_to_none \ + --push_to_hub ``` When using `enable_xformers_memory_efficient_attention`, please make sure to install `xformers` by `pip install xformers`. 
@@ -283,7 +288,8 @@ accelerate launch train_controlnet.py \ --gradient_checkpointing \ --enable_xformers_memory_efficient_attention \ --set_grads_to_none \ - --mixed_precision fp16 + --mixed_precision fp16 \ + --push_to_hub ``` ## Inference diff --git a/docs/source/en/training/custom_diffusion.mdx b/docs/source/en/training/custom_diffusion.mdx index dda9c17c7ebc..ffee456de41f 100644 --- a/docs/source/en/training/custom_diffusion.mdx +++ b/docs/source/en/training/custom_diffusion.mdx @@ -100,7 +100,8 @@ accelerate launch train_custom_diffusion.py \ --lr_warmup_steps=0 \ --max_train_steps=250 \ --scale_lr --hflip \ - --modifier_token "" + --modifier_token "" \ + --push_to_hub ``` **Use `--enable_xformers_memory_efficient_attention` for faster training with lower VRAM requirement (16GB per GPU). Follow [this guide](https://github.com/facebookresearch/xformers) for installation instructions.** @@ -132,7 +133,8 @@ accelerate launch train_custom_diffusion.py \ --scale_lr --hflip \ --modifier_token "" \ --validation_prompt=" cat sitting in a bucket" \ - --report_to="wandb" + --report_to="wandb" \ + --push_to_hub ``` Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/26ghrcau) where you can check out the intermediate results along with other training details. @@ -168,7 +170,8 @@ accelerate launch train_custom_diffusion.py \ --max_train_steps=500 \ --num_class_images=200 \ --scale_lr --hflip \ - --modifier_token "+" + --modifier_token "+" \ + --push_to_hub ``` Here is an example [Weights and Biases page](https://wandb.ai/sayakpaul/custom-diffusion/runs/3990tzkg) where you can check out the intermediate results along with other training details. @@ -207,7 +210,8 @@ accelerate launch train_custom_diffusion.py \ --scale_lr --hflip --noaug \ --freeze_model crossattn \ --modifier_token "" \ - --enable_xformers_memory_efficient_attention + --enable_xformers_memory_efficient_attention \ + --push_to_hub ``` ## Inference diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx index de93772abedd..039cf1f5ca7b 100644 --- a/docs/source/en/training/dreambooth.mdx +++ b/docs/source/en/training/dreambooth.mdx @@ -130,7 +130,8 @@ python train_dreambooth_flax.py \ --resolution=512 \ --train_batch_size=1 \ --learning_rate=5e-6 \ - --max_train_steps=400 + --max_train_steps=400 \ + --push_to_hub ``` @@ -187,7 +188,8 @@ python train_dreambooth_flax.py \ --train_batch_size=1 \ --learning_rate=5e-6 \ --num_class_images=200 \ - --max_train_steps=800 + --max_train_steps=800 \ + --push_to_hub ``` @@ -223,7 +225,7 @@ accelerate launch train_dreambooth.py \ --class_prompt="a photo of dog" \ --resolution=512 \ --train_batch_size=1 \ - --use_8bit_adam + --use_8bit_adam \ --gradient_checkpointing \ --learning_rate=2e-6 \ --lr_scheduler="constant" \ @@ -253,7 +255,8 @@ python train_dreambooth_flax.py \ --train_batch_size=1 \ --learning_rate=2e-6 \ --num_class_images=200 \ - --max_train_steps=800 + --max_train_steps=800 \ + --push_to_hub ``` diff --git a/docs/source/en/training/instructpix2pix.mdx b/docs/source/en/training/instructpix2pix.mdx index 2a9e99cda1f2..64d97ecd6c83 100644 --- a/docs/source/en/training/instructpix2pix.mdx +++ b/docs/source/en/training/instructpix2pix.mdx @@ -100,7 +100,8 @@ accelerate launch --mixed_precision="fp16" train_instruct_pix2pix.py \ --learning_rate=5e-05 --max_grad_norm=1 --lr_warmup_steps=0 \ --conditioning_dropout_prob=0.05 \ --mixed_precision=fp16 \ - --seed=42 + --seed=42 \ + --push_to_hub ``` Additionally, we 
support performing validation inference to monitor training progress @@ -121,7 +122,8 @@ accelerate launch --mixed_precision="fp16" train_instruct_pix2pix.py \ --val_image_url="https://hf.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png" \ --validation_prompt="make the mountains snowy" \ --seed=42 \ - --report_to=wandb + --report_to=wandb \ + --push_to_hub ``` We recommend this type of validation as it can be useful for model debugging. Note that you need `wandb` installed to use this. You can install `wandb` by running `pip install wandb`. @@ -148,7 +150,8 @@ accelerate launch --mixed_precision="fp16" --multi_gpu train_instruct_pix2pix.py --learning_rate=5e-05 --lr_warmup_steps=0 \ --conditioning_dropout_prob=0.05 \ --mixed_precision=fp16 \ - --seed=42 + --seed=42 \ + --push_to_hub ``` ## Inference diff --git a/docs/source/en/training/text2image.mdx b/docs/source/en/training/text2image.mdx index 8535e6ffac70..eb8a120c0211 100644 --- a/docs/source/en/training/text2image.mdx +++ b/docs/source/en/training/text2image.mdx @@ -76,13 +76,25 @@ Launch the [PyTorch training script](https://github.com/huggingface/diffusers/bl Specify the `MODEL_NAME` environment variable (either a Hub model repository id or a path to the directory containing the model weights) and pass it to the [`pretrained_model_name_or_path`](https://huggingface.co/docs/diffusers/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.from_pretrained.pretrained_model_name_or_path) argument. - -{"path": "../../../../examples/text_to_image/README.md", -"language": "bash", -"start-after": "accelerate_snippet_start", -"end-before": "accelerate_snippet_end", -"dedent": 0} - +```bash +export MODEL_NAME="CompVis/stable-diffusion-v1-4" +export dataset_name="lambdalabs/pokemon-blip-captions" + +accelerate launch --mixed_precision="fp16" train_text_to_image.py \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --dataset_name=$dataset_name \ + --use_ema \ + --resolution=512 --center_crop --random_flip \ + --train_batch_size=1 \ + --gradient_accumulation_steps=4 \ + --gradient_checkpointing \ + --max_train_steps=15000 \ + --learning_rate=1e-05 \ + --max_grad_norm=1 \ + --lr_scheduler="constant" --lr_warmup_steps=0 \ + --output_dir="sd-pokemon-model" \ + --push_to_hub +``` To finetune on your own dataset, prepare the dataset according to the format required by 🤗 [Datasets](https://huggingface.co/docs/datasets/index). You can [upload your dataset to the Hub](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub), or you can [prepare a local folder with your files](https://huggingface.co/docs/datasets/image_dataset#imagefolder). 
@@ -105,8 +117,10 @@ accelerate launch train_text_to_image.py \
 --max_train_steps=15000 \
 --learning_rate=1e-05 \
 --max_grad_norm=1 \
- --lr_scheduler="constant" --lr_warmup_steps=0 \
- --output_dir=${OUTPUT_DIR}
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --output_dir=${OUTPUT_DIR} \
+ --push_to_hub
 ```
 #### Training with multiple GPUs
@@ -129,8 +143,10 @@ accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \
 --max_train_steps=15000 \
 --learning_rate=1e-05 \
 --max_grad_norm=1 \
- --lr_scheduler="constant" --lr_warmup_steps=0 \
- --output_dir="sd-pokemon-model"
+ --lr_scheduler="constant" \
+ --lr_warmup_steps=0 \
+ --output_dir="sd-pokemon-model" \
+ --push_to_hub
 ```
@@ -159,7 +175,8 @@ python train_text_to_image_flax.py \
 --max_train_steps=15000 \
 --learning_rate=1e-05 \
 --max_grad_norm=1 \
- --output_dir="sd-pokemon-model"
+ --output_dir="sd-pokemon-model" \
+ --push_to_hub
 ```
 To finetune on your own dataset, prepare the dataset according to the format required by 🤗 [Datasets](https://huggingface.co/docs/datasets/index). You can [upload your dataset to the Hub](https://huggingface.co/docs/datasets/image_dataset#upload-dataset-to-the-hub), or you can [prepare a local folder with your files](https://huggingface.co/docs/datasets/image_dataset#imagefolder).
@@ -179,7 +196,8 @@ python train_text_to_image_flax.py \
 --max_train_steps=15000 \
 --learning_rate=1e-05 \
 --max_grad_norm=1 \
- --output_dir="sd-pokemon-model"
+ --output_dir="sd-pokemon-model" \
+ --push_to_hub
 ```
diff --git a/docs/source/en/training/text_inversion.mdx b/docs/source/en/training/text_inversion.mdx
index 1afecc7b71bb..a4fe4c2c4e5b 100644
--- a/docs/source/en/training/text_inversion.mdx
+++ b/docs/source/en/training/text_inversion.mdx
@@ -120,7 +120,8 @@ accelerate launch textual_inversion.py \
 --learning_rate=5.0e-04 --scale_lr \
 --lr_scheduler="constant" \
 --lr_warmup_steps=0 \
- --output_dir="textual_inversion_cat"
+ --output_dir="textual_inversion_cat" \
+ --push_to_hub
 ```
@@ -161,7 +162,8 @@ python textual_inversion_flax.py \
 --train_batch_size=1 \
 --max_train_steps=3000 \
 --learning_rate=5.0e-04 --scale_lr \
- --output_dir="textual_inversion_cat"
+ --output_dir="textual_inversion_cat" \
+ --push_to_hub
 ```
diff --git a/docs/source/en/training/unconditional_training.mdx b/docs/source/en/training/unconditional_training.mdx
index 164b4f599f1e..7a588cc4cc63 100644
--- a/docs/source/en/training/unconditional_training.mdx
+++ b/docs/source/en/training/unconditional_training.mdx
@@ -141,5 +141,6 @@ accelerate launch --mixed_precision="fp16" --multi_gpu train_unconditional.py \
 --learning_rate=1e-4 \
 --lr_warmup_steps=500 \
 --mixed_precision="fp16" \
- --logger="wandb"
+ --logger="wandb" \
+ --push_to_hub
 ```
\ No newline at end of file
diff --git a/docs/source/en/using-diffusers/conditional_image_generation.mdx b/docs/source/en/using-diffusers/conditional_image_generation.mdx
index 0b5c02415d87..195aa2d6c360 100644
--- a/docs/source/en/using-diffusers/conditional_image_generation.mdx
+++ b/docs/source/en/using-diffusers/conditional_image_generation.mdx
@@ -20,12 +20,12 @@ The [`DiffusionPipeline`] is the easiest way to use a pre-trained diffusion syst
 Start by creating an instance of [`DiffusionPipeline`] and specify which pipeline [checkpoint](https://huggingface.co/models?library=diffusers&sort=downloads) you would like to download.
-In this guide, you'll use [`DiffusionPipeline`] for text-to-image generation with [Latent Diffusion](https://huggingface.co/CompVis/ldm-text2im-large-256): +In this guide, you'll use [`DiffusionPipeline`] for text-to-image generation with [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5): ```python >>> from diffusers import DiffusionPipeline ->>> generator = DiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") +>>> generator = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5") ``` The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components. diff --git a/docs/source/en/using-diffusers/schedulers.mdx b/docs/source/en/using-diffusers/schedulers.mdx index e17d826c7dab..741d92bdd90d 100644 --- a/docs/source/en/using-diffusers/schedulers.mdx +++ b/docs/source/en/using-diffusers/schedulers.mdx @@ -28,18 +28,15 @@ The following paragraphs show how to do so with the 🧨 Diffusers library. ## Load pipeline -Let's start by loading the stable diffusion pipeline. -Remember that you have to be a registered user on the 🤗 Hugging Face Hub, and have "click-accepted" the [license](https://huggingface.co/runwayml/stable-diffusion-v1-5) in order to use stable diffusion. +Let's start by loading the [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5) model in the [`DiffusionPipeline`]: ```python from huggingface_hub import login from diffusers import DiffusionPipeline import torch -# first we need to login with our access token login() -# Now we can download the pipeline pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16) ``` diff --git a/docs/source/en/using-diffusers/using_safetensors.mdx b/docs/source/en/using-diffusers/using_safetensors.mdx index 2767b95f3bcc..2015f2faf85a 100644 --- a/docs/source/en/using-diffusers/using_safetensors.mdx +++ b/docs/source/en/using-diffusers/using_safetensors.mdx @@ -30,14 +30,7 @@ pipeline = StableDiffusionPipeline.from_ckpt( ## Convert to safetensors -Not all weights on the Hub are available in the `.safetensors` format, and you may encounter weights stored as `.bin`. In this case, use the Space below to convert the weights to `.safetensors`. The Convert Space downloads the pickled weights, converts them, and opens a Pull Request to upload the newly converted `.safetensors` file on the Hub. This way, if there is any malicious code contained in the pickled files, they're uploaded to the Hub - which has a [security scanner](https://huggingface.co/docs/hub/security-pickle#hubs-security-scanner) to detect unsafe files and suspicious pickle imports - instead of your computer. - - +Not all weights on the Hub are available in the `.safetensors` format, and you may encounter weights stored as `.bin`. In this case, use the [Convert Space](https://huggingface.co/spaces/diffusers/convert) to convert the weights to `.safetensors`. The Convert Space downloads the pickled weights, converts them, and opens a Pull Request to upload the newly converted `.safetensors` file on the Hub. This way, if there is any malicious code contained in the pickled files, they're uploaded to the Hub - which has a [security scanner](https://huggingface.co/docs/hub/security-pickle#hubs-security-scanner) to detect unsafe files and suspicious pickle imports - instead of your computer. 
You can use the model with the new `.safetensors` weights by specifying the reference to the Pull Request in the `revision` parameter (you can also test it in this [Check PR](https://huggingface.co/spaces/diffusers/check_pr) Space on the Hub), for example `refs/pr/22`: diff --git a/docs/source/en/using-diffusers/write_own_pipeline.mdx b/docs/source/en/using-diffusers/write_own_pipeline.mdx index 3dca40dff714..be92980118b1 100644 --- a/docs/source/en/using-diffusers/write_own_pipeline.mdx +++ b/docs/source/en/using-diffusers/write_own_pipeline.mdx @@ -36,7 +36,7 @@ A pipeline is a quick and easy way to run a model for inference, requiring no mo That was super easy, but how did the pipeline do that? Let's breakdown the pipeline and take a look at what's happening under the hood. -In the example above, the pipeline contains a UNet model and a DDPM scheduler. The pipeline denoises an image by taking random noise the size of the desired output and passing it through the model several times. At each timestep, the model predicts the *noise residual* and the scheduler uses it to predict a less noisy image. The pipeline repeats this process until it reaches the end of the specified number of inference steps. +In the example above, the pipeline contains a [`UNet2DModel`] model and a [`DDPMScheduler`]. The pipeline denoises an image by taking random noise the size of the desired output and passing it through the model several times. At each timestep, the model predicts the *noise residual* and the scheduler uses it to predict a less noisy image. The pipeline repeats this process until it reaches the end of the specified number of inference steps. To recreate the pipeline with the model and scheduler separately, let's write our own denoising process. From 9917c329165e39fd05c404ada6beea97c8fa0e18 Mon Sep 17 00:00:00 2001 From: Brandon Date: Sat, 27 May 2023 03:10:32 +0800 Subject: [PATCH 041/199] [docs] update the broken links (#3568) update the broken links update the broken links for training folder doc --- docs/source/en/tutorials/basic_training.mdx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/en/tutorials/basic_training.mdx b/docs/source/en/tutorials/basic_training.mdx index 52ce7c71fa68..99221274f745 100644 --- a/docs/source/en/tutorials/basic_training.mdx +++ b/docs/source/en/tutorials/basic_training.mdx @@ -407,9 +407,9 @@ Once training is complete, take a look at the final 🦋 images 🦋 generated b ## Next steps -Unconditional image generation is one example of a task that can be trained. You can explore other tasks and training techniques by visiting the [🧨 Diffusers Training Examples](./training/overview) page. Here are some examples of what you can learn: +Unconditional image generation is one example of a task that can be trained. You can explore other tasks and training techniques by visiting the [🧨 Diffusers Training Examples](../training/overview) page. Here are some examples of what you can learn: -* [Textual Inversion](./training/text_inversion), an algorithm that teaches a model a specific visual concept and integrates it into the generated image. -* [DreamBooth](./training/dreambooth), a technique for generating personalized images of a subject given several input images of the subject. -* [Guide](./training/text2image) to finetuning a Stable Diffusion model on your own dataset. -* [Guide](./training/lora) to using LoRA, a memory-efficient technique for finetuning really large models faster. 
+* [Textual Inversion](../training/text_inversion), an algorithm that teaches a model a specific visual concept and integrates it into the generated image. +* [DreamBooth](../training/dreambooth), a technique for generating personalized images of a subject given several input images of the subject. +* [Guide](../training/text2image) to finetuning a Stable Diffusion model on your own dataset. +* [Guide](../training/lora) to using LoRA, a memory-efficient technique for finetuning really large models faster. From 5559d0423771e8b6e454b1541164e0d5d54b6265 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 26 May 2023 14:37:51 -0700 Subject: [PATCH 042/199] [docs] Working with different formats (#3534) * add ckpt * fix format * apply feedback * fix * include pb * rename file --- docs/source/en/_toctree.yml | 4 +- docs/source/en/using-diffusers/kerascv.mdx | 179 ------------------ .../en/using-diffusers/other-formats.mdx | 126 ++++++++++++ 3 files changed, 128 insertions(+), 181 deletions(-) delete mode 100644 docs/source/en/using-diffusers/kerascv.mdx create mode 100644 docs/source/en/using-diffusers/other-formats.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 86b0da3de303..5bd271c18873 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -28,8 +28,8 @@ title: Load community pipelines - local: using-diffusers/using_safetensors title: Load safetensors - - local: using-diffusers/kerascv - title: Load KerasCV Stable Diffusion checkpoints + - local: using-diffusers/other-formats + title: Load different Stable Diffusion formats title: Loading & Hub - sections: - local: using-diffusers/pipeline_overview diff --git a/docs/source/en/using-diffusers/kerascv.mdx b/docs/source/en/using-diffusers/kerascv.mdx deleted file mode 100644 index 06981cc8fdd1..000000000000 --- a/docs/source/en/using-diffusers/kerascv.mdx +++ /dev/null @@ -1,179 +0,0 @@ - - -# Using KerasCV Stable Diffusion Checkpoints in Diffusers - - - -This is an experimental feature. - - - -[KerasCV](https://github.com/keras-team/keras-cv/) provides APIs for implementing various computer vision workflows. It -also provides the Stable Diffusion [v1 and v2](https://github.com/keras-team/keras-cv/blob/master/keras_cv/models/stable_diffusion) -models. Many practitioners find it easy to fine-tune the Stable Diffusion models shipped by KerasCV. However, as of this writing, KerasCV offers limited support to experiment with Stable Diffusion models for inference and deployment. On the other hand, -Diffusers provides tooling dedicated to this purpose (and more), such as different [noise schedulers](https://huggingface.co/docs/diffusers/using-diffusers/schedulers), [flash attention](https://huggingface.co/docs/diffusers/optimization/xformers), and [other -optimization techniques](https://huggingface.co/docs/diffusers/optimization/fp16). - -How about fine-tuning Stable Diffusion models in KerasCV and exporting them such that they become compatible with Diffusers to combine the -best of both worlds? We have created a [tool](https://huggingface.co/spaces/sayakpaul/convert-kerascv-sd-diffusers) that -lets you do just that! It takes KerasCV Stable Diffusion checkpoints and exports them to Diffusers-compatible checkpoints. -More specifically, it first converts the checkpoints to PyTorch and then wraps them into a -[`StableDiffusionPipeline`](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview) which is ready -for inference. 
Finally, it pushes the converted checkpoints to a repository on the Hugging Face Hub. - -We welcome you to try out the tool [here](https://huggingface.co/spaces/sayakpaul/convert-kerascv-sd-diffusers) -and share feedback via [discussions](https://huggingface.co/spaces/sayakpaul/convert-kerascv-sd-diffusers/discussions/new). - -## Getting Started - -First, you need to obtain the fine-tuned KerasCV Stable Diffusion checkpoints. We provide an -overview of the different ways Stable Diffusion models can be fine-tuned [using `diffusers`](https://huggingface.co/docs/diffusers/training/overview). For the Keras implementation of some of these methods, you can check out these resources: - -* [Teach StableDiffusion new concepts via Textual Inversion](https://keras.io/examples/generative/fine_tune_via_textual_inversion/) -* [Fine-tuning Stable Diffusion](https://keras.io/examples/generative/finetune_stable_diffusion/) -* [DreamBooth](https://keras.io/examples/generative/dreambooth/) -* [Prompt-to-Prompt editing](https://github.com/miguelCalado/prompt-to-prompt-tensorflow) - -Stable Diffusion is comprised of the following models: - -* Text encoder -* UNet -* VAE - -Depending on the fine-tuning task, we may fine-tune one or more of these components (the VAE is almost always left untouched). Here are some common combinations: - -* DreamBooth: UNet and text encoder -* Classical text to image fine-tuning: UNet -* Textual Inversion: Just the newly initialized embeddings in the text encoder - -### Performing the Conversion - -Let's use [this checkpoint](https://huggingface.co/sayakpaul/textual-inversion-kerasio/resolve/main/textual_inversion_kerasio.h5) which was generated -by conducting Textual Inversion with the following "placeholder token": ``. - -On the tool, we supply the following things: - -* Path(s) to download the fine-tuned checkpoint(s) (KerasCV) -* An HF token -* Placeholder token (only applicable for Textual Inversion) - -
- -As soon as you hit "Submit", the conversion process will begin. Once it's complete, you should see the following: - -
- -If you click the [link](https://huggingface.co/sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline/tree/main), you -should see something like so: - -
- -
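If you would rather inspect the converted repository programmatically than browse it in the UI, here is a small sketch using `huggingface_hub`; the exact file listing depends on what the conversion tool pushed for your checkpoint:

```py
from huggingface_hub import list_repo_files

# A converted pipeline repo typically contains model_index.json plus subfolders
# such as unet/, vae/, text_encoder/, tokenizer/ and scheduler/.
files = list_repo_files("sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline")
print("\n".join(files))
```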
- -If you head over to the [model card of the repository](https://huggingface.co/sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline), the -following should appear: - -
- -
- 
-
-<Tip>
-
-Note that we're not specifying the UNet weights here since the UNet is not fine-tuned during Textual Inversion.
-
-</Tip>
-
-And that's it! You now have your fine-tuned KerasCV Stable Diffusion model in Diffusers 🧨.
-
-## Using the Converted Model in Diffusers
-
-Just beside the model card of the [repository](https://huggingface.co/sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline),
-you'll notice an inference widget to try out the model directly from the UI 🤗
-
-<div align="center">
- -
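The widget is backed by the hosted Inference API, so you can also query the converted model over plain HTTP. The snippet below is only a rough sketch: it assumes the repository is reachable through the free Inference API, that you substitute your own access token, and it uses a generic prompt because the learned placeholder token is specific to your conversion:

```py
import requests

API_URL = "https://api-inference.huggingface.co/models/sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline"
headers = {"Authorization": "Bearer hf_..."}  # replace with your own token

# Text-to-image models on the hosted API return raw image bytes on success.
response = requests.post(
    API_URL, headers=headers, json={"inputs": "a photo of a cat, photorealistic, high quality"}
)
with open("widget_output.png", "wb") as f:
    f.write(response.content)
```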
- -On the top right hand side, we provide a "Use in Diffusers" button. If you click the button, you should see the following code-snippet: - -```py -from diffusers import DiffusionPipeline - -pipeline = DiffusionPipeline.from_pretrained("sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline") -``` - -The model is in standard `diffusers` format. Let's perform inference! - -```py -from diffusers import DiffusionPipeline - -pipeline = DiffusionPipeline.from_pretrained("sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline") -pipeline.to("cuda") - -placeholder_token = "" -prompt = f"two {placeholder_token} getting married, photorealistic, high quality" -image = pipeline(prompt, num_inference_steps=50).images[0] -``` - -And we get: - -
- -
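Generation is stochastic, so your output will not match the image above exactly. If you want a repeatable result, you can reuse the `pipeline` and `prompt` from the snippet above and pass a seeded generator; the seed and filename below are arbitrary choices:

```py
import torch

generator = torch.Generator("cuda").manual_seed(42)
image = pipeline(prompt, num_inference_steps=50, generator=generator).images[0]
image.save("kerascv_to_diffusers_sample.png")
```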
- -_**Note that if you specified a `placeholder_token` while performing the conversion, the tool will log it accordingly. Refer -to the model card of [this repository](https://huggingface.co/sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline) -as an example.**_ - -We welcome you to use the tool for various Stable Diffusion fine-tuning scenarios and let us know your feedback! Here are some examples -of Diffusers checkpoints that were obtained using the tool: - -* [sayakpaul/text-unet-dogs-kerascv_sd_diffusers_pipeline](https://huggingface.co/sayakpaul/text-unet-dogs-kerascv_sd_diffusers_pipeline) (DreamBooth with both the text encoder and UNet fine-tuned) -* [sayakpaul/unet-dogs-kerascv_sd_diffusers_pipeline](https://huggingface.co/sayakpaul/unet-dogs-kerascv_sd_diffusers_pipeline) (DreamBooth with only the UNet fine-tuned) - -## Incorporating Diffusers Goodies 🎁 - -Diffusers provides various options that one can leverage to experiment with different inference setups. One particularly -useful option is the use of a different noise scheduler during inference other than what was used during fine-tuning. -Let's try out the [`DPMSolverMultistepScheduler`](https://huggingface.co/docs/diffusers/main/en/api/schedulers/multistep_dpm_solver) -which is different from the one ([`DDPMScheduler`](https://huggingface.co/docs/diffusers/main/en/api/schedulers/ddpm)) used during -fine-tuning. - -You can read more details about this process in [this section](https://huggingface.co/docs/diffusers/using-diffusers/schedulers). - -```py -from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler - -pipeline = DiffusionPipeline.from_pretrained("sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline") -pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) -pipeline.to("cuda") - -placeholder_token = "" -prompt = f"two {placeholder_token} getting married, photorealistic, high quality" -image = pipeline(prompt, num_inference_steps=50).images[0] -``` - -
- -
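Beyond swapping schedulers, you could also load the converted pipeline in half precision and enable attention slicing to reduce memory use. This is just a sketch and assumes a CUDA GPU is available; the optimization guide linked below covers these options in more depth:

```py
import torch
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(
    "sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline", torch_dtype=torch.float16
)
pipeline.to("cuda")
# Attention slicing trades a bit of speed for a noticeably lower peak memory footprint.
pipeline.enable_attention_slicing()
```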
- -One can also continue fine-tuning from these Diffusers checkpoints by leveraging some relevant tools from Diffusers. Refer [here](https://huggingface.co/docs/diffusers/training/overview) for -more details. For inference-specific optimizations, refer [here](https://huggingface.co/docs/diffusers/main/en/optimization/fp16). - -## Known Limitations - -* Only Stable Diffusion v1 checkpoints are supported for conversion in this tool. diff --git a/docs/source/en/using-diffusers/other-formats.mdx b/docs/source/en/using-diffusers/other-formats.mdx new file mode 100644 index 000000000000..c8dc7cca86fc --- /dev/null +++ b/docs/source/en/using-diffusers/other-formats.mdx @@ -0,0 +1,126 @@ + + +# Load different Stable Diffusion formats + +Stable Diffusion models are available in different formats depending on the framework they're trained and saved with, and where you download them from. Converting these formats for use in 🤗 Diffusers allows you to use all the features supported by the library, such as [using different schedulers](schedulers) for inference, [building your custom pipeline](write_own_pipeline), and a variety of techniques and methods for [optimizing inference speed](./optimization/opt_overview). + + + +We highly recommend using the `.safetensors` format because it is more secure than traditional pickled files which are vulnerable and can be exploited to execute any code on your machine (learn more in the [Load safetensors](using_safetensors) guide). + + + +This guide will show you how to convert other Stable Diffusion formats to be compatible with 🤗 Diffusers. + +## PyTorch .ckpt + +The checkpoint - or `.ckpt` - format is commonly used to store and save models. The `.ckpt` file contains the entire model and is typically several GBs in size. While you can load and use a `.ckpt` file directly with the [`~StableDiffusionPipeline.from_ckpt`] method, it is generally better to convert the `.ckpt` file to 🤗 Diffusers so both formats are available. + +There are two options for converting a `.ckpt` file; use a Space to convert the checkpoint or convert the `.ckpt` file with a script. + +### Convert with a Space + +The easiest and most convenient way to convert a `.ckpt` file is to use the [SD to Diffusers](https://huggingface.co/spaces/diffusers/sd-to-diffusers) Space. You can follow the instructions on the Space to convert the `.ckpt` file. + +This approach works well for basic models, but it may struggle with more customized models. You'll know the Space failed if it returns an empty pull request or error. In this case, you can try converting the `.ckpt` file with a script. + +### Convert with a script + +🤗 Diffusers provides a [conversion script](https://github.com/huggingface/diffusers/blob/main/scripts/convert_original_stable_diffusion_to_diffusers.py) for converting `.ckpt` files. This approach is more reliable than the Space above. + +Before you start, make sure you have a local clone of 🤗 Diffusers to run the script and log in to your Hugging Face account so you can open pull requests and push your converted model to the Hub. + +```bash +huggingface-cli login +``` + +To use the script: + +1. Git clone the repository containing the `.ckpt` file you want to convert. For this example, let's convert this [TemporalNet](https://huggingface.co/CiaraRowles/TemporalNet) `.ckpt` file: + +```bash +git lfs install +git clone https://huggingface.co/CiaraRowles/TemporalNet +``` + +2. 
Open a pull request on the repository where you're converting the checkpoint from: + +```bash +cd TemporalNet && git fetch origin refs/pr/13:pr/13 +git checkout pr/13 +``` + +3. There are several input arguments to configure in the conversion script, but the most important ones are: + + - `checkpoint_path`: the path to the `.ckpt` file to convert. + - `original_config_file`: a YAML file defining the configuration of the original architecture. If you can't find this file, try searching for the YAML file in the GitHub repository where you found the `.ckpt` file. + - `dump_path`: the path to the converted model. + + For example, you can take the `cldm_v15.yaml` file from the [ControlNet](https://github.com/lllyasviel/ControlNet/tree/main/models) repository because the TemporalNet model is a Stable Diffusion v1.5 and ControlNet model. + +4. Now you can run the script to convert the `.ckpt` file: + +```bash +python ../diffusers/scripts/convert_original_stable_diffusion_to_diffusers.py --checkpoint_path temporalnetv3.ckpt --original_config_file cldm_v15.yaml --dump_path ./ --controlnet +``` + +5. Once the conversion is done, upload your converted model and test out the resulting [pull request](https://huggingface.co/CiaraRowles/TemporalNet/discussions/13)! + +```bash +git push origin pr/13:refs/pr/13 +``` + +## Keras .pb or .h5 + + + +🧪 This is an experimental feature. Only Stable Diffusion v1 checkpoints are supported by the Convert KerasCV Space at the moment. + + + +[KerasCV](https://keras.io/keras_cv/) supports training for [Stable Diffusion](https://github.com/keras-team/keras-cv/blob/master/keras_cv/models/stable_diffusion) v1 and v2. However, it offers limited support for experimenting with Stable Diffusion models for inference and deployment whereas 🤗 Diffusers has a more complete set of features for this purpose, such as different [noise schedulers](https://huggingface.co/docs/diffusers/using-diffusers/schedulers), [flash attention](https://huggingface.co/docs/diffusers/optimization/xformers), and [other +optimization techniques](https://huggingface.co/docs/diffusers/optimization/fp16). + +The [Convert KerasCV](https://huggingface.co/spaces/sayakpaul/convert-kerascv-sd-diffusers) Space converts `.pb` or `.h5` files to PyTorch, and then wraps them in a [`StableDiffusionPipeline`] so it is ready for inference. The converted checkpoint is stored in a repository on the Hugging Face Hub. + +For this example, let's convert the [`sayakpaul/textual-inversion-kerasio`](https://huggingface.co/sayakpaul/textual-inversion-kerasio/tree/main) checkpoint which was trained with Textual Inversion. It uses the special token `` to personalize images with cats. + +The Convert KerasCV Space allows you to input the following: + +* Your Hugging Face token. +* Paths to download the UNet and text encoder weights from. Depending on how the model was trained, you don't necessarily need to provide the paths to both the UNet and text encoder. For example, Textual Inversion only requires the embeddings from the text encoder and a text-to-image model only requires the UNet weights. +* Placeholder token is only applicable for textual inversion models. +* The `output_repo_prefix` is the name of the repository where the converted model is stored. + +Click the **Submit** button to automatically convert the KerasCV checkpoint! Once the checkpoint is successfully converted, you'll see a link to the new repository containing the converted checkpoint. 
Follow the link to the new repository, and you'll see the Convert KerasCV Space generated a model card with an inference widget to try out the converted model. + +If you prefer to run inference with code, click on the **Use in Diffusers** button in the upper right corner of the model card to copy and paste the code snippet: + +```py +from diffusers import DiffusionPipeline + +pipeline = DiffusionPipeline.from_pretrained("sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline") +``` + +Then you can generate an image like: + +```py +from diffusers import DiffusionPipeline + +pipeline = DiffusionPipeline.from_pretrained("sayakpaul/textual-inversion-cat-kerascv_sd_diffusers_pipeline") +pipeline.to("cuda") + +placeholder_token = "" +prompt = f"two {placeholder_token} getting married, photorealistic, high quality" +image = pipeline(prompt, num_inference_steps=50).images[0] +``` \ No newline at end of file From c6ae8837512d0572639b9f57491d4482fdc8948c Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 29 May 2023 09:20:31 +0530 Subject: [PATCH 043/199] remove print statements from attention processor. (#3592) --- src/diffusers/models/attention_processor.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 4b65d164bda1..0b86dbe546d2 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -222,9 +222,6 @@ def set_use_memory_efficient_attention_xformers( ) processor.load_state_dict(self.processor.state_dict()) processor.to(self.processor.to_q_lora.up.weight.device) - print( - f"is_lora is set to {is_lora}, type: LoRAXFormersAttnProcessor: {isinstance(processor, LoRAXFormersAttnProcessor)}" - ) elif is_custom_diffusion: processor = CustomDiffusionXFormersAttnProcessor( train_kv=self.processor.train_kv, @@ -262,7 +259,6 @@ def set_use_memory_efficient_attention_xformers( # We use the AttnProcessor2_0 by default when torch 2.x is used which uses # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention # but only if it has the default `scale` argument. 
TODO remove scale_qk check when we move to torch 2.1 - print("Still defaulting to: AttnProcessor2_0 :O") processor = ( AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk From c0f867afd119162eed9cf6ae68c995737bd28f50 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 May 2023 11:26:23 +0100 Subject: [PATCH 044/199] Fix temb attention (#3607) * Fix temb attention * Apply suggestions from code review * make style * Add tests and fix docker * Apply suggestions from code review --- docker/diffusers-pytorch-cuda/Dockerfile | 4 +- src/diffusers/models/attention_processor.py | 24 +++++++-- src/diffusers/utils/testing_utils.py | 6 +++ .../stable_diffusion/test_stable_diffusion.py | 54 ++++++++++++++++++- 4 files changed, 83 insertions(+), 5 deletions(-) diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile index a51a12ee2838..6b56403a6f94 100644 --- a/docker/diffusers-pytorch-cuda/Dockerfile +++ b/docker/diffusers-pytorch-cuda/Dockerfile @@ -38,6 +38,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ scipy \ tensorboard \ transformers \ - omegaconf + omegaconf \ + pytorch-lightning \ + xformers CMD ["/bin/bash"] diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 0b86dbe546d2..1bfaa0258155 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -540,9 +540,14 @@ def __init__(self, hidden_size, cross_attention_dim=None, rank=4): self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank) self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank) - def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0): + def __call__( + self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None + ): residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + input_ndim = hidden_states.ndim if input_ndim == 4: @@ -905,9 +910,13 @@ def __call__( hidden_states: torch.FloatTensor, encoder_hidden_states: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, ): residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + input_ndim = hidden_states.ndim if input_ndim == 4: @@ -1081,9 +1090,14 @@ def __init__(self, hidden_size, cross_attention_dim, rank=4, attention_op: Optio self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank) self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank) - def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0): + def __call__( + self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None + ): residual = hidden_states + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + input_ndim = hidden_states.ndim if input_ndim == 4: @@ -1334,8 +1348,12 @@ class SlicedAttnAddedKVProcessor: def __init__(self, slice_size): self.slice_size = slice_size - def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, attention_mask=None): + def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, attention_mask=None, temb=None): residual = 
hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) batch_size, sequence_length, _ = hidden_states.shape diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index 7d5e6bcacecd..abddd48851bf 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -577,3 +577,9 @@ def enable_full_determinism(): torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False torch.backends.cuda.matmul.allow_tf32 = False + + +def disable_full_determinism(): + os.environ["CUDA_LAUNCH_BLOCKING"] = "0" + os.environ["CUBLAS_WORKSPACE_CONFIG"] = "" + torch.use_deterministic_algorithms(False) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 6140bf771e65..b5d968e2a39e 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -37,16 +37,18 @@ UNet2DConditionModel, logging, ) -from diffusers.models.attention_processor import AttnProcessor +from diffusers.models.attention_processor import AttnProcessor, LoRAXFormersAttnProcessor from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import ( CaptureLogger, + disable_full_determinism, enable_full_determinism, require_torch_2, require_torch_gpu, run_test_in_subprocess, ) +from ...models.test_lora_layers import create_unet_lora_layers from ...models.test_models_unet_2d_condition import create_lora_layers from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin @@ -366,6 +368,56 @@ def test_stable_diffusion_pndm(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + @unittest.skipIf(not torch.cuda.is_available(), reason="xformers requires cuda") + def test_stable_diffusion_attn_processors(self): + disable_full_determinism() + device = "cuda" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + + # run normal sd pipe + image = sd_pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + + # run xformers attention + sd_pipe.enable_xformers_memory_efficient_attention() + image = sd_pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + + # run attention slicing + sd_pipe.enable_attention_slicing() + image = sd_pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + + # run vae attention slicing + sd_pipe.enable_vae_slicing() + image = sd_pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + + # run lora attention + attn_processors, _ = create_unet_lora_layers(sd_pipe.unet) + attn_processors = {k: v.to("cuda") for k, v in attn_processors.items()} + sd_pipe.unet.set_attn_processor(attn_processors) + image = sd_pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + + # run lora xformers attention + attn_processors, _ = create_unet_lora_layers(sd_pipe.unet) + attn_processors = { + k: LoRAXFormersAttnProcessor(hidden_size=v.hidden_size, cross_attention_dim=v.cross_attention_dim) + for k, v in 
attn_processors.items() + } + attn_processors = {k: v.to("cuda") for k, v in attn_processors.items()} + sd_pipe.unet.set_attn_processor(attn_processors) + image = sd_pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + + enable_full_determinism() + def test_stable_diffusion_no_safety_checker(self): pipe = StableDiffusionPipeline.from_pretrained( "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None From c059cc0992899383d1079fbea52b71a49aa3f88a Mon Sep 17 00:00:00 2001 From: Kadir Nar Date: Tue, 30 May 2023 13:44:53 +0300 Subject: [PATCH 045/199] [docs] update the broken links (#3577) --- README.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index cb6e29ee1406..9d06cf1099c9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@


- +

@@ -128,57 +128,57 @@ just hang out ☕. Unconditional Image Generation - DDPM + DDPM google/ddpm-ema-church-256 Text-to-Image - Stable Diffusion Text-to-Image + Stable Diffusion Text-to-Image runwayml/stable-diffusion-v1-5 Text-to-Image - unclip + unclip kakaobrain/karlo-v1-alpha Text-to-Image - if + if DeepFloyd/IF-I-XL-v1.0 Text-guided Image-to-Image - Controlnet + Controlnet lllyasviel/sd-controlnet-canny Text-guided Image-to-Image - Instruct Pix2Pix + Instruct Pix2Pix timbrooks/instruct-pix2pix Text-guided Image-to-Image - Stable Diffusion Image-to-Image + Stable Diffusion Image-to-Image runwayml/stable-diffusion-v1-5 Text-guided Image Inpainting - Stable Diffusion Inpaint + Stable Diffusion Inpaint runwayml/stable-diffusion-inpainting Image Variation - Stable Diffusion Image Variation + Stable Diffusion Image Variation lambdalabs/sd-image-variations-diffusers Super Resolution - Stable Diffusion Upscale + Stable Diffusion Upscale stabilityai/stable-diffusion-x4-upscaler Super Resolution - Stable Diffusion Latent Upscale + Stable Diffusion Latent Upscale stabilityai/sd-x2-latent-upscaler From 0612f48cd05f47e238256392d6b45a38875f55b8 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 30 May 2023 16:37:18 +0530 Subject: [PATCH 046/199] [UniDiffuser Tests] Fix some tests (#3609) * fix: unidiffuser test failures. * living room. --- tests/pipelines/unidiffuser/test_unidiffuser.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/pipelines/unidiffuser/test_unidiffuser.py b/tests/pipelines/unidiffuser/test_unidiffuser.py index f9f798ebe55d..06cb451281c9 100644 --- a/tests/pipelines/unidiffuser/test_unidiffuser.py +++ b/tests/pipelines/unidiffuser/test_unidiffuser.py @@ -436,6 +436,9 @@ def test_unidiffuser_img2text_multiple_prompts_with_latents(self): assert len(text) == 3 + def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(expected_max_diff=2e-4) + @require_torch_gpu def test_unidiffuser_default_joint_v1_cuda_fp16(self): device = "cuda" @@ -583,7 +586,7 @@ def test_unidiffuser_default_joint_v1(self): expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520]) assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-1 - expected_text_prefix = "A living room" + expected_text_prefix = "a living room" assert text[0][: len(expected_text_prefix)] == expected_text_prefix def test_unidiffuser_default_text2img_v1(self): @@ -634,9 +637,9 @@ def test_unidiffuser_default_joint_v1_fp16(self): image_slice = image[0, -3:, -3:, -1] expected_img_slice = np.array([0.2402, 0.2375, 0.2285, 0.2378, 0.2407, 0.2263, 0.2354, 0.2307, 0.2520]) - assert np.abs(image_slice.flatten() - expected_img_slice).max() < 1e-1 + assert np.abs(image_slice.flatten() - expected_img_slice).max() < 2e-1 - expected_text_prefix = "A living room" + expected_text_prefix = "a living room" assert text[0][: len(expected_text_prefix)] == expected_text_prefix def test_unidiffuser_default_text2img_v1_fp16(self): From 35a740427ec9fb25f0fb2350d1b63806f0ff9cff Mon Sep 17 00:00:00 2001 From: Rupert Menneer <71332436+rupertmenneer@users.noreply.github.com> Date: Tue, 30 May 2023 04:17:42 -0700 Subject: [PATCH 047/199] #3487 Fix inpainting strength for various samplers (#3532) * Throw error if strength adjusted num_inference_steps < 1 * Added new fast test to check ValueError raised when num_inference_steps < 1 when strength adjusts the num_inference_steps then the inpainting pipeline should fail * fix #3487 initial 
latents are now only scaled by init_noise_sigma when pure noise updated this commit w.r.t the latest merge here: https://github.com/huggingface/diffusers/pull/3533 * fix --------- Co-authored-by: Patrick von Platen --- .../controlnet/pipeline_controlnet_inpaint.py | 7 ++++--- .../pipeline_stable_diffusion_inpaint.py | 13 ++++++++++--- .../test_stable_diffusion_inpaint.py | 14 ++++++++++++++ 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index f57d88bd8d8a..821a93028c5d 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -863,12 +863,13 @@ def prepare_latents( if latents is None: noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. then initialise the latents to noise, else initial to image + noise latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents else: latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma + latents = latents * self.scheduler.init_noise_sigma outputs = (latents,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 5dbac9295800..534748c35363 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -648,12 +648,13 @@ def prepare_latents( if latents is None: noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. then initialise the latents to noise, else initial to image + noise latents = noise if is_strength_max else self.scheduler.add_noise(image_latents, noise, timestep) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents else: latents = latents.to(device) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * self.scheduler.init_noise_sigma + latents = latents * self.scheduler.init_noise_sigma outputs = (latents,) @@ -912,6 +913,12 @@ def __call__( timesteps, num_inference_steps = self.get_timesteps( num_inference_steps=num_inference_steps, strength=strength, device=device ) + # check that number of inference steps is not < 1 - as this doesn't make sense + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" + f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." + ) # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # create a boolean to check if the strength is set to 1. 
if so then initialise the latents with pure noise diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index e355e82e5b35..0cf4d711be4c 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -231,6 +231,20 @@ def test_stable_diffusion_inpaint_lora(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) + def test_stable_diffusion_inpaint_strength_zero_test(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionInpaintPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + + # check that the pipeline raises value error when num_inference_steps is < 1 + inputs["strength"] = 0.01 + with self.assertRaises(ValueError): + sd_pipe(**inputs).images + class StableDiffusionSimpleInpaintPipelineFastTests(StableDiffusionInpaintPipelineFastTests): pipeline_class = StableDiffusionInpaintPipeline From 6cbddf558adc2992c77e640fdaadf0584fbfa233 Mon Sep 17 00:00:00 2001 From: Kadir Nar Date: Tue, 30 May 2023 14:24:15 +0300 Subject: [PATCH 048/199] [Community] Support StableDiffusionTilingPipeline (#3586) * added mixture pipeline * added docstring * update docstring --- examples/community/ mixture.py | 404 +++++++++++++++++++++++++++++++++ examples/community/README.md | 30 +++ 2 files changed, 434 insertions(+) create mode 100644 examples/community/ mixture.py diff --git a/examples/community/ mixture.py b/examples/community/ mixture.py new file mode 100644 index 000000000000..60d0ee2d09d3 --- /dev/null +++ b/examples/community/ mixture.py @@ -0,0 +1,404 @@ +import inspect +from copy import deepcopy +from enum import Enum +from typing import List, Optional, Tuple, Union + +import torch +from ligo.segments import segment +from tqdm.auto import tqdm +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker +from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from diffusers.utils import logging + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import LMSDiscreteScheduler + >>> from mixdiff import StableDiffusionTilingPipeline + + >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) + >>> pipeline = StableDiffusionTilingPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler) + >>> pipeline.to("cuda:0") + + >>> image = pipeline( + >>> prompt=[[ + >>> "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + >>> "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + >>> "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, 
stunning masterpiece" + >>> ]], + >>> tile_height=640, + >>> tile_width=640, + >>> tile_row_overlap=0, + >>> tile_col_overlap=256, + >>> guidance_scale=8, + >>> seed=7178915308, + >>> num_inference_steps=50, + >>> )["images"][0] + ``` +""" + + +def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): + """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image + + Returns a tuple with: + - Starting coordinates of rows in pixel space + - Ending coordinates of rows in pixel space + - Starting coordinates of columns in pixel space + - Ending coordinates of columns in pixel space + """ + px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap) + px_row_end = px_row_init + tile_height + px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap) + px_col_end = px_col_init + tile_width + return px_row_init, px_row_end, px_col_init, px_col_end + + +def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end): + """Translates coordinates in pixel space to coordinates in latent space""" + return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8 + + +def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): + """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image + + Returns a tuple with: + - Starting coordinates of rows in latent space + - Ending coordinates of rows in latent space + - Starting coordinates of columns in latent space + - Ending coordinates of columns in latent space + """ + px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices( + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end) + + +def _tile2latent_exclusive_indices( + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap, rows, columns +): + """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image + + Returns a tuple with: + - Starting coordinates of rows in latent space + - Ending coordinates of rows in latent space + - Starting coordinates of columns in latent space + - Ending coordinates of columns in latent space + """ + row_init, row_end, col_init, col_end = _tile2latent_indices( + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + row_segment = segment(row_init, row_end) + col_segment = segment(col_init, col_end) + # Iterate over the rest of tiles, clipping the region for the current tile + for row in range(rows): + for column in range(columns): + if row != tile_row and column != tile_col: + clip_row_init, clip_row_end, clip_col_init, clip_col_end = _tile2latent_indices( + row, column, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + row_segment = row_segment - segment(clip_row_init, clip_row_end) + col_segment = col_segment - segment(clip_col_init, clip_col_end) + # return row_init, row_end, col_init, col_end + return row_segment[0], row_segment[1], col_segment[0], col_segment[1] + + +class StableDiffusionExtrasMixin: + """Mixin providing additional convenience method to Stable Diffusion pipelines""" + + def decode_latents(self, latents, cpu_vae=False): + """Decodes a given array of latents into pixel space""" + # scale and decode the image latents with vae + if cpu_vae: + lat 
= deepcopy(latents).cpu() + vae = deepcopy(self.vae).cpu() + else: + lat = latents + vae = self.vae + + lat = 1 / 0.18215 * lat + image = vae.decode(lat).sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + + return self.numpy_to_pil(image) + + +class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixin): + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): + super().__init__() + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + + class SeedTilesMode(Enum): + """Modes in which the latents of a particular tile can be re-seeded""" + + FULL = "full" + EXCLUSIVE = "exclusive" + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[List[str]]], + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + tile_height: Optional[int] = 512, + tile_width: Optional[int] = 512, + tile_row_overlap: Optional[int] = 256, + tile_col_overlap: Optional[int] = 256, + guidance_scale_tiles: Optional[List[List[float]]] = None, + seed_tiles: Optional[List[List[int]]] = None, + seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full", + seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None, + cpu_vae: Optional[bool] = False, + ): + r""" + Function to run the diffusion pipeline with tiling support. + + Args: + prompt: either a single string (no tiling) or a list of lists with all the prompts to use (one list for each row of tiles). This will also define the tiling structure. + num_inference_steps: number of diffusions steps. + guidance_scale: classifier-free guidance. + seed: general random seed to initialize latents. + tile_height: height in pixels of each grid tile. + tile_width: width in pixels of each grid tile. + tile_row_overlap: number of overlap pixels between tiles in consecutive rows. + tile_col_overlap: number of overlap pixels between tiles in consecutive columns. + guidance_scale_tiles: specific weights for classifier-free guidance in each tile. + guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used. + seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter. + seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overrriden. + seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles. + cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues. 
+ + Examples: + + Returns: + A PIL image with the generated image. + + """ + if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt): + raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}") + grid_rows = len(prompt) + grid_cols = len(prompt[0]) + if not all(len(row) == grid_cols for row in prompt): + raise ValueError("All prompt rows must have the same number of prompt columns") + if not isinstance(seed_tiles_mode, str) and ( + not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode) + ): + raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") + if isinstance(seed_tiles_mode, str): + seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] + if any( + mode not in (modes := [mode.value for mode in self.SeedTilesMode]) + for row in seed_tiles_mode + for mode in row + ): + raise ValueError(f"Seed tiles mode must be one of {modes}") + if seed_reroll_regions is None: + seed_reroll_regions = [] + batch_size = 1 + + # create original noisy latents using the timesteps + height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap) + width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap) + latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8) + generator = torch.Generator("cuda").manual_seed(seed) + latents = torch.randn(latents_shape, generator=generator, device=self.device) + + # overwrite latents for specific tiles if provided + if seed_tiles is not None: + for row in range(grid_rows): + for col in range(grid_cols): + if (seed_tile := seed_tiles[row][col]) is not None: + mode = seed_tiles_mode[row][col] + if mode == self.SeedTilesMode.FULL.value: + row_init, row_end, col_init, col_end = _tile2latent_indices( + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + else: + row_init, row_end, col_init, col_end = _tile2latent_exclusive_indices( + row, + col, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + grid_rows, + grid_cols, + ) + tile_generator = torch.Generator("cuda").manual_seed(seed_tile) + tile_shape = (latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init) + latents[:, :, row_init:row_end, col_init:col_end] = torch.randn( + tile_shape, generator=tile_generator, device=self.device + ) + + # overwrite again for seed reroll regions + for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions: + row_init, row_end, col_init, col_end = _pixel2latent_indices( + row_init, row_end, col_init, col_end + ) # to latent space coordinates + reroll_generator = torch.Generator("cuda").manual_seed(seed_reroll) + region_shape = (latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init) + latents[:, :, row_init:row_end, col_init:col_end] = torch.randn( + region_shape, generator=reroll_generator, device=self.device + ) + + # Prepare scheduler + accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) + extra_set_kwargs = {} + if accepts_offset: + extra_set_kwargs["offset"] = 1 + self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) + # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas + if isinstance(self.scheduler, LMSDiscreteScheduler): + latents = latents * self.scheduler.sigmas[0] + + # get prompts text embeddings + text_input = [ + [ + self.tokenizer( + col, + padding="max_length", + 
max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + for col in row + ] + for row in prompt + ] + text_embeddings = [[self.text_encoder(col.input_ids.to(self.device))[0] for col in row] for row in text_input] + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 # TODO: also active if any tile has guidance scale + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + for i in range(grid_rows): + for j in range(grid_cols): + max_length = text_input[i][j].input_ids.shape[-1] + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt" + ) + uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0] + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings[i][j] = torch.cat([uncond_embeddings, text_embeddings[i][j]]) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # Mask for tile weights strenght + tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size) + + # Diffusion timesteps + for i, t in tqdm(enumerate(self.scheduler.timesteps)): + # Diffuse each tile + noise_preds = [] + for row in range(grid_rows): + noise_preds_row = [] + for col in range(grid_cols): + px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end] + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([tile_latents] * 2) if do_classifier_free_guidance else tile_latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings[row][col])[ + "sample" + ] + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + guidance = ( + guidance_scale + if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None + else guidance_scale_tiles[row][col] + ) + noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) + noise_preds_row.append(noise_pred_tile) + noise_preds.append(noise_preds_row) + # Stitch noise predictions for all tiles + noise_pred = torch.zeros(latents.shape, device=self.device) + contributors = torch.zeros(latents.shape, device=self.device) + # Add each tile contribution to overall latents + for row in range(grid_rows): + for col in range(grid_cols): + px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + noise_pred[:, :, 
px_row_init:px_row_end, px_col_init:px_col_end] += ( + noise_preds[row][col] * tile_weights + ) + contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights + # Average overlapping areas with more than 1 contributor + noise_pred /= contributors + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + # scale and decode the image latents with vae + image = self.decode_latents(latents, cpu_vae) + + return {"images": image} + + def _gaussian_weights(self, tile_width, tile_height, nbatches): + """Generates a gaussian mask of weights for tile contributions""" + import numpy as np + from numpy import exp, pi, sqrt + + latent_width = tile_width // 8 + latent_height = tile_height // 8 + + var = 0.01 + midpoint = (latent_width - 1) / 2 # -1 because index goes from 0 to latent_width - 1 + x_probs = [ + exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var) + for x in range(latent_width) + ] + midpoint = latent_height / 2 + y_probs = [ + exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var) + for y in range(latent_height) + ] + + weights = np.outer(y_probs, x_probs) + return torch.tile(torch.tensor(weights, device=self.device), (nbatches, self.unet.config.in_channels, 1, 1)) diff --git a/examples/community/README.md b/examples/community/README.md index 0211287d4ebb..f3af03410097 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1510,3 +1510,33 @@ latency = elapsed_time(pipe4) print("Latency of StableDiffusionPipeline--fp32",latency) ``` + +### Stable Diffusion Mixture + +This pipeline uses the Mixture. Refer to the [Mixture](https://arxiv.org/abs/2302.02412) paper for more details. 
+ +```python +from diffusers import LMSDiscreteScheduler +from mixdiff import StableDiffusionTilingPipeline + +# Creater scheduler and model (similar to StableDiffusionPipeline) +scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) +pipeline = StableDiffusionTilingPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler) +pipeline.to("cuda:0") + +# Mixture of Diffusers generation +image = pipeline( + prompt=[[ + "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece" + ]], + tile_height=640, + tile_width=640, + tile_row_overlap=0, + tile_col_overlap=256, + guidance_scale=8, + seed=7178915308, + num_inference_steps=50, +)["images"][0] +``` From 07ef4855cd2b2fe9d72bc0479f15959333f11068 Mon Sep 17 00:00:00 2001 From: takuoko Date: Tue, 30 May 2023 20:38:16 +0900 Subject: [PATCH 049/199] [Community, Enhancement] Add reference tricks in README (#3589) add reference tricks --- examples/community/README.md | 5 +++++ .../stable_diffusion_controlnet_reference.py | 16 ++++++++-------- examples/community/stable_diffusion_reference.py | 16 ++++++++-------- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/examples/community/README.md b/examples/community/README.md index f3af03410097..21fba38e690b 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1326,6 +1326,8 @@ image.save('tensorrt_img2img_new_zealand_hills.png') This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280). +Based on [this issue](https://github.com/huggingface/diffusers/issues/3566), +- `EulerAncestralDiscreteScheduler` got poor results. ```py import torch @@ -1369,6 +1371,9 @@ Output Image of `reference_attn=True` and `reference_adain=True` This pipeline uses the Reference Control with ControlNet. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280). +Based on [this issue](https://github.com/huggingface/diffusers/issues/3566), +- `EulerAncestralDiscreteScheduler` got poor results. 
+- `guess_mode=True` works well for ControlNet v1.1 ```py import cv2 diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py index 606fe09c68fc..ca06136d7829 100644 --- a/examples/community/stable_diffusion_controlnet_reference.py +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -505,8 +505,8 @@ def hack_CrossAttnDownBlock2D_forward( if MODE == "write": if gn_auto_machine_weight >= self.gn_weight: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) - self.mean_bank.append(mean) - self.var_bank.append(var) + self.mean_bank.append([mean]) + self.var_bank.append([var]) if MODE == "read": if len(self.mean_bank) > 0 and len(self.var_bank) > 0: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) @@ -545,8 +545,8 @@ def hacked_DownBlock2D_forward(self, hidden_states, temb=None): if MODE == "write": if gn_auto_machine_weight >= self.gn_weight: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) - self.mean_bank.append(mean) - self.var_bank.append(var) + self.mean_bank.append([mean]) + self.var_bank.append([var]) if MODE == "read": if len(self.mean_bank) > 0 and len(self.var_bank) > 0: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) @@ -605,8 +605,8 @@ def hacked_CrossAttnUpBlock2D_forward( if MODE == "write": if gn_auto_machine_weight >= self.gn_weight: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) - self.mean_bank.append(mean) - self.var_bank.append(var) + self.mean_bank.append([mean]) + self.var_bank.append([var]) if MODE == "read": if len(self.mean_bank) > 0 and len(self.var_bank) > 0: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) @@ -642,8 +642,8 @@ def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb= if MODE == "write": if gn_auto_machine_weight >= self.gn_weight: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) - self.mean_bank.append(mean) - self.var_bank.append(var) + self.mean_bank.append([mean]) + self.var_bank.append([var]) if MODE == "read": if len(self.mean_bank) > 0 and len(self.var_bank) > 0: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py index 22e0b40f60a3..dbfb768f8b4f 100644 --- a/examples/community/stable_diffusion_reference.py +++ b/examples/community/stable_diffusion_reference.py @@ -499,8 +499,8 @@ def hack_CrossAttnDownBlock2D_forward( if MODE == "write": if gn_auto_machine_weight >= self.gn_weight: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) - self.mean_bank.append(mean) - self.var_bank.append(var) + self.mean_bank.append([mean]) + self.var_bank.append([var]) if MODE == "read": if len(self.mean_bank) > 0 and len(self.var_bank) > 0: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) @@ -539,8 +539,8 @@ def hacked_DownBlock2D_forward(self, hidden_states, temb=None): if MODE == "write": if gn_auto_machine_weight >= self.gn_weight: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) - self.mean_bank.append(mean) - self.var_bank.append(var) + self.mean_bank.append([mean]) + self.var_bank.append([var]) if MODE == "read": if len(self.mean_bank) > 0 and len(self.var_bank) > 0: var, mean = 
torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) @@ -599,8 +599,8 @@ def hacked_CrossAttnUpBlock2D_forward( if MODE == "write": if gn_auto_machine_weight >= self.gn_weight: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) - self.mean_bank.append(mean) - self.var_bank.append(var) + self.mean_bank.append([mean]) + self.var_bank.append([var]) if MODE == "read": if len(self.mean_bank) > 0 and len(self.var_bank) > 0: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) @@ -636,8 +636,8 @@ def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb= if MODE == "write": if gn_auto_machine_weight >= self.gn_weight: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) - self.mean_bank.append(mean) - self.var_bank.append(var) + self.mean_bank.append([mean]) + self.var_bank.append([var]) if MODE == "read": if len(self.mean_bank) > 0 and len(self.var_bank) > 0: var, mean = torch.var_mean(hidden_states, dim=(2, 3), keepdim=True, correction=0) From 799f5b4e12c5350872b6fe5ebc28be423d2570c3 Mon Sep 17 00:00:00 2001 From: Greg Hunkins Date: Tue, 30 May 2023 08:13:34 -0400 Subject: [PATCH 050/199] [Feat] Enable State Dict For Textual Inversion Loader (#3439) * enable state dict for textual inversion loader * Empty-Commit | restart CI * Empty-Commit | restart CI * Empty-Commit | restart CI * Empty-Commit | restart CI * add tests * fix tests * fix tests * fix tests --------- Co-authored-by: Patrick von Platen --- src/diffusers/loaders.py | 71 +++++++++++++++++-------------- tests/pipelines/test_pipelines.py | 59 +++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 33 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 3761f0e59d05..84e6b4e61f0f 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -470,7 +470,7 @@ def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"): def load_textual_inversion( self, - pretrained_model_name_or_path: Union[str, List[str]], + pretrained_model_name_or_path: Union[str, List[str], Dict[str, torch.Tensor], List[Dict[str, torch.Tensor]]], token: Optional[Union[str, List[str]]] = None, **kwargs, ): @@ -485,7 +485,7 @@ def load_textual_inversion( Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike` or `List[str or os.PathLike]`): + pretrained_model_name_or_path (`str` or `os.PathLike` or `List[str or os.PathLike]` or `Dict` or `List[Dict]`): Can be either: - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. @@ -494,6 +494,8 @@ def load_textual_inversion( - A path to a *directory* containing textual inversion weights, e.g. `./my_text_inversion_directory/`. - A path to a *file* containing textual inversion weights, e.g. `./my_text_inversions.pt`. + - A [torch state + dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). Or a list of those elements. token (`str` or `List[str]`, *optional*): @@ -618,7 +620,7 @@ def load_textual_inversion( "framework": "pytorch", } - if isinstance(pretrained_model_name_or_path, str): + if not isinstance(pretrained_model_name_or_path, list): pretrained_model_name_or_paths = [pretrained_model_name_or_path] else: pretrained_model_name_or_paths = pretrained_model_name_or_path @@ -643,16 +645,38 @@ def load_textual_inversion( token_ids_and_embeddings = [] for pretrained_model_name_or_path, token in zip(pretrained_model_name_or_paths, tokens): - # 1. 
Load textual inversion file - model_file = None - # Let's first try to load .safetensors weights - if (use_safetensors and weight_name is None) or ( - weight_name is not None and weight_name.endswith(".safetensors") - ): - try: + if not isinstance(pretrained_model_name_or_path, dict): + # 1. Load textual inversion file + model_file = None + # Let's first try to load .safetensors weights + if (use_safetensors and weight_name is None) or ( + weight_name is not None and weight_name.endswith(".safetensors") + ): + try: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=weight_name or TEXT_INVERSION_NAME_SAFE, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + ) + state_dict = safetensors.torch.load_file(model_file, device="cpu") + except Exception as e: + if not allow_pickle: + raise e + + model_file = None + + if model_file is None: model_file = _get_model_file( pretrained_model_name_or_path, - weights_name=weight_name or TEXT_INVERSION_NAME_SAFE, + weights_name=weight_name or TEXT_INVERSION_NAME, cache_dir=cache_dir, force_download=force_download, resume_download=resume_download, @@ -663,28 +687,9 @@ def load_textual_inversion( subfolder=subfolder, user_agent=user_agent, ) - state_dict = safetensors.torch.load_file(model_file, device="cpu") - except Exception as e: - if not allow_pickle: - raise e - - model_file = None - - if model_file is None: - model_file = _get_model_file( - pretrained_model_name_or_path, - weights_name=weight_name or TEXT_INVERSION_NAME, - cache_dir=cache_dir, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - user_agent=user_agent, - ) - state_dict = torch.load(model_file, map_location="cpu") + state_dict = torch.load(model_file, map_location="cpu") + else: + state_dict = pretrained_model_name_or_path # 2. 
Load token and embedding correcly from file loaded_token = None diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 8eaee0915a4f..bb7c980875ef 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -663,6 +663,65 @@ def test_text_inversion_download(self): out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) + # single token state dict load + ten = {"": torch.ones((32,))} + pipe.load_textual_inversion(ten) + + token = pipe.tokenizer.convert_tokens_to_ids("") + assert token == num_tokens + 10, "Added token must be at spot `num_tokens`" + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 32 + assert pipe._maybe_convert_prompt("", pipe.tokenizer) == "" + + prompt = "hey " + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images + assert out.shape == (1, 128, 128, 3) + + # multi embedding state dict load + ten1 = {"": torch.ones((32,))} + ten2 = {"": 2 * torch.ones((1, 32))} + + pipe.load_textual_inversion([ten1, ten2]) + + token = pipe.tokenizer.convert_tokens_to_ids("") + assert token == num_tokens + 11, "Added token must be at spot `num_tokens`" + assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 32 + assert pipe._maybe_convert_prompt("", pipe.tokenizer) == "" + + token = pipe.tokenizer.convert_tokens_to_ids("") + assert token == num_tokens + 12, "Added token must be at spot `num_tokens`" + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 64 + assert pipe._maybe_convert_prompt("", pipe.tokenizer) == "" + + prompt = "hey " + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images + assert out.shape == (1, 128, 128, 3) + + # auto1111 multi-token state dict load + ten = { + "string_to_param": { + "*": torch.cat([3 * torch.ones((1, 32)), 4 * torch.ones((1, 32)), 5 * torch.ones((1, 32))]) + }, + "name": "", + } + + pipe.load_textual_inversion(ten) + + token = pipe.tokenizer.convert_tokens_to_ids("") + token_1 = pipe.tokenizer.convert_tokens_to_ids("_1") + token_2 = pipe.tokenizer.convert_tokens_to_ids("_2") + + assert token == num_tokens + 13, "Added token must be at spot `num_tokens`" + assert token_1 == num_tokens + 14, "Added token must be at spot `num_tokens`" + assert token_2 == num_tokens + 15, "Added token must be at spot `num_tokens`" + assert pipe.text_encoder.get_input_embeddings().weight[-3].sum().item() == 96 + assert pipe.text_encoder.get_input_embeddings().weight[-2].sum().item() == 128 + assert pipe.text_encoder.get_input_embeddings().weight[-1].sum().item() == 160 + assert pipe._maybe_convert_prompt("", pipe.tokenizer) == " _1 _2" + + prompt = "hey " + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images + assert out.shape == (1, 128, 128, 3) + def test_download_ignore_files(self): # Check https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files/blob/72f58636e5508a218c6b3f60550dc96445547817/model_index.json#L4 with tempfile.TemporaryDirectory() as tmpdirname: From bb22d546c062ae768a9f54a9eb1675f2a8dcdad9 Mon Sep 17 00:00:00 2001 From: Denis Date: Tue, 30 May 2023 15:13:45 +0300 Subject: [PATCH 051/199] [Community] CLIP Guided Images Mixing with Stable DIffusion Pipeline (#3587) * added clip_guided_images_mixing_stable_diffusion file and readme description * apply pre-commit --------- Co-authored-by: Patrick von Platen --- examples/community/README.md | 84 +++ ...p_guided_images_mixing_stable_diffusion.py | 512 
++++++++++++++++++ 2 files changed, 596 insertions(+) create mode 100644 examples/community/clip_guided_images_mixing_stable_diffusion.py diff --git a/examples/community/README.md b/examples/community/README.md index 21fba38e690b..8afd3aed18a6 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -36,6 +36,7 @@ If a community doesn't work as expected, please open an issue and ping the autho | Stable Diffusion RePaint | Stable Diffusion pipeline using [RePaint](https://arxiv.org/abs/2201.0986) for inpainting. | [Stable Diffusion RePaint](#stable-diffusion-repaint ) | - | [Markus Pobitzer](https://github.com/Markus-Pobitzer) | | TensorRT Stable Diffusion Image to Image Pipeline | Accelerates the Stable Diffusion Image2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Image to Image Pipeline](#tensorrt-image2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) | | Stable Diffusion IPEX Pipeline | Accelerate Stable Diffusion inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion on IPEX](#stable-diffusion-on-ipex) | - | [Yingjie Han](https://github.com/yingjie-han/) | +| CLIP Guided Images Mixing Stable Diffusion Pipeline | Combine images using standard diffusion models. | [CLIP Guided Images Mixing Using Stable Diffusion](#clip-guided-images-mixing-with-stable-diffusion) | - | [Karachev Denis](https://github.com/TheDenk) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. ```py @@ -1515,6 +1516,89 @@ latency = elapsed_time(pipe4) print("Latency of StableDiffusionPipeline--fp32",latency) ``` + +### CLIP Guided Images Mixing With Stable Diffusion + +![clip_guided_images_mixing_examples](https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/main.png) + +The CLIP guided Stable Diffusion images mixing pipeline allows combining two images using standard diffusion models. +This approach uses an (optional) CoCa model to avoid having to write an image description.
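Note that CoCa is only used to auto-caption the inputs; if `content_prompt` and `style_prompt` are supplied manually, no CoCa components are needed. A minimal sketch of that path follows (the checkpoint, CLIP model, and image URLs mirror the CoCa example below; the two prompt strings are illustrative placeholders, not values from the original example):

```python
import requests
from io import BytesIO

import PIL
import torch
from diffusers import DiffusionPipeline
from transformers import CLIPFeatureExtractor, CLIPModel


def download_image(url):
    response = requests.get(url)
    return PIL.Image.open(BytesIO(response.content)).convert("RGB")


# CLIP model and feature extractor used for guidance (same checkpoints as in the CoCa example below)
feature_extractor = CLIPFeatureExtractor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")
clip_model = CLIPModel.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16)

# No coca_model / coca_tokenizer / coca_transform are passed, so prompts must be given explicitly
mixing_pipeline = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    custom_pipeline="clip_guided_images_mixing_stable_diffusion",
    clip_model=clip_model,
    feature_extractor=feature_extractor,
    torch_dtype=torch.float16,
).to("cuda")

content_image = download_image("https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/boromir.jpg")
style_image = download_image("https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/gigachad.jpg")

image = mixing_pipeline(
    content_image=content_image,
    style_image=style_image,
    content_prompt="a portrait of a man in armor",  # illustrative prompt
    style_prompt="a portrait of a muscular man",  # illustrative prompt
    num_inference_steps=50,
    noise_strength=0.65,
    clip_guidance_scale=100,
    generator=torch.Generator(device="cuda").manual_seed(17),
).images[0]
```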
+[More code examples](https://github.com/TheDenk/images_mixing) + +## Example Images Mixing (with CoCa) +```python +import requests +from io import BytesIO + +import PIL +import torch +import open_clip +from open_clip import SimpleTokenizer +from diffusers import DiffusionPipeline +from transformers import CLIPFeatureExtractor, CLIPModel + + +def download_image(url): + response = requests.get(url) + return PIL.Image.open(BytesIO(response.content)).convert("RGB") + +# Loading additional models +feature_extractor = CLIPFeatureExtractor.from_pretrained( + "laion/CLIP-ViT-B-32-laion2B-s34B-b79K" +) +clip_model = CLIPModel.from_pretrained( + "laion/CLIP-ViT-B-32-laion2B-s34B-b79K", torch_dtype=torch.float16 +) +coca_model = open_clip.create_model('coca_ViT-L-14', pretrained='laion2B-s13B-b90k').to('cuda') +coca_model.dtype = torch.float16 +coca_transform = open_clip.image_transform( + coca_model.visual.image_size, + is_train = False, + mean = getattr(coca_model.visual, 'image_mean', None), + std = getattr(coca_model.visual, 'image_std', None), +) +coca_tokenizer = SimpleTokenizer() + +# Pipline creating +mixing_pipeline = DiffusionPipeline.from_pretrained( + "CompVis/stable-diffusion-v1-4", + custom_pipeline="clip_guided_images_mixing_stable_diffusion", + clip_model=clip_model, + feature_extractor=feature_extractor, + coca_model=coca_model, + coca_tokenizer=coca_tokenizer, + coca_transform=coca_transform, + torch_dtype=torch.float16, +) +mixing_pipeline.enable_attention_slicing() +mixing_pipeline = mixing_pipeline.to("cuda") + +# Pipline running +generator = torch.Generator(device="cuda").manual_seed(17) + +def download_image(url): + response = requests.get(url) + return PIL.Image.open(BytesIO(response.content)).convert("RGB") + +content_image = download_image("https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/boromir.jpg") +style_image = download_image("https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/gigachad.jpg") + +pipe_images = mixing_pipeline( + num_inference_steps=50, + content_image=content_image, + style_image=style_image, + noise_strength=0.65, + slerp_latent_style_strength=0.9, + slerp_prompt_style_strength=0.1, + slerp_clip_image_style_strength=0.1, + guidance_scale=9.0, + batch_size=1, + clip_guidance_scale=100, + generator=generator, +).images +``` + +![image_mixing_result](https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/boromir_gigachad.png) ### Stable Diffusion Mixture diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py new file mode 100644 index 000000000000..97e3c6627b76 --- /dev/null +++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -0,0 +1,512 @@ +# -*- coding: utf-8 -*- +import inspect +from typing import Optional, Union + +import numpy as np +import PIL +import torch +from torch.nn import functional as F +from torchvision import transforms +from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextModel, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + LMSDiscreteScheduler, + PNDMScheduler, + UNet2DConditionModel, +) +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipelineOutput +from diffusers.utils import ( + PIL_INTERPOLATION, + randn_tensor, +) + + +def preprocess(image, w, h): + if isinstance(image, torch.Tensor): + return image + elif isinstance(image, PIL.Image.Image): 
+ image = [image] + + if isinstance(image[0], PIL.Image.Image): + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION['lanczos']))[ + None, :] for i in image] + image = np.concatenate(image, axis=0) + image = np.array(image).astype(np.float32) / 255.0 + image = image.transpose(0, 3, 1, 2) + image = 2.0 * image - 1.0 + image = torch.from_numpy(image) + elif isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + return image + + +def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): + + if not isinstance(v0, np.ndarray): + inputs_are_torch = True + input_device = v0.device + v0 = v0.cpu().numpy() + v1 = v1.cpu().numpy() + + dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1))) + if np.abs(dot) > DOT_THRESHOLD: + v2 = (1 - t) * v0 + t * v1 + else: + theta_0 = np.arccos(dot) + sin_theta_0 = np.sin(theta_0) + theta_t = theta_0 * t + sin_theta_t = np.sin(theta_t) + s0 = np.sin(theta_0 - theta_t) / sin_theta_0 + s1 = sin_theta_t / sin_theta_0 + v2 = s0 * v0 + s1 * v1 + + if inputs_are_torch: + v2 = torch.from_numpy(v2).to(input_device) + + return v2 + + +def spherical_dist_loss(x, y): + x = F.normalize(x, dim=-1) + y = F.normalize(y, dim=-1) + return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2) + + +def set_requires_grad(model, value): + for param in model.parameters(): + param.requires_grad = value + + +class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline): + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + clip_model: CLIPModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[PNDMScheduler, LMSDiscreteScheduler, DDIMScheduler, DPMSolverMultistepScheduler], + feature_extractor: CLIPFeatureExtractor, + coca_model=None, + coca_tokenizer=None, + coca_transform=None, + ): + super().__init__() + self.register_modules( + vae=vae, + text_encoder=text_encoder, + clip_model=clip_model, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + feature_extractor=feature_extractor, + coca_model=coca_model, + coca_tokenizer=coca_tokenizer, + coca_transform=coca_transform, + ) + self.feature_extractor_size = ( + feature_extractor.size + if isinstance(feature_extractor.size, int) + else feature_extractor.size['shortest_edge'] + ) + self.normalize = transforms.Normalize( + mean=feature_extractor.image_mean, std=feature_extractor.image_std) + set_requires_grad(self.text_encoder, False) + set_requires_grad(self.clip_model, False) + + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = 'auto'): + if slice_size == 'auto': + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = self.unet.config.attention_head_dim // 2 + self.unet.set_attention_slice(slice_size) + + def disable_attention_slicing(self): + self.enable_attention_slicing(None) + + def freeze_vae(self): + set_requires_grad(self.vae, False) + + def unfreeze_vae(self): + set_requires_grad(self.vae, True) + + def freeze_unet(self): + set_requires_grad(self.unet, False) + + def unfreeze_unet(self): + set_requires_grad(self.unet, True) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min( + int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, dtype, device, generator=None): + if 
not isinstance(image, torch.Tensor): + raise ValueError( + f'`image` has to be of type `torch.Tensor` but is {type(image)}' + ) + + image = image.to(device=device, dtype=dtype) + + if isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) + + # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor + init_latents = 0.18215 * init_latents + init_latents = init_latents.repeat_interleave(batch_size, dim=0) + + noise = randn_tensor(init_latents.shape, + generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + def get_image_description(self, image): + transformed_image = self.coca_transform(image).unsqueeze(0) + with torch.no_grad(), torch.cuda.amp.autocast(): + generated = self.coca_model.generate(transformed_image.to( + device=self.device, dtype=self.coca_model.dtype)) + generated = self.coca_tokenizer.decode(generated[0].cpu().numpy()) + return generated.split('')[0].replace('', '').rstrip(' .,') + + def get_clip_image_embeddings(self, image, batch_size): + clip_image_input = self.feature_extractor.preprocess(image) + clip_image_features = torch.from_numpy( + clip_image_input['pixel_values'][0]).unsqueeze(0).to(self.device).half() + image_embeddings_clip = self.clip_model.get_image_features( + clip_image_features) + image_embeddings_clip = image_embeddings_clip / \ + image_embeddings_clip.norm(p=2, dim=-1, keepdim=True) + image_embeddings_clip = image_embeddings_clip.repeat_interleave( + batch_size, dim=0) + return image_embeddings_clip + + @torch.enable_grad() + def cond_fn( + self, + latents, + timestep, + index, + text_embeddings, + noise_pred_original, + original_image_embeddings_clip, + clip_guidance_scale, + ): + latents = latents.detach().requires_grad_() + + latent_model_input = self.scheduler.scale_model_input( + latents, timestep) + + # predict the noise residual + noise_pred = self.unet(latent_model_input, timestep, + encoder_hidden_states=text_embeddings).sample + + if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)): + alpha_prod_t = self.scheduler.alphas_cumprod[timestep] + beta_prod_t = 1 - alpha_prod_t + # compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_original_sample = ( + latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) + + fac = torch.sqrt(beta_prod_t) + sample = pred_original_sample * (fac) + latents * (1 - fac) + elif isinstance(self.scheduler, LMSDiscreteScheduler): + sigma = self.scheduler.sigmas[index] + sample = latents - sigma * noise_pred + else: + raise ValueError( + f'scheduler type {type(self.scheduler)} not supported') + + # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor + sample = 1 / 0.18215 * sample + image = self.vae.decode(sample).sample + image = (image / 2 + 0.5).clamp(0, 1) + + image = transforms.Resize(self.feature_extractor_size)(image) + image = self.normalize(image).to(latents.dtype) + + image_embeddings_clip = self.clip_model.get_image_features(image) + image_embeddings_clip = image_embeddings_clip / \ + image_embeddings_clip.norm(p=2, dim=-1, keepdim=True) + + loss = 
spherical_dist_loss( + image_embeddings_clip, original_image_embeddings_clip).mean() * clip_guidance_scale + + grads = -torch.autograd.grad(loss, latents)[0] + + if isinstance(self.scheduler, LMSDiscreteScheduler): + latents = latents.detach() + grads * (sigma**2) + noise_pred = noise_pred_original + else: + noise_pred = noise_pred_original - torch.sqrt(beta_prod_t) * grads + return noise_pred, latents + + @torch.no_grad() + def __call__( + self, + style_image: Union[torch.FloatTensor, PIL.Image.Image], + content_image: Union[torch.FloatTensor, PIL.Image.Image], + style_prompt: Optional[str] = None, + content_prompt: Optional[str] = None, + height: Optional[int] = 512, + width: Optional[int] = 512, + noise_strength: float = 0.6, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + batch_size: Optional[int] = 1, + eta: float = 0.0, + clip_guidance_scale: Optional[float] = 100, + generator: Optional[torch.Generator] = None, + output_type: Optional[str] = 'pil', + return_dict: bool = True, + slerp_latent_style_strength: float = 0.8, + slerp_prompt_style_strength: float = 0.1, + slerp_clip_image_style_strength: float = 0.1, + ): + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f'You have passed {batch_size} batch_size, but only {len(generator)} generators.') + + if height % 8 != 0 or width % 8 != 0: + raise ValueError( + f'`height` and `width` have to be divisible by 8 but are {height} and {width}.') + + if isinstance(generator, torch.Generator) and batch_size > 1: + generator = [generator] + [None] * (batch_size - 1) + + coca_is_none = [ + ('model', self.coca_model is None), + ('tokenizer', self.coca_tokenizer is None), + ('transform', self.coca_transform is None) + ] + coca_is_none = [x[0] for x in coca_is_none if x[1]] + coca_is_none_str = ', '.join(coca_is_none) + # generate prompts with coca model if prompt is None + if content_prompt is None: + if len(coca_is_none): + raise ValueError( + f'Content prompt is None and CoCa [{coca_is_none_str}] is None.' + f'Set prompt or pass Coca [{coca_is_none_str}] to DiffusionPipeline.' + ) + content_prompt = self.get_image_description(content_image) + if style_prompt is None: + if len(coca_is_none): + raise ValueError( + f'Style prompt is None and CoCa [{coca_is_none_str}] is None.' + f' Set prompt or pass Coca [{coca_is_none_str}] to DiffusionPipeline.' 
+ ) + style_prompt = self.get_image_description(style_image) + + # get prompt text embeddings for content and style + content_text_input = self.tokenizer( + content_prompt, + padding='max_length', + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors='pt', + ) + content_text_embeddings = self.text_encoder( + content_text_input.input_ids.to(self.device))[0] + + style_text_input = self.tokenizer( + style_prompt, + padding='max_length', + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors='pt', + ) + style_text_embeddings = self.text_encoder( + style_text_input.input_ids.to(self.device))[0] + + text_embeddings = slerp( + slerp_prompt_style_strength, content_text_embeddings, style_text_embeddings) + + # duplicate text embeddings for each generation per prompt + text_embeddings = text_embeddings.repeat_interleave(batch_size, dim=0) + + # set timesteps + accepts_offset = 'offset' in set(inspect.signature( + self.scheduler.set_timesteps).parameters.keys()) + extra_set_kwargs = {} + if accepts_offset: + extra_set_kwargs['offset'] = 1 + + self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) + # Some schedulers like PNDM have timesteps as arrays + # It's more optimized to move all timesteps to correct device beforehand + self.scheduler.timesteps.to(self.device) + + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps, noise_strength, self.device) + latent_timestep = timesteps[:1].repeat(batch_size) + + # Preprocess image + preprocessed_content_image = preprocess(content_image, width, height) + content_latents = self.prepare_latents( + preprocessed_content_image, + latent_timestep, + batch_size, + text_embeddings.dtype, + self.device, + generator + ) + + preprocessed_style_image = preprocess(style_image, width, height) + style_latents = self.prepare_latents( + preprocessed_style_image, + latent_timestep, + batch_size, + text_embeddings.dtype, + self.device, + generator + ) + + latents = slerp(slerp_latent_style_strength, + content_latents, style_latents) + + if clip_guidance_scale > 0: + content_clip_image_embedding = self.get_clip_image_embeddings( + content_image, batch_size) + style_clip_image_embedding = self.get_clip_image_embeddings( + style_image, batch_size) + clip_image_embeddings = slerp( + slerp_clip_image_style_strength, content_clip_image_embedding, style_clip_image_embedding) + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + max_length = content_text_input.input_ids.shape[-1] + uncond_input = self.tokenizer( + [''], padding='max_length', max_length=max_length, return_tensors='pt') + uncond_embeddings = self.text_encoder( + uncond_input.input_ids.to(self.device))[0] + # duplicate unconditional embeddings for each generation per prompt + uncond_embeddings = uncond_embeddings.repeat_interleave( + batch_size, dim=0) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) + + # get the initial random noise unless the user supplied it + + # Unlike in other pipelines, latents need to be generated in the target device + # for 1-to-1 results reproducibility with the CompVis implementation. + # However this currently doesn't work in `mps`. + latents_shape = ( + batch_size, self.unet.config.in_channels, height // 8, width // 8) + latents_dtype = text_embeddings.dtype + if latents is None: + if self.device.type == 'mps': + # randn does not work reproducibly on mps + latents = torch.randn( + latents_shape, + generator=generator, + device='cpu', + dtype=latents_dtype + ).to(self.device) + else: + latents = torch.randn( + latents_shape, generator=generator, device=self.device, dtype=latents_dtype) + else: + if latents.shape != latents_shape: + raise ValueError( + f'Unexpected latents shape, got {latents.shape}, expected {latents_shape}') + latents = latents.to(self.device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = 'eta' in set(inspect.signature( + self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs['eta'] = eta + + # check if the scheduler accepts generator + accepts_generator = 'generator' in set( + inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs['generator'] = generator + + with self.progress_bar(total=num_inference_steps): + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat( + [latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, t, encoder_hidden_states=text_embeddings).sample + + # perform classifier free guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * \ + (noise_pred_text - noise_pred_uncond) + + # perform clip guidance + if clip_guidance_scale > 0: + text_embeddings_for_guidance = ( + text_embeddings.chunk( + 2)[1] if do_classifier_free_guidance else text_embeddings + ) + noise_pred, latents = self.cond_fn( + latents, + t, + i, + text_embeddings_for_guidance, + noise_pred, + clip_image_embeddings, + clip_guidance_scale, + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor + latents = 1 / 0.18215 * latents + image = self.vae.decode(latents).sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + + if output_type == 'pil': + image = self.numpy_to_pil(image) + + if not return_dict: + return (image, None) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=None) From 
160c377ddc22c8c64b45829c541a9616c9310a7f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 May 2023 13:14:09 +0100 Subject: [PATCH 052/199] Make style --- ...p_guided_images_mixing_stable_diffusion.py | 196 +++++++----------- 1 file changed, 70 insertions(+), 126 deletions(-) diff --git a/examples/community/clip_guided_images_mixing_stable_diffusion.py b/examples/community/clip_guided_images_mixing_stable_diffusion.py index 97e3c6627b76..e4c52fe63f49 100644 --- a/examples/community/clip_guided_images_mixing_stable_diffusion.py +++ b/examples/community/clip_guided_images_mixing_stable_diffusion.py @@ -32,8 +32,7 @@ def preprocess(image, w, h): image = [image] if isinstance(image[0], PIL.Image.Image): - image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION['lanczos']))[ - None, :] for i in image] + image = [np.array(i.resize((w, h), resample=PIL_INTERPOLATION["lanczos"]))[None, :] for i in image] image = np.concatenate(image, axis=0) image = np.array(image).astype(np.float32) / 255.0 image = image.transpose(0, 3, 1, 2) @@ -45,7 +44,6 @@ def preprocess(image, w, h): def slerp(t, v0, v1, DOT_THRESHOLD=0.9995): - if not isinstance(v0, np.ndarray): inputs_are_torch = True input_device = v0.device @@ -82,7 +80,6 @@ def set_requires_grad(model, value): class CLIPGuidedImagesMixingStableDiffusion(DiffusionPipeline): - def __init__( self, vae: AutoencoderKL, @@ -112,15 +109,14 @@ def __init__( self.feature_extractor_size = ( feature_extractor.size if isinstance(feature_extractor.size, int) - else feature_extractor.size['shortest_edge'] + else feature_extractor.size["shortest_edge"] ) - self.normalize = transforms.Normalize( - mean=feature_extractor.image_mean, std=feature_extractor.image_std) + self.normalize = transforms.Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std) set_requires_grad(self.text_encoder, False) set_requires_grad(self.clip_model, False) - def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = 'auto'): - if slice_size == 'auto': + def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"): + if slice_size == "auto": # half the attention head size is usually a good trade-off between # speed and memory slice_size = self.unet.config.attention_head_dim // 2 @@ -143,8 +139,7 @@ def unfreeze_unet(self): def get_timesteps(self, num_inference_steps, strength, device): # get the original timestep using init_timestep - init_timestep = min( - int(num_inference_steps * strength), num_inference_steps) + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) t_start = max(num_inference_steps - init_timestep, 0) timesteps = self.scheduler.timesteps[t_start:] @@ -153,15 +148,13 @@ def get_timesteps(self, num_inference_steps, strength, device): def prepare_latents(self, image, timestep, batch_size, dtype, device, generator=None): if not isinstance(image, torch.Tensor): - raise ValueError( - f'`image` has to be of type `torch.Tensor` but is {type(image)}' - ) + raise ValueError(f"`image` has to be of type `torch.Tensor` but is {type(image)}") image = image.to(device=device, dtype=dtype) if isinstance(generator, list): init_latents = [ - self.vae.encode(image[i: i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) ] init_latents = torch.cat(init_latents, dim=0) else: @@ -171,8 +164,7 @@ def prepare_latents(self, image, timestep, batch_size, dtype, device, generator= 
init_latents = 0.18215 * init_latents init_latents = init_latents.repeat_interleave(batch_size, dim=0) - noise = randn_tensor(init_latents.shape, - generator=generator, device=device, dtype=dtype) + noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype) # get latents init_latents = self.scheduler.add_noise(init_latents, noise, timestep) @@ -183,21 +175,16 @@ def prepare_latents(self, image, timestep, batch_size, dtype, device, generator= def get_image_description(self, image): transformed_image = self.coca_transform(image).unsqueeze(0) with torch.no_grad(), torch.cuda.amp.autocast(): - generated = self.coca_model.generate(transformed_image.to( - device=self.device, dtype=self.coca_model.dtype)) + generated = self.coca_model.generate(transformed_image.to(device=self.device, dtype=self.coca_model.dtype)) generated = self.coca_tokenizer.decode(generated[0].cpu().numpy()) - return generated.split('')[0].replace('', '').rstrip(' .,') + return generated.split("")[0].replace("", "").rstrip(" .,") def get_clip_image_embeddings(self, image, batch_size): clip_image_input = self.feature_extractor.preprocess(image) - clip_image_features = torch.from_numpy( - clip_image_input['pixel_values'][0]).unsqueeze(0).to(self.device).half() - image_embeddings_clip = self.clip_model.get_image_features( - clip_image_features) - image_embeddings_clip = image_embeddings_clip / \ - image_embeddings_clip.norm(p=2, dim=-1, keepdim=True) - image_embeddings_clip = image_embeddings_clip.repeat_interleave( - batch_size, dim=0) + clip_image_features = torch.from_numpy(clip_image_input["pixel_values"][0]).unsqueeze(0).to(self.device).half() + image_embeddings_clip = self.clip_model.get_image_features(clip_image_features) + image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True) + image_embeddings_clip = image_embeddings_clip.repeat_interleave(batch_size, dim=0) return image_embeddings_clip @torch.enable_grad() @@ -213,20 +200,17 @@ def cond_fn( ): latents = latents.detach().requires_grad_() - latent_model_input = self.scheduler.scale_model_input( - latents, timestep) + latent_model_input = self.scheduler.scale_model_input(latents, timestep) # predict the noise residual - noise_pred = self.unet(latent_model_input, timestep, - encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, timestep, encoder_hidden_states=text_embeddings).sample if isinstance(self.scheduler, (PNDMScheduler, DDIMScheduler, DPMSolverMultistepScheduler)): alpha_prod_t = self.scheduler.alphas_cumprod[timestep] beta_prod_t = 1 - alpha_prod_t # compute predicted original sample from predicted noise also called # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - pred_original_sample = ( - latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) + pred_original_sample = (latents - beta_prod_t ** (0.5) * noise_pred) / alpha_prod_t ** (0.5) fac = torch.sqrt(beta_prod_t) sample = pred_original_sample * (fac) + latents * (1 - fac) @@ -234,8 +218,7 @@ def cond_fn( sigma = self.scheduler.sigmas[index] sample = latents - sigma * noise_pred else: - raise ValueError( - f'scheduler type {type(self.scheduler)} not supported') + raise ValueError(f"scheduler type {type(self.scheduler)} not supported") # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor sample = 1 / 0.18215 * sample @@ -246,11 +229,9 @@ def cond_fn( image = self.normalize(image).to(latents.dtype) image_embeddings_clip 
= self.clip_model.get_image_features(image) - image_embeddings_clip = image_embeddings_clip / \ - image_embeddings_clip.norm(p=2, dim=-1, keepdim=True) + image_embeddings_clip = image_embeddings_clip / image_embeddings_clip.norm(p=2, dim=-1, keepdim=True) - loss = spherical_dist_loss( - image_embeddings_clip, original_image_embeddings_clip).mean() * clip_guidance_scale + loss = spherical_dist_loss(image_embeddings_clip, original_image_embeddings_clip).mean() * clip_guidance_scale grads = -torch.autograd.grad(loss, latents)[0] @@ -277,121 +258,101 @@ def __call__( eta: float = 0.0, clip_guidance_scale: Optional[float] = 100, generator: Optional[torch.Generator] = None, - output_type: Optional[str] = 'pil', + output_type: Optional[str] = "pil", return_dict: bool = True, slerp_latent_style_strength: float = 0.8, slerp_prompt_style_strength: float = 0.1, slerp_clip_image_style_strength: float = 0.1, ): - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f'You have passed {batch_size} batch_size, but only {len(generator)} generators.') + raise ValueError(f"You have passed {batch_size} batch_size, but only {len(generator)} generators.") if height % 8 != 0 or width % 8 != 0: - raise ValueError( - f'`height` and `width` have to be divisible by 8 but are {height} and {width}.') + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") if isinstance(generator, torch.Generator) and batch_size > 1: generator = [generator] + [None] * (batch_size - 1) coca_is_none = [ - ('model', self.coca_model is None), - ('tokenizer', self.coca_tokenizer is None), - ('transform', self.coca_transform is None) + ("model", self.coca_model is None), + ("tokenizer", self.coca_tokenizer is None), + ("transform", self.coca_transform is None), ] coca_is_none = [x[0] for x in coca_is_none if x[1]] - coca_is_none_str = ', '.join(coca_is_none) + coca_is_none_str = ", ".join(coca_is_none) # generate prompts with coca model if prompt is None if content_prompt is None: if len(coca_is_none): raise ValueError( - f'Content prompt is None and CoCa [{coca_is_none_str}] is None.' - f'Set prompt or pass Coca [{coca_is_none_str}] to DiffusionPipeline.' + f"Content prompt is None and CoCa [{coca_is_none_str}] is None." + f"Set prompt or pass Coca [{coca_is_none_str}] to DiffusionPipeline." ) content_prompt = self.get_image_description(content_image) if style_prompt is None: if len(coca_is_none): raise ValueError( - f'Style prompt is None and CoCa [{coca_is_none_str}] is None.' - f' Set prompt or pass Coca [{coca_is_none_str}] to DiffusionPipeline.' + f"Style prompt is None and CoCa [{coca_is_none_str}] is None." + f" Set prompt or pass Coca [{coca_is_none_str}] to DiffusionPipeline." 
) style_prompt = self.get_image_description(style_image) # get prompt text embeddings for content and style content_text_input = self.tokenizer( content_prompt, - padding='max_length', + padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors='pt', + return_tensors="pt", ) - content_text_embeddings = self.text_encoder( - content_text_input.input_ids.to(self.device))[0] + content_text_embeddings = self.text_encoder(content_text_input.input_ids.to(self.device))[0] style_text_input = self.tokenizer( style_prompt, - padding='max_length', + padding="max_length", max_length=self.tokenizer.model_max_length, truncation=True, - return_tensors='pt', + return_tensors="pt", ) - style_text_embeddings = self.text_encoder( - style_text_input.input_ids.to(self.device))[0] + style_text_embeddings = self.text_encoder(style_text_input.input_ids.to(self.device))[0] - text_embeddings = slerp( - slerp_prompt_style_strength, content_text_embeddings, style_text_embeddings) + text_embeddings = slerp(slerp_prompt_style_strength, content_text_embeddings, style_text_embeddings) # duplicate text embeddings for each generation per prompt text_embeddings = text_embeddings.repeat_interleave(batch_size, dim=0) # set timesteps - accepts_offset = 'offset' in set(inspect.signature( - self.scheduler.set_timesteps).parameters.keys()) + accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) extra_set_kwargs = {} if accepts_offset: - extra_set_kwargs['offset'] = 1 + extra_set_kwargs["offset"] = 1 self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) # Some schedulers like PNDM have timesteps as arrays # It's more optimized to move all timesteps to correct device beforehand self.scheduler.timesteps.to(self.device) - timesteps, num_inference_steps = self.get_timesteps( - num_inference_steps, noise_strength, self.device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, noise_strength, self.device) latent_timestep = timesteps[:1].repeat(batch_size) # Preprocess image preprocessed_content_image = preprocess(content_image, width, height) content_latents = self.prepare_latents( - preprocessed_content_image, - latent_timestep, - batch_size, - text_embeddings.dtype, - self.device, - generator + preprocessed_content_image, latent_timestep, batch_size, text_embeddings.dtype, self.device, generator ) preprocessed_style_image = preprocess(style_image, width, height) style_latents = self.prepare_latents( - preprocessed_style_image, - latent_timestep, - batch_size, - text_embeddings.dtype, - self.device, - generator + preprocessed_style_image, latent_timestep, batch_size, text_embeddings.dtype, self.device, generator ) - latents = slerp(slerp_latent_style_strength, - content_latents, style_latents) + latents = slerp(slerp_latent_style_strength, content_latents, style_latents) if clip_guidance_scale > 0: - content_clip_image_embedding = self.get_clip_image_embeddings( - content_image, batch_size) - style_clip_image_embedding = self.get_clip_image_embeddings( - style_image, batch_size) + content_clip_image_embedding = self.get_clip_image_embeddings(content_image, batch_size) + style_clip_image_embedding = self.get_clip_image_embeddings(style_image, batch_size) clip_image_embeddings = slerp( - slerp_clip_image_style_strength, content_clip_image_embedding, style_clip_image_embedding) + slerp_clip_image_style_strength, content_clip_image_embedding, style_clip_image_embedding + ) # here `guidance_scale` is defined analog 
to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -400,13 +361,10 @@ def __call__( # get unconditional embeddings for classifier free guidance if do_classifier_free_guidance: max_length = content_text_input.input_ids.shape[-1] - uncond_input = self.tokenizer( - [''], padding='max_length', max_length=max_length, return_tensors='pt') - uncond_embeddings = self.text_encoder( - uncond_input.input_ids.to(self.device))[0] + uncond_input = self.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt") + uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0] # duplicate unconditional embeddings for each generation per prompt - uncond_embeddings = uncond_embeddings.repeat_interleave( - batch_size, dim=0) + uncond_embeddings = uncond_embeddings.repeat_interleave(batch_size, dim=0) # For classifier free guidance, we need to do two forward passes. # Here we concatenate the unconditional and text embeddings into a single batch @@ -418,25 +376,19 @@ def __call__( # Unlike in other pipelines, latents need to be generated in the target device # for 1-to-1 results reproducibility with the CompVis implementation. # However this currently doesn't work in `mps`. - latents_shape = ( - batch_size, self.unet.config.in_channels, height // 8, width // 8) + latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8) latents_dtype = text_embeddings.dtype if latents is None: - if self.device.type == 'mps': + if self.device.type == "mps": # randn does not work reproducibly on mps - latents = torch.randn( - latents_shape, - generator=generator, - device='cpu', - dtype=latents_dtype - ).to(self.device) + latents = torch.randn(latents_shape, generator=generator, device="cpu", dtype=latents_dtype).to( + self.device + ) else: - latents = torch.randn( - latents_shape, generator=generator, device=self.device, dtype=latents_dtype) + latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype) else: if latents.shape != latents_shape: - raise ValueError( - f'Unexpected latents shape, got {latents.shape}, expected {latents_shape}') + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") latents = latents.to(self.device) # scale the initial noise by the standard deviation required by the scheduler @@ -446,41 +398,34 @@ def __call__( # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 # and should be between [0, 1] - accepts_eta = 'eta' in set(inspect.signature( - self.scheduler.step).parameters.keys()) + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: - extra_step_kwargs['eta'] = eta + extra_step_kwargs["eta"] = eta # check if the scheduler accepts generator - accepts_generator = 'generator' in set( - inspect.signature(self.scheduler.step).parameters.keys()) + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) if accepts_generator: - extra_step_kwargs['generator'] = generator + extra_step_kwargs["generator"] = generator with self.progress_bar(total=num_inference_steps): for i, t in enumerate(timesteps): # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat( - [latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input( - latent_model_input, t) + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet( - latent_model_input, t, encoder_hidden_states=text_embeddings).sample + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample # perform classifier free guidance if do_classifier_free_guidance: noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - noise_pred = noise_pred_uncond + guidance_scale * \ - (noise_pred_text - noise_pred_uncond) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # perform clip guidance if clip_guidance_scale > 0: text_embeddings_for_guidance = ( - text_embeddings.chunk( - 2)[1] if do_classifier_free_guidance else text_embeddings + text_embeddings.chunk(2)[1] if do_classifier_free_guidance else text_embeddings ) noise_pred, latents = self.cond_fn( latents, @@ -493,8 +438,7 @@ def __call__( ) # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - noise_pred, t, latents, **extra_step_kwargs).prev_sample + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample # Hardcode 0.18215 because stable-diffusion-2-base has not self.vae.config.scaling_factor latents = 1 / 0.18215 * latents @@ -503,7 +447,7 @@ def __call__( image = (image / 2 + 0.5).clamp(0, 1) image = image.cpu().permute(0, 2, 3, 1).numpy() - if output_type == 'pil': + if output_type == "pil": image = self.numpy_to_pil(image) if not return_dict: From a359ab4e29c8cc064bde3c828d35912fb15c9bd2 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 May 2023 18:26:32 +0100 Subject: [PATCH 053/199] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9d06cf1099c9..ab37c629102a 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@


[README.md hunk not reproduced: the HTML `<p>`/`<img>` banner markup at the top of the README was stripped during extraction, leaving only bare `-`/`+` markers; the commit replaces a single line of that block.]
From 9d3ff0794d7bada13c01ca0e517377d776cf48ff Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 May 2023 18:59:07 +0100 Subject: [PATCH 054/199] fix tests (#3614) --- examples/community/{ mixture.py => mixture.py} | 0 tests/pipelines/stable_diffusion/test_stable_diffusion.py | 5 ++--- 2 files changed, 2 insertions(+), 3 deletions(-) rename examples/community/{ mixture.py => mixture.py} (100%) diff --git a/examples/community/ mixture.py b/examples/community/mixture.py similarity index 100% rename from examples/community/ mixture.py rename to examples/community/mixture.py diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index b5d968e2a39e..dafd00321527 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -41,7 +41,6 @@ from diffusers.utils import load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import ( CaptureLogger, - disable_full_determinism, enable_full_determinism, require_torch_2, require_torch_gpu, @@ -370,7 +369,7 @@ def test_stable_diffusion_pndm(self): @unittest.skipIf(not torch.cuda.is_available(), reason="xformers requires cuda") def test_stable_diffusion_attn_processors(self): - disable_full_determinism() + # disable_full_determinism() device = "cuda" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**components) @@ -416,7 +415,7 @@ def test_stable_diffusion_attn_processors(self): image = sd_pipe(**inputs).images assert image.shape == (1, 64, 64, 3) - enable_full_determinism() + # enable_full_determinism() def test_stable_diffusion_no_safety_checker(self): pipe = StableDiffusionPipeline.from_pretrained( From 0cc3a7a1232cc8725104b458ded08b8af6130d10 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 30 May 2023 20:47:14 +0100 Subject: [PATCH 055/199] Make sure we also change the config when setting `encoder_hid_dim_type=="text_proj"` and allow xformers (#3615) * fix if * make style * make style * add tests for xformers * make style * update --- examples/community/mixture.py | 7 +- frog.png | Bin 0 -> 110438 bytes src/diffusers/models/attention_processor.py | 97 ++++++++++++++++-- src/diffusers/models/unet_2d_condition.py | 1 + .../versatile_diffusion/modeling_text_unet.py | 1 + tests/pipelines/deepfloyd_if/test_if.py | 10 +- .../pipelines/deepfloyd_if/test_if_img2img.py | 10 +- .../test_if_img2img_superresolution.py | 10 +- .../deepfloyd_if/test_if_inpainting.py | 10 +- .../test_if_inpainting_superresolution.py | 10 +- .../deepfloyd_if/test_if_superresolution.py | 10 +- 11 files changed, 141 insertions(+), 25 deletions(-) create mode 100644 frog.png diff --git a/examples/community/mixture.py b/examples/community/mixture.py index 60d0ee2d09d3..845ad76b6a2e 100644 --- a/examples/community/mixture.py +++ b/examples/community/mixture.py @@ -215,11 +215,8 @@ def __call__( raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") if isinstance(seed_tiles_mode, str): seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] - if any( - mode not in (modes := [mode.value for mode in self.SeedTilesMode]) - for row in seed_tiles_mode - for mode in row - ): + modes = [mode.value for mode in self.SeedTilesMode] + if any(mode not in modes for row in seed_tiles_mode for mode in row): raise ValueError(f"Seed tiles mode must be one of 
{modes}") if seed_reroll_regions is None: seed_reroll_regions = [] diff --git a/frog.png b/frog.png new file mode 100644 index 0000000000000000000000000000000000000000..dce094c892a958a6f8dbf1ae30fe09e8a295fbb8 GIT binary patch literal 110438 zcmV()K;OTKP)L!Prp(uXU{}-3ml0?o(T;!5C-9256DggJ9 znGxZ~Z0my^_sDxuO=Wdo+{~L98SZY!Y}>YNW_^5+sE~ zlB6&tBtejbWTud&OhHNmgz+JWAhr+IzYr2gNB{(o5J&(5fdq*9|M7K_AlVG=oSIFJ zX)@% zrIe`@A!KHxASxA!2y`h!S(mPQ)b+>-Bw2=F^|=Vr>fwMi$P8xV!Mn)}G&9l(Hjpx> z8R#$q4yWV^fi9A8K!AZv5};@_Q!|M$nj0I~$diSS?;wC6MF>m@U~*U~LC}O$Dhy1f zfUKKJ(n0`{P#TrPCdx2|{j+SD(y;C_dhpqC0YDg$8b3fFAsPvhQV5`F=_xMBDa{%) z*pkyk%aAk0NfwzDAR~;B1c-D(@L{JA5>AsDMR-s~>*`2$EnO5lNgz`Idl*g1gfmh` z2P7&p$6ca{BqOOLGw7C?0FfO=lLW*j%MBp#~zI}xwNSFYQA2xtF!UX}KGYKTIfdmr;u2rqq2)uL~X4L6evmW})E?I39?sJTXCl`aZa23O%}Gb$34$-pdg zudg^*!K)mvkmZV$CHq`lsjAW+KFr2OGc9BAYNmw+t%mk4JfC|yF zHEMW)x>Lg#wLUS{6ECZjsCu7-ED1wYC{>mpHxcX4arG#5@;4ClEE){t^8rG_IFB3- zgU|m}symeh7EC}xEH*W!m@QIGNF_DJ1Scgr(Hoser==ysWoc&h6-Y{Rrlr$lA_)_X z?v}kXfIv!gB?2o>lNUSyBvl}{9A;c6X1LekUniOymHB zbt97S;WG?O3Nl$pq$GnN1QN+G8CHNwS+3S0DFKSG6~e+uP$_I+#Cjp6Itg6C59&aS zs~#7mL?Sh?D}`jnc>VG#7{9S>f0$EQ>h=|YtoMdY!3+qb2{40<24>2vi7cldk1~!mKw&$M_0BSxRCJhS$lSAulR{!llw3@~Qi;{S z2}zhFcNx-1BLOf`2%sg*#3YPLkOoIa5U`5#qz06wj07aucuVCVa^OFLFbv$hUd*sT zLRQHID9T3CRk#K~A--|DK@dqwBFroSQYn~BNwF(!L{3r6dj2_XS&j_G!eqwr&M4L; zVLW>RZU%^1FW{y?dG|GTL8%yk9)M*b;_=IvASk6JLdHscSb?15(2v_texVNk@R%48 zufl5mfI+E(tL8El2nU%SGzSW+1kmJSq{=_mgXNll(on~gq$mh!1>ezZoPf$mhgV+V znsj#Ag#rq>!N5QckVcav6Qh2B<+8w3tY-_`RSuKiT;*^BJl_Q2*&A8!(-s0M#tb`2 zfyh88m4!5ouoO5aPNq2p0|ta6TQC8drQ|4>*+2s@Trbm`tzJ+DDQQ$D640=I0LcL< z#x2V2%|WsXmrz~IhJGPd_RInA3kQyk3j?BU%1G)_CqYOz| zQFSs7b1Tm)tx9XP{okW(^v4PIXr+KfiW`nUzpE+tAsBhVRpM zGz%)M4`U^kL|c8P5>$tfDNIQrku3o=3~=lQcCnF>Y33CfG;oB`QbvXq46!xlLE=IK zkW!Zf#qgYjDqYJWXoMWdc4Ulo#W_4C49fBd8vmFisq!avqDYn(w*pnja*ASw4VfTw zs*_X#W;Tp*%_@f*8L_|e#JclkM6klMbZH=P2b8v1rWqnYz<>@ySy>a3ViqH)L}-vX zf@mxsln9kSfRx}MO7iTGwecQ0y9R7o@ELb&TzC&d6w*;jD&14$)Yl9ji?rQbL#7K|;i4~|h;&}o>P+OXW^_5nBN6)egZO^8n z+;+Uz?|$YU1~MF31+nFcRCl-;a>Kq@^Q z4meDZ5M13t{cpe+;e4Q1D6DarDHcaF@(Y8X zpT$W1>$emVrbM_>G*xFlD`L0-)>+Kypn_3$w|_vlXRc z<_z_h1XRh3jcX~ssK`+aI0#Bf+gMkuyupSvR?oMA-Did;WM!4KEf0$FFJwYN2-(o= z8I=s9kwps;D<4B)MXG6ah6~xZ91KL|D!IZAjeEZVk`)M{G6qV801V}7>S3)od&CN$ zvhXvSd3N1fc8ziO*E_VqqSb?v&u*K>FGms3HsE*NWhrcd(y>S)LW5JR7n>k6Oa==R zrF3UnrWza^kE)0__O|-YAWSgvL`8jxNnyq7VlquwbDl~W0hlaZNTUT}TtCf_D*C%P z1yKeA=yU{u2na+4g+c}hZtPK#qbOiP8Yp1E@8LBaOb@{U6%9!^6)AV7(#$0p3=&NB zkT_V+L{xZI6+%G5fFAV#gcQzPS13kws8G!f)elcR&aPHB`vJVgbH9kyL{>mdK*?fP zNyA7pGKEZ21CTUDgstd$ShgUm!VSD4=XhNc81=Fsjci;UE!N}`=a{7J{S~TDzzRcx zfQ+`*NiY~vX}tw?-?d)!mSB8<=fZq3hrVG10B&cZjwWf)U~f$!m=p?`A?QFK zFD*h=K2R`|(YuS?qF&QwU1^|=kY!dJ4_uzbkc>hgl7MA7*cC+sMjyb^G-^jG5MQXZ zl4)fY(?Y{)BbC`!Ni7WqLRol=KJHVF*t%Z50i@B8iBay^+(sXVRs<|kW#Q#9>6P0Q zp9ocf0ITyRXoP|};wlYD4eVFxrxn|{KAx(Qa=g+?k0J|b0hRKsj`6ZaJu}$G(sw5SE_k?h_G(|BVjII)|XdG`1X!8H+oB8CcP)BETqiZD(w4UZcuhdEV${f_rV99!kO`XzzyX8@L?L3us9*wSsM|r1 zsn{TA*}yYcISMocDXP9NDT>Aj!a6W2vRL8a2IjE_QK}-tdZKK2TbV4XPE>hpX#LI= z&(^;Jv@LlDKm!K)=Wu5jRHBiYF&-qKEiov>&jeR6kx-Stw$u(e-r4zM5Zr&mEv_IO ztBn9DCFS%$AiYwfq$EbMb~b2aWuYBaWh9YA8>Y9y4gpRO%H~3D?+DM((DN^D?s>#^ zY^m#T9TH$UsP8OvSbPy!xT>z|nUQ}K20!@98GC)m7;I5w7{y2*)|kP;;}$SRbdc;_ zjXI);pa8c5^_6gjHIbK487-xotVsdUsaCX986i;~=T(r;-7}#e{5>B{E@!jf-dT zk#bZV=ZIV?O@<^}v2q%$j~pNib16x{gcJ=#GDq>NtW4uQj)L3=i1$MS+wMu-nDvlv zyd=RYiV+H$m5obDr`0jePG>O}R!N9eqzI>nlC&J*gjEKWBdKhw^$+7CH5iQ5R10@( z;c47^u1;j*zK~VwF%BTMo(I@~auuq#dcgCO@bRY=gwE;G_6=utYI~KD(yz?uc%g(K zV~3mv(~JZqcvUTn%@Tad9$*6+V4g;iW};>Bc@2ayn_z+w%W&S7#twCaV44z@?&Sz1 
zhF7<$OMGoctrQl(N(u5JvCEK>2xrC=;1C*K&9tf40(Hh(p=qdv2L+&000KymAhKa1 zPL2YYfKo66N-!Z&5G{J7fLK`m1RkalhGuxi@8YsRL&L>_=RiON%jz(|mt)b=SYF@( z7!(FF5=eB=JC`DeyD!TlimfEnIBj_~E<#N~p^cU-Gmv#xo-6JS((yre^(%DW^+q;q@ zCTpc@NQcTZc-+qS})oD`0K}N%~Ecz=RQ29T5eEsuqD@ znt<5OnVG(qYzVbJnK|Vy>GX6sLBQ>0KVJ=fFDptNb=U&QgHu3fElo~kE~OpR9~B#S z3rpMs0Wdk%*4)BL7>UUN-NPM;06|0;I{-mKcP7xq&C5gr!jvKs0%D+p5}}706tf0i zOsj)C8U`XRoN?+%I?7!VS*l*Hj&4!3T2m;6<_4-Bge;sh4B2wxF84_iW2PnP?8R!+ zW@n#QiB?q&&A}Fz03iwzt0a_;D4`^x1SvVmS$M(A&wJ^bIRH2)SXeZ03ou+_4zY~z z36OvZ8JB1TAb}t{G+fU3&PcGh!NJ2Jz)>x%xVL}?Y_#!i=vJ5My4&0cPE&jsw{-c* zi{BT#zI^e?I2N2JAXo?Y^#NQ+kUR&iBs40EdZ!AXw?fFr~)vCSu#wLO;gwP z8(K$TE2l#_ZTpK1n{qfl9rmqOM+Bmdr3qA#mC2h3t zDpK^dP2jQ18yTpzLvt~26r*w`=kV5`VJJWZ0|hl@*IH`@$pLRXfqm?6&Wy0KL_h>4rt?Ee?fB>1oBQO{jO?q$;ivS1!5D=tb z!!Qm52xJ#wS+3U*7^q>U7#3A^QY{e8tYP?hp9~1-0x2;h5f&1_Gg!B(H*CgpiCVxK zK>#79WPQf2!<ho;pefBL2FdM<6D)6p=js!8WXw@rVK0~;v~e`SxA;oKf_X0 zJ`eN_(7^%2APig)mH}$ucRRABwEx|ce(|6p5S|wd3-Kb{9VCpy!J}aXsZK4TLL%w& zXU-1V!W`Fa~Woq2zJ>Vb_}Xs9#|9V26fHa_gh z=IVQ}a-7F^_4V!k+nG*DB#@(_n>i7IocTAu~3$k8@r1Njub1JM5xq^U(>=gu;=5Alfv~ zG=&e@l{3|91*iKY^6Bc~qP%-atLv0_+11P#%O_eUgS+X&`@`$Ke${+W3={~?$Nrc%Mw4ILAu4u(F`5YqK2^-LWC2k7DfXswn)Wu`jA zT)nnXJM-8PoiLb#Qji37${k<|O31vJ0T%q#nN7Mt($T>jVo@_0S-_cn1h5z`k(T$x zlJtZiSYn_{R=``z14w5->r$(uC9jBCX;uVH# zjS1(W8&bmMJuJK7q+;}8JtG1wN1^eH^Vw>9FFz&aD3bB1Zt9UZOyGDTEW@S z>^fPN5CSiOtDJ$F7GQy*lszIt>pD(It(>PtO==Mk&p6`Uth4&Vde{uBkFMayWWVzk zK8W0Tbit)ssq<*Wb#97>)j*wfsMer31cjNCdAKu^1_A~4!sem<>izd#f(P`p?uO1G zygJp)J%Ut+`*L{m_{GEcTJexNmFS2?6)nun6p#}lWDfRd&Dp?tsU{+k#~D*)PROBX z7O*t^fL%djbXV{S)Vw)~hoWjlH@8@vF+_+MOd!lc+@+kP6B57*q3HFDq9{-cSw#y# z0!fg2mh36>GWMZGLKA9U)k!cmdzb9e17+gm$dDGe%GtEB z4EO^K%?TV#-49h6f#YSG`;l-xE;Nhmrmny)Rr;#-v zrEdFtyS?mV?LKzU=9qKl8j7uk34`RkVSuN{+g=7mtJNm00Mo#5v$m-6JXNdFv3$e1>Cg+zctXf28wA)@=z%3WH$fEd96i0)0@9Sju!8PVOtyt*rdL1+Mm zMyn1O=oH{iPKziAFcEi@hc0*AB}@no6lTS>YO_*eYFKNiFJMFLvUM5yK~j>0Nq_=8 zpoCY`R%*-*>KtX(TBy}z@9h5Iyx9245S;@ypl{5DKtu}5Og>`?hmo+x1IKwPX{##$S55R%-x8?@$m34 zj)!@2oP`t}RRe)Cibq&e2OtpPU=IXH>>_ReEH@YZmi(|k9fD?J&Xgf_tDFttU^?m( zLy#ll;1aV>b8Bn7h52v03)BAyMJ4J$~mRT(X4=5w6HLR073^t zZEn8ge|Z`2g)OgjM_|No^ad7Y24HRoP|X<}JZgjiy0D|5X&?h?EPWR`*FlUy5<(Wq z!if?KA-J1G^JZ8Y)V{QL}kDs=Z{zNqki=R$)9wq>$KkGzR$T!k{zX@?EB}Ed)aaK`A0uq-+X*P zd~^J^%(nxlLoIV12@)|jnw=HYV?OBJNc7P6U26^XxExPSn3!t@S3^&XLJ7017zfV0 zA?$jz8G>|0>+UKa%OO8KBxK}ut2jkDPVe_m-|Uay)cPocQ~1(n)&@jDMvknij2^{2 znfh4tlwp)SrB#niF>JVO)D(PPCmS&}zsOx5>vgX?tB*kyD^Mj&tKz2HVR+bGjXSnfd5}^V3+MWMm;>(Pne=hK^y5QNm5=xQX=^ zb=O~KAh*KY09O^e+4P~FzBzpHSa16-5k@#RKy@+zG+45~0AS56kbs;3%zzfkxr5_k zjt=7Gj1LhWP7v;HjgSGz0TvKV4Sk`AHeV_`5f1K#6?))R zy3X>w4t{Ul{~-0RxF3*3X*%kYVMM`dobnB{ZrJSPYWMk-@9=j2Fn#mk>9<86HtT2A z&}vI4wX*_`^XU;QCK2Qr^~qzM^UAEWqU-f#m>|xS@$~jIziqY`an^*Csuo59LO|loUK$}XF%r4~P%~4- zMz!QJb?dE~tcMp`&|;oin{l*Jx3&>_dBNL*cD0{KAX}SeNW_H;qd7f&_4s#hpMHI{ zd-2Kc-(GJ%z98t*%PgNh;o;r!S2{zn(E%smRhHyyRYR)bjR=B(5ObDH{p#|nOWWiE z?L^H3Itv`9!`p{%j`olw`h-B}O_|WcND=55f#B$i z&2LHmHAIVWKnD&FBM79xGcOt-z(c|@f`}pjmvEng6B4-rIWGSp0m0}4uA%>=_0L-0 z&FQv8t<&Q?f2j4TK|}0VHZO0!Px0yfgNY5B6V~JX-m4Jc4-5o}cQyo*)zFY4k zbue0DL{g-aS*vZm?w%zaiW+lJmtZ`gz;<<6#xW78P1E?WuU~(=|N6!H(`Vb?>GDrD z_`^;A1Fs)vc$@vtX8)$Sl~z)O469IWVGv3j1Lw7DHt90MN_?HnLNhQ{^rz|m&BL3! 
z@)oeJ2V#PV>gVN+f<=oONKQ_%yza>nP{GfsdLmlz{EJGIToJ-000NtuErdeU7U~*_ z6NR@D4u+8+g3u6v5t*>#G;mt+D&atc0M)dpom$bkL8)F94FeE4LGG|0s7sJZ1Ox#A z>Jlqz&8tEyI2SL4N_A5N4+vxg!Vt2A#JDsLfLjPJ+bFY!=Wcp9ID$Jgs}bD50k|dr zLI=-A$WDB&{8waQ0tW^J#Ds1@2vo@k*ohf2K)E4q&KH{cR${p@Yd&B zjbnr|F%ozE_59&Wh^x}FU_U(SW7T5%P{{!TYF{v_J5#NNqXOe9k|fB3zd}k~mJGXA z(}vCR2*3gisRmI(LQ5WGmXO%O9;T;L)J z5?n5IbunWy^@5?`t;GU_CCaUNkUBU)T1+S>FAlRXOw+uUs~&UXKJl7)Kl!Q3;lt^8 zv3l+WiaJ2o^R5ylyFhv-!KoWHA5Y_lLwP7Z?#ufX{_3XxZ(ct8>2CNq$*UoM@_PTi zc^f+Q)@ePlpaKE~bHC~3nx3uGMGrj&kTSteW1MSwczk-;ztb33Yi2Egp?SO!dnYlK+jMe_g(7DyR$j!q(RB3>ML4i4dA8cQd5sx%jG&DG90 zAzB(g*rf-!&aaHMFu0ZgA6vh6o|SJu08;X1j^y`^sQkB*$f&nnjPwF0N^x>_vkr<*!or44ZIZ~rId2-s1kyZT)jC`?uS)R z!HB%-1`J0_B$xm=4L0iNhXu*1h(1pZ9!_!l_TlgE=HE7$P{`}z2yu=?0AQx%7J#%E zM~H}|H&qN^VkYddy14|+EJPG!t-rjO4P0xIMA7wrr0pia*V|kkXK&&Z0fJ!xc(Hk5 zj>FIn*w=WR+To~&;-}N}cqngfu6||!r}YBjZm%<2f36R4pFLl`46i$wr{g^?kUgolTPbB#)p7%?;e z!YU~^w#A1E!QN2{M^jZpXaPZhZ82OUqmcB8x}1dtNf5%rYRpB)(yW%4YcvB#PXg3O z?y>74$x@7AfbcMP)n=uI)|e~S8m6%b?T~^Ymax)4wODA$CArtL1lR)+I4B_k6J?aG zq^otl3A|npFNf{T7B6w5kEgpq=02Nu1+kST*Q<}+?D1(_ceVk1d5hVCM$RAHEeM6rQ)z4QhmoCyU& zLwc({L{7Ytl-#=`IKga`w}}LWiIQBzDUtJ<)>$(Kb1yk58LslcU59|Pl~JCK<(Qyf z1a56E1@7)1e|tE+E_IIt;mF7Uf!czCS= z0s}E&C(sEe;)E=O4yHb7JGNG9^)icB3UFk??74@2fRvDlD41}`-Mjgf|&*I2nF%UzO}A3v*{@@a^=UiHH~S0kGp1j!>4aKBn5-zdhheK5)y z`vFs%6j?Jv21b|F&<|-<)gb0FhZlw#}Q}cWF>|Lh*fZXww9l(!(OCh;+>FM82fSF7x&(4Nb^X?^vPIyzcF6u}**Lpc_i7-7>2281zPeg6FL<>mC5 zQ%#xP-5(^Z_XXLJ0~`0=EW*8uCnRS~kivn%0Xi zZjx9KAw0yOGDd8JK@LDbVaP%Qr@}>2A;R!x(3;MxE_Jnag{rQBz{DBLs6&c@C^5^1 z^mlW2Gwbf!p*L?GRl(hHt9C&UFa(4X0h0xivxAwx#+9w$ap38m%6>|3)BfAwqI5X$ zu?)vvZMZjWKc!M4jknXfyFUo#+Ln%M#FhBPank0R)MD-&27Tgpy+uSK?L%C~7(ph- zn8tmx*?aF~rlxK{`_yvE>$<;ST&brdd#0LnJ%0F`FUz7W1T>HQCfN~r|8knEt$!l@hXD#hj!cfGt1HGZ?yNiE2uO*r?k-)s0ZY>{J>Tlr zwyX*Zcjj!uRGUYFB;!84KfeEm&yTjc3as<|Jmeuj8^`|e&A4(p+#lY)ElXMa*MI!! 
zhu(Bb2ZKQ<7;dVJuZ0T(XJ+oLp`s8`2xZ(!*`=}28^QsR-QA2mVYhyIIX}-YYhT=& zP)d|CMyBz0ZN1*m!@FamB(hn$ICXSF!rHnjAck$nL?dQSf&`ccV0KW9rXI*dB1{3i zdFr%Th1VdF8@i(e5i&)gRC3O_FcC+1xLUQg_SKe(wS~F^5lWz#Mt(Ey zD2xxfwC~j}^;DN&L0K^YSR`LT-*xoBIiF zWwgXYj!b01lrW`EW8QsydzWLxbX@d$zMPwLiWt)bL%yjy&MES3^=bbREv9iW(95ab zZfEPOd&LnV$F)8`ygZziuaD<1*X5!63wR~vade0EW|1Tk4Qln;uSnRIQv{-qg^r2d zNeoUIIgdG|cmN!3x5wrSBr|W_xwZ>7>+w94Z)N&TDQ~}f^Ffz(iLW@=o_3Gt^;+v0 zp0dyDN$=(TyyDxt?@F6j>!;Q$v>;&+2gTfbGjCTwi*VO)auG=&S*9@^$rBnY2S#*O zXL~r^e!Bf|hOdjADEV}stgf@I-K{AAg0^r$;soK~t`wn!76_`Zbf*9hM38|9JYuVO zm~}Gk0E~nfw4oEs4V~O5XyfcVVT2$NCcz=4Tv&J%EQF+PYPYI$?{(?0Hc*3rh~#OE zQlKP|B*aXJ0O;Z7-qp2f?a|iFC;)+oTc$s~0v0$Uf&nrTQ*e>D?{4Xsj`z*FBEIwOYnJa4}ez6v8@lEe)^MFeW~-+PC;wXZMhlHfR-W( z(g;DT&JpXPH&6R(TL-!uaX*&(W4=R!G#sqPb13$Dcl_;Pv`JKf%Amu=dQb) zsw+fGV-Yy)=9W0*u>USaWT@=iBBa1h23SaTQ=#8|JA5}NoZB_nFFu3AUHPz+H`hA% zdad3O%^4p1bL~FtMj|xpL@7@tft~Bi%W_*T&vkje)u&n?i7pr&Jcgkl+3E6xo^s;E zrE9PK5&#fNjEKBMzS>a631gJ;ozb0F+xfd;I;?(e?bf-e&LQ5~Wsaw84^w+Q-ro=W zaTwpvGg#Cq-+%Ldf4yC=%g?9l!^7oIfAR0f@i+TOV+_K?MNg{%-J5ieg%ON9hDQWr z1Soh2@sQ;>9!GwQ;X&K|RY0Ou*V>f*FvMQ+OWO5`IykT3x4DP0BlhIPi3G4mWG)#) z!!*>CSdbae0Ugn|C?lE#p>_b!&EE2=XWW`0MBxrWphyU2!4$-iD261NDRW6&0Lh_w zRI^#zx~|a|vQFU$qLOLImNT-TFv>Q=j2`Z;q224Y0w|c15riNoPf3v65hOsvgAg$T z-|r5;8|e4b{l9&|fAe9u`~BZ9KYjj}YyS*8AKp&$%L~Rbw_2@MuhDMg3(?G}1VCR- z%}?E1^XSkLP*V#J@J3M(mE0PxeFJ_bGS)NB9QV6a=J`c+_83ybnhr?q06GGl`g4uA zhqniK%XGi2rzv)WaSubLAtxaj2Gspfy!R9wkUYrQJ8=aO7qe-)J3P53ujl2o=(Tms zSc<+q@^Q!Sv|L(l+;Dd&px{GX)^_ge2$Dw3liZdCz=N)Hd%UzS%^o^_HJjC(f!z&g zDHz?^TB{8s0T2Sabq6FyAyk8<`mPqDBpa(3XB?24Z1QQw&7J9FfAemC3-}G=wE#W{SG5Eg4cPkU?Qrh#@z>e}DHo)EBKk z%*#(~ag=vc`RxL*-@$%5pw2hf9dhU(1>Su)yo)t2wp?unf-)4MkzS0`MBz53JX1PMOAo!%DRFSeL9 z;gKbAWKC-QOhvxdFL{83k>Gl%%iKBNeV2!iMc@6y`|mDa|8VkE5gcmMRlWLF3I;aq zh{`}50;#7ymO=v^GruWu2bP&wLbTIb^=6}%f7l=Y!yrHE&$IDLfa{_T#U0wJ5k@Ep z44f!4Nf1JZu%$C3DrVrJY#5Rn2cg6;ig)abs|SW_I7ARL5Kv&4fEzi6(^hjQ0cOk0 zS-6xjr6QOa#k_h~tzH$^2DO111_p^swvs7lOhW0eTxSmt4Q)_6)gIc>ZNoZ{Bum1a zX{)XqhZ|(Rzsvvi5dVJM6{TM){Cu?+hTH86_6}TjJqwlgVt4R0$B7LI`82#mtOfQhgWI0+Inf`kGB7Aa1;jz>ySzEnoCJn>Wix9bGx19=S81L)>d!o6)^%G z2pz!{5s(DTNdW>8oa?HoFjDkD1xi9F;^BEJVM<85F%^g*;Y-WQ|0{POtn)1@t=h>AM~WsJrx zoIDcsXjqaDsSJ|Eg~X9OQI5##s!MMTZngc>#U8Hpe|wt$XGk{(!Njg^Kxr@NVA`u0 zC(Ahl`j(ydzzul`P63T9oH30V3<0&NLkdE{ZB)D=G~LZZef9o&Rt-SekkCY1Z!1$_ zEC^!W)oZhP)pc2-R#f$l!GgKOSRiLeA_)nBA<)CktT$^K)wZ2abOb=h1e5@>n(#&z zh)~3oi*q`6{papWYo9%RvHHwxjHErWrCS@}+QVmzwn2XhtEz%C#VhLv5WoN-^mP`T z$Ri9z#*|BhfC+^MyAVep=FCtMXV3Or7dx!gPBx2ZDij58Jx+Nw_9@(zy=UaOzsc5Uf#Wp^@2BrkSz>KQ~ zquPSV^}42Ae0$__o`9yE`0g5x`VKit;;DS%{pF+1amMmrz^?IWSPy0aS+fDngw!W0>ybrjel`z&A`gaIMU z6oJt<{5c2%I1uw+(_|cG7(gi7CkM%yJPT#Pl5&}!3gdVfT2jtyrDWbZ7u(m59Fp9C7Ij2mqA!dZVAYmt4qf0=yjx<;|1azvm zTjMO@jNpiQEPIU+!{*+RoU1iHs~mP%?)m;Oe(a<-y-q&f+VxSNoxQ8tghcXkyOdVf zvGnDktj`F7u!R!Y+=$aa=oq2Zv)j6W+hIrt8F#}VxF--M1c$k`TeA}`r{zz#`n|`R zFj;6|auW*-S3+3Y+T0kB7;_e44~t+2ldvFUOn?quyF3NgdU#YDCdpie{r$T)AJc^v z*G8)AI^K`DOc3Sm;a_KARz9XXS=V&f4f}8Y)_+qShG_EbZ{OBcu#2bEDG1jcmB9}x zcaOJ!p6fD18>gfaIgf|*Hf*@9D>mAvdr7`>*9ccEDVLO*_m1=Pa;oa5<@xjS>+}40 zsxOhA*V`BMTNd?5uA37i$efu39oYebf_Ev6o;7IhbJb>nK*(tV-Fi{fT_O6lod#A2 z>ekHdSWGB-U$519Fe?I?Re8`XC<3{fA^+&PkcD+?x*o> zofi+)L*vzutPhf3x3XN?KOa0}g2_q&EaN4M`j- z$rASKUD*?j#9eTX2p;J$9;x!R-=6y8vh=(A!5mdx^`>?*eXjlN=tvO|$}TB`5&(2{ zB?B|UW-21WM1&4p1}O^OeD&bOTy05_hcbafZ*8s3-BQW!F}K+?kclQaEnnC#ROX;} zE9?jk$HTwbQE7|)!%%;{{qR(OI3DtD;^q3-^ra-<@3@*Ts}Vu>nJLwNv$`m+2r&j6 zcx0Rq8QF-qdT``@`KHBh-;QZM{qHIzm3!=*5j!?RbzQ(nM1Z%!3M8UtO~Syjs@H1j zFchHxbw`Fo1rb}=vY0N2NE8XsiHQOtjESrRl1GrgVi6-FCGngk3o#|490(q!zOH&* 
z`?_k>9?j4Jh@&J=#Z%gNSb~7&5#iRosjdxMMQz?A5D3^o026KkcL#((w=En8$Oy)S z44dUWaATW57`PFMBz7}K%tB0zK<=&%6oAYGM9hd_?h2hnkdVX?yE6cq8)A$-T1(A3 zfDI@bo`YHhAs~ARyoX`ZY_+cUyEn(lo*$pymk;JDFs=^ma`lc=xjUregk>DzILMe$ zmyV@KF890$r&jzKJD2Cl(hvf~c!c+9SqNomexx+`w1A;3)3DDp0v3_6#NL3?PRG1g zoh#jzwm{QX-SayNVK$7#Zf8KZS~*D|0XTyIQGjOvH`s(#=&SW4!vIVGm}nY;njbq= zB4HSs_txvc1!)*|rHHJvs;`J#t=GB~O+!YnFE7{6Iq%tqF~5W1-}A7acDc0S@4oqO z$Llw2Ushe88{WM2_7x`2l!SL{?H$)TcVGzCltO~<%5EojOe1qKQt0N9JnkRco_1dR z5`2cZMX>t-Y^F_HGX+Tu5$YajnNVax*Vf!MI=BO5PDHG&o0~Au5M8zR)f%W1LNYS2 z?urPM3S|~k>I$$SGSP)05oD1erNo(tkre~FclE3GrLE9b?2RG-*;$Yi6T z0yzL$(1r|i+nA>afee6F>w?l4GgEP2muTjcXedJt1q*X^7B&FjyS@DWkp3287|Q<0 z!>ulvGLQ^-mn7HaYEeq!L6UVP-IWB2dc7?fv*av~*Gr!r%?CV82n1{px}2JJ)jDNK zG6j&xFi?>k1LE$GbKeI=bt*_?r68GN7d!6uR!nQVtc$G=0M900=J*A?1tKxorZzEw z*XvAnP}x|%8k?uk*Gj~IK6mAmB&XtKwizIgJn;_d#UM54$|OQKF?(wk9T8Nu;>}T& zm(_o**K{0q9Uh;s=$hWX`E4Hl&71xGeg6H;RPCR1bzA1@H&Rnn$B0;&y@vv#fFFk8 z&Gez#?GVL>8?2BzKD zUgx>lQn7YWa6l5Kl4&S73T9w(0**j+3sr9|R)xM@5kLT8aDX^61cd_u05}C0Y%mA- zE7yU|0T>(u0)h~?>xu|NAd#&F5MW^jNX!hT5uivIP65bigWxJ7AfN~VI=LfvKqOBk z?Kqe)I)qz82?qre=`tSjuvWt5@phwWSDji{ym^{zZ8tDKBuYslR3uSItbU$XriMYg zaVHN?kG)zhLIq=VvzKZ;h-L6)S)mL$@xWui(oor^uyyep?>*B2oUe1`n~QXIiWgR-_)4Eg5XffNK9s`t*aYmNHoAk^a5lIi%?%UymOyq zI^Qm#Yr(@1!-0b-pUhf_;=_kPFUCb4V$?4WI#Lsk(+MD3OaJHng0d^eRlrWAqEB&F0TU+V38{>$_H zf3KHMU9Y&$0D0r-t6Hy+Bq4`upxcmBAT{lEu?XlL!Xf7@66&Tk%EV~I}?sF3hdoY!^0ypt+rZAqwJvUzqanV z|Ni6dD7fc=7Pgsg+;4u7JOQw#P|#6u+<{RCN&}V0>lNU5vy&h>kJ$UQnHx9huHFOG z49Dr+qVm!o2O1|DG>G(^xl`oRBTofB9KZSaQC`j$ud<)sH1YFl*JVD>FCp{G`SZ*A zU!;Qe0X~J@Xc?>-P_ZA4fJ;9 z-EkUnIp01jsd-0UQ-TlSVfH_Rf@) z5Y0kclLV5GS)fzA4t9@f2Y&TI}63L)g! ztS+#DnYUgJ!>f;LvvvPf?6rmShzY{Nw*D$3FpKrRxdA8wBO)l!0}xpd0f2pbpjh~? zm=ppKM+#55WK%X-!+UMO5`qktfW~~dcON^Z;p1*hF!&yU8de;zc6V2w0SPX)BAU!} z>YDFbiZtJr`Q>S;Kes>sDuFf3TDx>0j)t8m49F4;?2)XhwM7>%$6oL|FYWm=j9R5MT%Mo7o?6oLGb41U?;*vj+BP{ zTB;-Mt=p>FkR!swQyOvsvK|yXr2T6BVScIos@8VHhd1d%+8ujs?x!K&&g~Xvg4#TN z!P|rb8{Nsbb6-@{pqkFt+A^Mh`urEQ9Zg*0a>kc_opwx#$&Ju31&uOJdCVzkpqPVU z1TTiQ&DZ%a=j$h9wGf7E3c5+#!T^IJnIj-k<^XHmy!RoGkaDOeCO{7`uj*vrK+Ci!p(rZJ(TVi=nA~i4-+s%1_rCOa^92&M>!u+S(*Yl%tUzgi8`_=f|=O3l%CZ!0h*yE+WPfV$Rpu4`p&xK#aoT7Oi%00MLB zHHa}$8F&J8bzRzu(FBD!Ap*s~BIFrNQchILprK%WX~7OEH?@Mpf|@9JC-3v2_Z8F~ zFHXad-@V)argylWe?3f}eqDd;sW0topN8j3g7uBuBan9>uaw%_yoR7dch1*vV#=!C zR|>7&(MB#A^RDdEpp0w46wZ|hTYbEIepvqHW&ZxAD|3;QkubuTISkD^08$bT?7ozg zk&_TbboZ$J21Z#D1h}Jtg+orvlC}m+0QKhDn|2feW+o;KQ;ps`1iAn@GJ^;YDGGC@ zBvOdUaqCj(+FPr(tWXWr(Gi(3XC5;Y#srC2Hej;3_wH3qJE(&BCglc_U#{|F7>Gd#f*2IS#PVt)MPPSxaI@$w0D*-R3pC5nn+QvHcFJUX+J{_k;)TMXEIviv`&TFI2=~xbW)!TA)E|h>70i1v_Ths#1 zORGRoZ#JKv?d|^N!;o-}Kybj;>PuUHuJu>Micq?#8AxQ#Bu*NjV5k%UjVpT%pwP;| zyRu`L$P;7ER03h<+Hpky#M)eusI?kMIi-n*xzEeXMbo@LG>9V8ckkrCfBXKw+kNw& z{^iGiOm-soF0d}`+LwH{WAF;5-7IjFgu!mD#UhYUO?xvzg2oUKupjS_LrFBCl?-FW zeQSL_Kb_Z4m-Tz|OE548LI;CZR}@Gr*#|Wb3`Y|p0Y}0_s3vY^9Vonq5rUzHLQW|W zgN1sJ4kY2G)?Q7Ch=AeY2;l$(LA(uK7@57Wq{NaE6A=&wb_+Ap8q11n4b=cdDVO2Sn?d2N4wa2oAy!iU{F^i4q|;XbQkA@hbdf4hAS-V-N=gu>b{=6BUY5 z(x_FMS7Sx+s4pu&|MK&vFsLESV+XvxJgl{+_7aS-yxdMd;oF_SL%RN%v>!;?kMZfb zh})P*29d5ROu(t*emIU&Yrw*6Mp{DFAi(k6`)>*{6qd-a%&ooLt`E0)F(E}3jMh%) z>tE*h(qa)LP*>b84Q;KpZ#7jec|d?3&V&(xW&xz3>Z5tMbx>Aa!H{y!feui>OnK%kLn)R)jyQ+C; zDNG%@s=McbnF%~<*V@d?Fpwz$K}1kU%E&@y)@nBg27n&0F+ULzz`L*R1Q_OKzA3MS zFf)#s^1zAH>qaEPy;W5+)>ObaK^>gyCEI zl_Shd1Cbzk07)W5Mj;YY_OL{$h%hEVA~^OiGwTLrH`>g>sV+65FV|WLAb@fLAG=le z2N-6|*C>tq>Em=><~=;M^dK*Pq5ty_{^?Wx{zb*Y%rm%T6?9f@R_b$FKTdb-hb+62 zOHon^$DAP#6dqBCIrA-Uy50ikOJBS>cs(LyYB?)^~s|I?y^tL2UKx7&shm_DD1olH3K?xk+4evP98gXt5W8U0}^ 
z{O#}l3%A{y^g+^p?CG;FKZREm)gS^w0|+1|L^PPu0x^&^Vf59&00?rE<8CS`F{S9) zV(JZ-e$w`<;jh>AQ(@TccWyaCTWhW9xXU9IUBSW}4KdO{ELN)`{*^x7Lzsaa0<1?8 z7Gwkmha`yu-PAR57D91@2r%DvNf8d<1c)Rc@YsAk=1K`^yW5L_g17XCD>yksw?K*nXcEPH zA~Sa%hl#k{9e7!madgOvz7hqCr|3jj{2KL>)E>?3c_mG^&+zb6e*DMl=Rf=By9XKy@R+TURyW0gxMULIbzMkkkWAk z4wAX9ty<${C&%C3e|xy~ho?o$^PC<`0ooVydi%ql6h4+WcdHqm%iZ{2j%D`rX*r#v zcP`$0S1@4q=$V9RgA*G#1um16yS%?E19f3ZmUn%n)A{Lf{r)!pw8T@C44kZ!TXWKq z@;D_iR*2P`_sAn4q=@e3OyFRB1wcuR4(5u)EaHG}5#BxAh!UX_5hTO_gdo_=D1ost z?geSb#WX zuoBLaP^0#?cGIW;fCEbe669&xQ=|;t(t>P+u4+v!B_Q^--sbDVl*hw9=DyDBzuf-x z?YrOq-M7Ds`1mvZc&$$@)~mH^|GX>e^VjFzyHPp(?M{ZajK}=uqLsn|rK^Q&kSlm% z#(=tNPGLr48r~j1WFANhv(4L0KGmPk%VX!Yl!Pv=qDK>o9*!NGYxTO;5CkcR zWCS3ixUri>qWKg7iWslBdkTmRlo`DNl5XfHB8b4=Ub~JE2+psHMFvJ2ScVe0WMXCz zVhnKa-I}jGmeuOQ-Z3DUA*UF#lq4mONm8(G( z>DBwAJ#y^15oYcY=qxea?IuUExOA#PR~es*w_PYi<9VINclBaVU)uGw-sZ}YL2mQ99;c6Qrw=1f0{G^e z!>9vcNh5Gzoo&h^9n#bDMOW?fvfM70cB^rvaOeTTU?3-(vc!&uaY#u{m-**UKfirT?~d=@m5+JJ z*SNMc-{7|PGpxEB2Fkgff3D-7^kChp>@$CGl~`2P7EEo%mZuvbLyG$@_7X+VuSLa%;=YZcBZ< z-G0?{VshwNaF>ALbvg_Y-YRalj_$FpRwD)x3jkXkJJeVeLKlQdDCfidM0;&@Cll$d zs~TCoosn)O?ZXE(dYg9N+#l;*JbeE3VYy=P-+lA<(`{{awfy*`KXqEzAgeC^c+m%| z%l+YwsLb_TEdp97P*@5PBGaBw2Ig0f-6B zB8Y+oQ;L#O;zXE{0%N=E(cW8KY@J0{@=ziafig%gGL&iHL*|jTmI2g7Z);mx3pMi1 zaUh7v(;!l)3?-*=8_hc^B2ti(aJVB92D1{PY(jPbG!H^|a*qH|bytUABHSF;#10+; z1|SFlPJtFm%FaZAEGQJ&mCE?W$vdQ7#^e>D-_iL{f<045xYZ#Jw@>iLKgAitXyULWbU=BbrBJY2-kx5e?7);MKQ{=wL|T8>|Q*P&q(Gs%bpnN`-ij1Irtr z!}#XShw07Khwe%g0JbtfVPu*?G_33TIKQm+{Oje9XZ>F3t7^5*9D*ezAqCdHc8E52 z4;)KjN^R*>@^IifD_I2r>;j$$v@1yR9DQv< z?uU1W-=w3w^y|`Z^W)R*}whlJ%cLM-V0G6RKj7DGi| z8DoNR@53AZ%}@$LDqJkM0%{UGFwIzUvcydQW!;f}F5_SNQ9Pd7cU+PlAcSBQ`j=8{S&!{A&^A5FpzIsxGfzDCIJ8%YUT!x$gd(_1?aR*Qa2j_;Wp(Yu_18)SfB-u z={OvBw!}0FLHGHo&`dPFy}|WMy`1p*b3FbMfBf^)*Ps2%1^QxORnUwAKv9-$+hqZa zt2q(?ctY@SbOi9N=7~VC(WV0okpdWjK2D`9?0^zqHS5 zUmw3bKAxVr`FC&r)zj_k(iRjr>;)C(xLx{Jf*~hQmJ%eT2wzda9EapbKOCpKocEN# z(ioBX275j_$#kcgA5XW-D3M3fj`LF2n*swNSVXUe43tpS5NNH72osZAm^&y4u`q;^ zfpxVa-j>K2C}D4A%_$IYb2x9;q=-Qgh=52z5+YV|Dj70oBqEG3S5q~s&Fg|~!S3M* zNg@{>3QfZ_mfawS%sCvbh4ornU3=^1VHBImBDrMD0}WHoIWZOhbl-;kw6$bBkPs!I zWFU6p&BwL15ye+H&2ngFr z8)EZ}y-H|93QrV71TbLuG?kHyjN!?ua}TUHuc&yg6iZ*#5K7{mIiWp2U;gm*`45YK zwWTr-Niw7N)=6YP?SSKU0~svb3`3p9krN_#21!=ID~SX|bp%69Fi6{(%-t4U+VSo~ zhQmOU97bFp&fZsP7&Vpm-%RgfMQv9){PwrwfAjl~&;Rn{FSq9>FGMi9z5UIH!)R)L_ukE;H4{$@_hdK^XX43{&bmt9d>|S zg^Do3P!bQd!CG5u4LNPuJ_s{6GxrX>89fyZ5X5Q&*XDpC7!cExa$W^T4AZW8ai%97Mcdxm))4g%)3Mr1XO&l$P}-a5t~~XkY0V;8(SD-lZYX}mf7BZR`&1jn#yR$9q%OV z3zQ*B9#D!9`gN_9R$^CO&-GEgLyaXa=JVs_r-${Qn?Gw?1t}!~^x9o=VwZs6o*0cV z*H{8tgyVdY@&*A0JTkA{IZW9)uG-e`om-WyYJrrSO4z#yMOyPZzBCLh$obvpP!ef^*9a#Vl%Hghm+V8;0Cp?V9-OH z1G)R)xQB5sRWpi#R^V&MNcu;p;TUU=Vmd#|x35koT z#q6OP7~4)=wsZ_TAS3}G%VQa2aPRJFW{9%M?g(+)Q@BTX02mP=hlrFR(rZyD5Yz(9 zdsk}});Ve;cOY@ekR}|aam>d-j$@h-5^AV=@5@~0wRbagryvXlDI_~C({U`5a3%s* z-^g^Js#|skkcE(u&=7H(*?^O&hjA#nZ$+CxW`Y2cKoz8%QpyltE|gc#n1J%WfdJivnk1IfXOaa$e<2=P_O2S5Sg zNVNIFF#h_{M*s*wgzaW;`}9KqZhd_D^2^1B^4;AB+Lf32%5eIY4njvxBlOtGaHJ1N z9I|G(6Mwn3$AwO(+h4xkzMt{rWQkISDN8cyEW(5kd|eIPgD4Gy&DwedL516GIb|p% z@7NFB4AfO}YtDtxarMiT0BsDyNMYL2FqQoFaJXAL`1-g5LFCI>Ki?iP_}u<9pH}+r zUt^_j{^sxB9*@8N`OkS;+S*QbJ+}qC`k*NlUWb6*Ya$FlL!yAC;CMIe-sEq_amU1- zs5kE5r}_H*%OC6dsl^wBMi$nfgL!l9hA!(G>gW(TkwO(Az(8<9946Av-T?rCfn>`m zsO%DvfjdD*Br|U{v||!RgK%?X0Dz7W28aZdgi4B%L4=YBUu6cXBLpM<@}{d|mo-U9Y!RtNK-@bjEvwx)y-9_TGuWA z$xMl83%UWu_Nr}3Z^j@ZgY9e89E1^(kdVcQ2oadWIEa&ER1udPb>bV}o|luYOaA%C z@sIy6|KW$GR|O<-U|awID8h+w-Eb@fuixv0h`!w*Y^iymKm^)$IrO^c*>*bHbASLr zub#pQHE8 
z1M~(?%)4c+tJy$z-031;W~Q8X-^$@Q?$$c1^Y1_Y>&M5-=a;_}Za@6;$K&u$ezI@z z{eSrIAK&E9Umt&bseisWnDnJRTBeluu9n1zx{-rjQCI!Sor~rqx!^PnJIMuh2@rtX7U+i5+`D@x2SY~y2B(dc z8yw-@B0OFLO@;xAOc9Ek(GiIfBuN3@O)WYba~UKsMUTsQY%fv$^di6h^yZ)c$-n>9 z^nw_a$jQSzHbvAH!bW%n!N4|`A`HN7OS65yV?fYfzu^M{UgnNW-5SBR zI79#=2BPGY(NHNEw6IKv_ZcG^)M|N!_|uAY6tlBOnHv`8wvYD{-(mK#1c=DxOo@-KX#a#nN8PED%w=>XOprG~{wO z@a}oOUY6^W@?m%1kSO`xZurgJH`DNqNnT@a)jC&}oB8=B_F~shkH5%Ru>d*=P*-hb zrcA9Z$mF_$G3d{$eeE^wDQ9g@Es9MBNL|f0$%SJeLV#IkVRt8DF2aOCfI@v6mjOZ$5|?IN zd%RrnqF1G_&&NOgS$_PZ&ClI>AXo|@Ro%!4Mo60)65JWl90L)_IUoD9+mAU7n5H+!_XMeB)MVNtrgFG@cm3tz`K7;oJKTSK z2Qtofdbm8*wf*||G^J1b!?zXs-TODs>q;sRoMWi%+Q>kRgFr)=h8XvJw;w+w+5wgh z7>AZB)Wj?Q0t1|+Ck~XZ&DVm@IgaI+6ZCE2f2uzNq0jitl3Iv@5H%Kia03vNl zY<%@zxd#FU5(z}0BVa@j21XDd0+UO^fj|Tq5qX0rx`9>mMdMnb8#;g_%mt?`yI~k) zKbBp_B1G!eahdJ;d3m1OyqG&73Q!^}kP4-eL<;5v9zbrsx~(2-v*zZG0E9?Hw0VI6 z2pqtT-Cl`6ju3)DKt#dpOq3Zglqj|)0b;^b0_CFXmv#9S&d<{iU*U&ey*@Q)AOSgi zyAX^>9z+nhoo$0Kg2Drs0{OLj{Hn~ua0iSnH5l=_SOmOgFJo1N literal 0 HcmV?d00001 diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 1bfaa0258155..e39bdc0429c1 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -166,22 +166,28 @@ def set_use_memory_efficient_attention_xformers( self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None ): is_lora = hasattr(self, "processor") and isinstance( - self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor) + self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor, LoRAAttnAddedKVProcessor) ) is_custom_diffusion = hasattr(self, "processor") and isinstance( self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor) ) + is_added_kv_processor = hasattr(self, "processor") and isinstance( + self.processor, + ( + AttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + SlicedAttnAddedKVProcessor, + XFormersAttnAddedKVProcessor, + LoRAAttnAddedKVProcessor, + ), + ) if use_memory_efficient_attention_xformers: - if self.added_kv_proj_dim is not None: - # TODO(Anton, Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP - # which uses this type of cross attention ONLY because the attention mask of format - # [0, ..., -10.000, ..., 0, ...,] is not supported + if is_added_kv_processor and (is_lora or is_custom_diffusion): raise NotImplementedError( - "Memory efficient attention with `xformers` is currently not supported when" - " `self.added_kv_proj_dim` is defined." + f"Memory efficient attention is currently not supported for LoRA or custom diffuson for attention processor type {self.processor}" ) - elif not is_xformers_available(): + if not is_xformers_available(): raise ModuleNotFoundError( ( "Refer to https://github.com/facebookresearch/xformers for more information on how to install" @@ -233,6 +239,15 @@ def set_use_memory_efficient_attention_xformers( processor.load_state_dict(self.processor.state_dict()) if hasattr(self.processor, "to_k_custom_diffusion"): processor.to(self.processor.to_k_custom_diffusion.weight.device) + elif is_added_kv_processor: + # TODO(Patrick, Suraj, William) - currently xformers doesn't work for UnCLIP + # which uses this type of cross attention ONLY because the attention mask of format + # [0, ..., -10.000, ..., 0, ...,] is not supported + # throw warning + logger.info( + "Memory efficient attention with `xformers` might currently not work correctly if an attention mask is required for the attention operation." 
+                )
+                processor = XFormersAttnAddedKVProcessor(attention_op=attention_op)
             else:
                 processor = XFormersAttnProcessor(attention_op=attention_op)
         else:
@@ -889,6 +904,71 @@ def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, a
         return hidden_states
 
 
+class XFormersAttnAddedKVProcessor:
+    r"""
+    Processor for implementing memory efficient attention using xFormers.
+
+    Args:
+        attention_op (`Callable`, *optional*, defaults to `None`):
+            The base
+            [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to
+            use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best
+            operator.
+    """
+
+    def __init__(self, attention_op: Optional[Callable] = None):
+        self.attention_op = attention_op
+
+    def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None):
+        residual = hidden_states
+        hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2)
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+        elif attn.norm_cross:
+            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
+
+        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        query = attn.to_q(hidden_states)
+        query = attn.head_to_batch_dim(query)
+
+        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
+        encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
+        encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj)
+        encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj)
+
+        if not attn.only_cross_attention:
+            key = attn.to_k(hidden_states)
+            value = attn.to_v(hidden_states)
+            key = attn.head_to_batch_dim(key)
+            value = attn.head_to_batch_dim(value)
+            key = torch.cat([encoder_hidden_states_key_proj, key], dim=1)
+            value = torch.cat([encoder_hidden_states_value_proj, value], dim=1)
+        else:
+            key = encoder_hidden_states_key_proj
+            value = encoder_hidden_states_value_proj
+
+        hidden_states = xformers.ops.memory_efficient_attention(
+            query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale
+        )
+        hidden_states = hidden_states.to(query.dtype)
+        hidden_states = attn.batch_to_head_dim(hidden_states)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape)
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
 class XFormersAttnProcessor:
     r"""
     Processor for implementing memory efficient attention using xFormers.
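
An illustration of what the two attention_processor.py hunks above change in practice: before this patch, enabling xFormers on a model whose attention blocks use added key/value projections (the UnCLIP/DeepFloyd-IF style of cross attention) raised NotImplementedError; with the patch those blocks are routed to the new XFormersAttnAddedKVProcessor, only the LoRA and custom-diffusion combinations keep raising, and an informational message warns that attention masks may not be handled correctly. The following minimal sketch is not part of the patch; it assumes a CUDA machine with `xformers` installed and uses the DeepFloyd IF checkpoint purely as an example of a pipeline built on added-KV attention:

    # Illustrative sketch only (not part of this patch). Assumptions: CUDA GPU,
    # `xformers` installed, and access to the example "DeepFloyd/IF-I-XL-v1.0" weights.
    import torch

    from diffusers import IFPipeline

    pipe = IFPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", torch_dtype=torch.float16)
    pipe.to("cuda")

    # Before this patch, this call raised NotImplementedError for added-KV attention
    # blocks; afterwards they are switched to XFormersAttnAddedKVProcessor instead.
    pipe.enable_xformers_memory_efficient_attention()

    image = pipe("a watercolor painting of a frog", num_inference_steps=20).images[0]

The test_xformers_attention_forwardGenerator_pass tests added to the deepfloyd_if test files further below exercise the same code path on dummy components.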
@@ -1428,6 +1508,7 @@ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None,
     AttnAddedKVProcessor,
     SlicedAttnAddedKVProcessor,
     AttnAddedKVProcessor2_0,
+    XFormersAttnAddedKVProcessor,
     LoRAAttnProcessor,
     LoRAXFormersAttnProcessor,
     LoRAAttnAddedKVProcessor,
diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 484f9323c69f..106346070d94 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -261,6 +261,7 @@ def __init__(
 
         if encoder_hid_dim_type is None and encoder_hid_dim is not None:
             encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
             logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
 
         if encoder_hid_dim is None and encoder_hid_dim_type is not None:
diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
index af647fe810aa..a0dbdaa75230 100644
--- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
+++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py
@@ -364,6 +364,7 @@ def __init__(
 
         if encoder_hid_dim_type is None and encoder_hid_dim is not None:
             encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
             logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
 
         if encoder_hid_dim is None and encoder_hid_dim_type is not None:
diff --git a/tests/pipelines/deepfloyd_if/test_if.py b/tests/pipelines/deepfloyd_if/test_if.py
index f4cb52d25a8d..2e7383067eec 100644
--- a/tests/pipelines/deepfloyd_if/test_if.py
+++ b/tests/pipelines/deepfloyd_if/test_if.py
@@ -28,6 +28,7 @@
     IFSuperResolutionPipeline,
 )
 from diffusers.models.attention_processor import AttnAddedKVProcessor
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import floats_tensor, load_numpy, require_torch_gpu, skip_mps, slow, torch_device
 
 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -42,8 +43,6 @@ class IFPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, unittest.T
     batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
 
-    test_xformers_attention = False
-
     def get_dummy_components(self):
         return self._get_dummy_components()
 
@@ -81,6 +80,13 @@ def test_inference_batch_single_identical(self):
             expected_max_diff=1e-2,
         )
 
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
 
 @slow
 @require_torch_gpu
diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img.py b/tests/pipelines/deepfloyd_if/test_if_img2img.py
index c85063af9e30..ec4598906a6f 100644
--- a/tests/pipelines/deepfloyd_if/test_if_img2img.py
+++ b/tests/pipelines/deepfloyd_if/test_if_img2img.py
@@ -20,6 +20,7 @@
 
 from diffusers import IFImg2ImgPipeline
 from diffusers.utils import floats_tensor
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import skip_mps, torch_device
 
 from ..pipeline_params import (
@@ -37,8 +38,6 @@ class IFImg2ImgPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin, uni
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
 
-    test_xformers_attention = False
-
     def get_dummy_components(self):
         return self._get_dummy_components()
 
@@ -63,6 +62,13 @@ def get_dummy_inputs(self, device, seed=0):
     def test_save_load_optional_components(self):
         self._test_save_load_optional_components()
 
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
     @unittest.skipIf(torch_device != "cuda", reason="float16 requires CUDA")
     def test_save_load_float16(self):
         # Due to non-determinism in save load of the hf-internal-testing/tiny-random-t5 text encoder
diff --git a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
index e7c8d58a3e0c..500557108aed 100644
--- a/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
+++ b/tests/pipelines/deepfloyd_if/test_if_img2img_superresolution.py
@@ -20,6 +20,7 @@
 
 from diffusers import IFImg2ImgSuperResolutionPipeline
 from diffusers.utils import floats_tensor
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import skip_mps, torch_device
 
 from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
@@ -34,8 +35,6 @@ class IFImg2ImgSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineT
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"original_image"})
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
 
-    test_xformers_attention = False
-
     def get_dummy_components(self):
         return self._get_superresolution_dummy_components()
 
@@ -59,6 +58,13 @@ def get_dummy_inputs(self, device, seed=0):
 
         return inputs
 
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
     def test_save_load_optional_components(self):
         self._test_save_load_optional_components()
 
diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting.py b/tests/pipelines/deepfloyd_if/test_if_inpainting.py
index 6837ad36baf5..1317fcb64e81 100644
--- a/tests/pipelines/deepfloyd_if/test_if_inpainting.py
+++ b/tests/pipelines/deepfloyd_if/test_if_inpainting.py
@@ -20,6 +20,7 @@
 
 from diffusers import IFInpaintingPipeline
 from diffusers.utils import floats_tensor
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import skip_mps, torch_device
 
 from ..pipeline_params import (
@@ -37,8 +38,6 @@ class IFInpaintingPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMixin,
     batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
 
-    test_xformers_attention = False
-
     def get_dummy_components(self):
         return self._get_dummy_components()
 
@@ -62,6 +61,13 @@ def get_dummy_inputs(self, device, seed=0):
 
         return inputs
 
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
     def test_save_load_optional_components(self):
         self._test_save_load_optional_components()
 
diff --git a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
index fc130091b5e5..961a22675f33 100644
--- a/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
+++ b/tests/pipelines/deepfloyd_if/test_if_inpainting_superresolution.py
@@ -20,6 +20,7 @@
 
 from diffusers import IFInpaintingSuperResolutionPipeline
 from diffusers.utils import floats_tensor
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import skip_mps, torch_device
 
 from ..pipeline_params import (
@@ -37,8 +38,6 @@ class IFInpaintingSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipeli
     batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS.union({"original_image"})
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
 
-    test_xformers_attention = False
-
     def get_dummy_components(self):
         return self._get_superresolution_dummy_components()
 
@@ -64,6 +63,13 @@ def get_dummy_inputs(self, device, seed=0):
 
         return inputs
 
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
     def test_save_load_optional_components(self):
         self._test_save_load_optional_components()
 
diff --git a/tests/pipelines/deepfloyd_if/test_if_superresolution.py b/tests/pipelines/deepfloyd_if/test_if_superresolution.py
index 9e418ca6aff5..52fb38308892 100644
--- a/tests/pipelines/deepfloyd_if/test_if_superresolution.py
+++ b/tests/pipelines/deepfloyd_if/test_if_superresolution.py
@@ -20,6 +20,7 @@
 
 from diffusers import IFSuperResolutionPipeline
 from diffusers.utils import floats_tensor
+from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import skip_mps, torch_device
 
 from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
@@ -34,8 +35,6 @@ class IFSuperResolutionPipelineFastTests(PipelineTesterMixin, IFPipelineTesterMi
     batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
     required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
 
-    test_xformers_attention = False
-
     def get_dummy_components(self):
         return self._get_superresolution_dummy_components()
 
@@ -57,6 +56,13 @@ def get_dummy_inputs(self, device, seed=0):
 
         return inputs
 
+    @unittest.skipIf(
+        torch_device != "cuda" or not is_xformers_available(),
+        reason="XFormers attention is only available with CUDA and `xformers` installed",
+    )
+    def test_xformers_attention_forwardGenerator_pass(self):
+        self._test_xformers_attention_forwardGenerator_pass(expected_max_diff=1e-3)
+
     def test_save_load_optional_components(self):
         self._test_save_load_optional_components()

From 7d0ac4eeabfe78f5c38ad6582bb1062a43195a74 Mon Sep 17 00:00:00 2001
From: Will Berman
Date: Tue, 30 May 2023 15:18:01 -0700
Subject: [PATCH 056/199] goodbye frog (#3617)

---
 frog.png | Bin 110438 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 frog.png

diff --git a/frog.png b/frog.png
deleted file mode 100644
index dce094c892a958a6f8dbf1ae30fe09e8a295fbb8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 110438
[... base85-encoded binary data for the deleted frog.png omitted ...]
zSiDZIg6tU@*Hjb)Axx`2>-x#VTi0Jln#pFYXY<@+apL+g6nUV<2p3nw_FOinvc0k- z$f>I4nmwcy1Y?0B)P1`Op2(MBa+tP@VSXHEcMp&6z*Nrv{NwugTJ`pc7PE`i375Yk1}E^1)~G(j#Ej?%Lg>E<0i!o}KN58Wed#0_q`F2r#rTK`G`68$eUq{rw#G$Mx_-4Piz(xf z@)*3UzzVMk8DbhW1@MdoZA6CT7b_qmh2PrPr_E&7k!J zEr|db7#pJlP}2y4yA=zHutSdy9q%@AFONQm$mT70_JON{%TgybOs8&M`&olC2Mv=O zEREPL^2OEo>`KnIx>Zv`S7K4=UgM^dFs;C&iI^owqqHfHYdP=w=WX}Na`Vwu_n>R9 z(x@qun2MRbM0}QvbK)86KH{On1AyAl0RzyWklf3<0{+wxq`UYqIg%NNl1UUpRTIe7 z#S8Jr^RpREXD0_Yu-JoVmn`l5&wldSd%pzKKmPdJN7T*E&hmOKmmk$RdH<>%zy9pm z`Ehv=uFt#eL*J{veskPjl#Qmf75g_%pIu)6+RwC~|MKCl{l|;n{cgSf(EY+|d3uxI zy~dw?Fd2`^Sq-ocC-rvqVsM8?fBTo<^xk0g-W$IP4?mX8{dODY)vL7)?PuQ!W?2PF z>`_8#fC;J_5bi8CYb_7!##h%5;K`|7T*cZ|bINIGmFuGeZkOG9czU(o3Q+AtNm533 zt>nN&2UTxq=L@}9EjCa5JNoW_`k#eg{QMTHM{QTtHxGX9SGphk_~OSu#^3qA%g*rv<2{p4!%<3;yio7QFMit-hhjHu*r_gG8Nnrc(EIIZ~em0LFWG6k9w->lRI$z?)uyR=E!q#bo+AM zS7oy8eW1@RTeMf-v#Ztq+pog4Ut9nAw-%2c-aXle?3X=oo5r{P!qJ!ig5Ljqdhxxj zP0M$_8vC5)^?d&(ya0Ig2hD_&*4Ax(;ih#nj9JP&V(G5kT2AjA{pDY-PEPdu-;P&p zm}l=14r>!?vn%&7eei76rXh1=24l2jgvmgA9jao;09{)`%q{c>^XJF$2M_<~G(A|z z-lW?9+?ziuwteMi-n;&l&oA$P6#wXtkMfFs_E*={-J5^uKR^9`zJBmsXdf+K*uh)3 zi+4VE_1$lk2Q!>s12-Ml&r+7^?sZ?*sD+fTEMo`F3V71yN} zrJGfBP`kS3rl1*XshpCgZQ5R?vvpjyX%j8Iqy}dO@`Wz~lxV@gx^dg#b{tmy@Vx7v zEZRrQ;a=OHZey2|Sq8N%WJ*X>BJLC1aCn1hPB;ZXU{W$wQA{YRs$yveu`z&uk7qXp z6yqAKmRJE`&Rg?-?61{z-EFe87 zD3VNj%SFlPH1=u6)^Ffmb^KpO_w65kxccY6bLaa0=dRzD%l!N6_~uFTD}VJ@um4Z} zmy@?%DL($w)j$5f&mTRTedTZJS0~4R^RK|a7~m|=zc7QJ`$agpb&zN6zIWX~0dJ z&$fBd#dX>_D-^)ck*kUjuu`K4<7mCgx*Mp9f~uK-L?cepx*Z>Bm)zJDzktk{l$e59G`Je`bWGWSZG~dj zZ?{#T>u-HsTY&M@eeTWe+S;eT<3IhZUtYPRcei`5UQPC^yXz{{%YiL*t7kts)Z$r( z$K{QxBy95JWG~5f9nWTe<2Me&Tg%H&ul@WljGET>?oGnJoNqP%WJuNgt=qunZoTRr z_5Iny_2WOR*XMBa3kSz1@XlSRZES_X>VR&*Ft_c%de-)0dxfQmP>A}0#njidN`-gw9*bG~&TQt=%XU+&>E>iL& z3Q`f}43CT1-`~=2ZdY(L|M{=KbGN9U{^aU(lzMj9tsXu5C;#A&{`lKJ|2O{j|LB*# z+`s?Zm;dCy8s2}fz5LSgt0$`!PU(=p^!AHXUaR-v%eU(fZ{sZF@d*Ld^J7>>c<=zm ztJq!ze*!UwvQk}EW&jSoqm-qWhO6GTAgLxWh(%csG*Z^0CK~`qy=MS~kkO?{!P~3o z9RzA(5*)Yg&AsWJx4VZgF2l`}*S?DNEt|O5#%jNkvq5(2R$?ExinPuU!L2tbot^W+ zEE}5{83BvaXb7HJ#RGcA;M9nXi4YwcJ8%viGJC8FUzQj=OlodQS53GMS_UeBFz2ze z^*UW#+2vB#9Sl2iFlA)mNf{8ks5+1+cqMITd&ssLjA65h>d zFmZecdY$l^!#&^$LqXt_qJgUBY=cN5N}4%i?_MhJV8{9*+wl+4`5iP>tWzF+u^7`h zuC`r=PWKO|`@zo7f1KOJq?jMQ^;Y`e)3j~v<_&!PjqBIG7(e;+`S-u~%B%0VgB!#1 z7n5$Q$2VU2`Y)u#fVw_?a&J0Fb8@>Iu2#>_=6~^LuYKs@!Gq!#J~uw@)2*WTl`q0a zPhULTxVF!^-CRC9IsAO}_Age`I4)0%H|}8h#(Mj7cJmtKuheNfu9mQWt+@8qcI@YW z`9JTz|3O`I`NiA)zxhE8`))pkE+!0VZ|dKAeH@FSIyiXmonn3*)%DMRIQ-c+>)m ziG#sS+RskE@qhY1()54(Yrp!l&FR0@2j6r*S})%(@RLvb6Ib0hnv{p9k1nT2q50C6 zq3z_!b5|e0vQ07iz4`W|`=xi%ueoXl*+8IRMCem?ff39lHT2DBmVKm9c$G<2fewg0 z1QyBQ38shb_@X&LDE3j+pPiX(UREpSH{kH|gtzXzc4PbL1HF1Yxpuu zjJTvmFq#X~da-TSJ&gCN9owc7*iI6z2X zFJ-zQjsOfgGf*sAxkivFjYDc6WLvj?`pwBrl}+xy@L z_2OBG+nC$fEo8l|oSkphHz$+zcm6E3AN%oarQGPjc7ttd=C8f6e*c^B=t(HPR~)>$ zY3>x|$t7LPZw$@#BRF{7egeU_aP%64FKia~`lX-Se)I{$ZE&;s-B)0`4}t1~iNk8`(dP5^>5K5Ef8_6f zCkwRVvJ63~doZ=aL{Y?HNp=zOn|*$fWx|E;=o`QH(YL>I`a(vii-`^6C=W;7 za_;(zfAnAd7q|YSzi>BrdvRGj_y`|eh8LIl+N)O2n=k#$vp;_S_}y!er}ptPvvPB} z4SADdKXj=&dR>R!Y#7?EJlKmI1lH6dMU7cW(Tqe1$dNC|g?gg7A9Cqo|9aFiuEv7r z%`0z&nz!TGXUp}M{_fvd{^NgEhFl)siHEnJ9luHkua~bqFWqzavp?`T({fe}dU5fo zZ8dS(cLOl?-ny6`;hUItoEU za8==ECDygC3fDNT7|_5*%d2#`k;N)qZPO;HsAb5CMge(3O<@g@i@9s{s_!1G+8=9dr-D9d{e1`gWMqaCG$Q^gTDL@cQe}Ol0S;%A>M#}F&t~EbLt&u+9KvKWt~V3wr$7At#c%)9e%TE)NeVeR zLEHGT%^3ko$LKK$CE^HpVQ@cF6y=yU(Jn}I+ts0}vX9#|+Bh(+xjYQh%OAaXu=kzY zwOi{lr^Cr8CxOS4W4`+i^vx@;euZHVRu`+B-TuBtZ_BP>ifAr49Z0L_JTZAlCf%C* z5XeVS!vrukK%%Os-qnyr=7;p?=FP*~i~G;^A|-B{-XO}+g}aQa)K;QR0TlM3;~fHHHL=> 
z`({(1+Mq%VPD?~m09BQjyb}kYW-;Y$qTY1nCZuSc=paQ@_1Y?zWTMFee&-fu3r4;7;crr*r!jX2d??qU--0oiKOB3bn(IOa0 zHz&6iKRkz5C!IU+D8uQqQkL5n_YMwjQ{nw&?!WY_^HDF?FM@5RZd!Q-(R%MrfBDEp zb9;rex|~+fhAxNt#w#$HcxrCdM*tmAf?@{F6Z!49&4`Wc>4RtC-~IRYkN)25;>qba z3YV>neTpuGY-XZfJ%JX4Oh#;wv#2qb1(M&bk{khKR1>Zopoyp!-crG;BuH8>V;4JJ-WUKaM(=GDl33-fK`r&YDhqtD`^2_I=^Ou|AIP~|w*Zt|kX7lh%wLyBg_;3G7 zx9a9~@D|h4kC)M|Iy(CL*XVlFZ=hMNu73O99Q@hufPT^%fA+k;+>U3c!?o$dtMW%v z_tAEY;C1BJH};x~qDrL>ZT6sy#u$y!Jfb%TK!%DzA%f2Y3g}Uoy%kh@m=rK89aV0^ z*aRwBgBfO%Ar75vyV$PAVU#E-OIFZ~2#|?P$z_dMrHz|M%i+hDaO#NxNSKR; zZL^+KaI_5%lQ^`f%AVQSX>b}dTf)(is0E=hy1XpuQ4ax#0Ajy<$Zr0{ub%w;FZY)( zhUF*oTl+AXrn>Cg-qnShzp~k^H*tFNt8Xn`fSa#(7gyVh%_NLE-E- zH($*cPy3H9?D1!dKlv^F&L2EKU;WV0x4YpiQsV$zEpdq z>?~{7tD+-dC*aT}NUTsQfH@8>M3rPF4#61)t2~Fo6$OOS1?Nf+m4gC-1f|QXjV{*X z#X2np>rx(K9I~chnN*QU8T-6a?fULX+kbpHKI!6GlZl|9$&N`x3Kjqx-ko`{LELjN zBbpRgGu2+bnKKjtB_lAuGp=SO0W?Db58znQ;ZJRk{L*fOguv!FTQCV^Q8g%b1}_X$ zc-F%<%Zt9BnjDJE40?h4o70Rw)v#FH9g{M zhx4C%@3Z?K>j$4TC*6W`I1hG+;QmO&~CLlh};|zWVynJ9nA)i~h_D`_#k5GPap1xY<5z z6I9L3SKflX%0GCpX&=zec5D{%ct(Re0u5Y^2o=tNf{^Kn_vBt^YHOU`Rdt)_21Fz-weh3 zl84WCrsqJOe#;a{yuE*^lV;TiAA^`{)iAjp7Cu!UGd^>!! z9-g#Oat6tej0^-&0gwTZONK**Ik}11#PNh};;8XDW7uh5FlPmHXhx7s)XYGlWdmW( zo@ip$I(=!zpr2B)42_6|*;tf!tyGmnFo<4sIw?a^-EMfBQ3JbRr4ABdk)<`1r2$@%j1N5^;W&~|)&c~-yoR=#ol#iwVp>jx*Z zgSl?(`PGvTAH4l5pIcl!86G_HxU}<$>t@ft`-cZbzP$I%T8iVtJ($m{^=bS5C#!!M z!_7Nk%I){R+aJ8TynJG?y7}hkCr=;c_W83eg`TEK*8Q>@Rtfz`l@?qD9EP?azaCRg zz#f6AK`KCYf`JT_QOf{B)=5$PJdEEg>=SexMlujgg2<47X?B|EG&+&r|vU^!JcILA(ltb>OknApy&z^0moHp|u|-XtYnUDV8%{ z9#E12TP54shQWJBUcFae8WXVBG$x%YyfkG2L*cOW$ORPvYwsG51)DP(WA3(kwUV=i zo~>oQk#^{_NOlBHjZqOa=dI+kG(2C$=iPWQB*}?(X1^Vs*$|maL#MV6xW{hhan3wP zoCGYN6Faska-3d;SNHY?tiAP4r z6?scAs;)hslxkh-RDLYQb4$00onmB~vLBQ#d5ndWco~RRed|3GjlUfGSXfa_o9<>&pipzWz(Uest5Pr|0r}|L&vjf2-W*x4-nW_QOBeKe@YKr2GHR zzn`sr^K-v+D=qtn_tUzQ=*}dqM5i%^Nx40J(SQFBZ@&F@c>bb2{mevj8ZJ-Qe#SOj z=GAr>aG3|?VuW%qkuf`uzzG9KGa;$U&}X%=W$zI)mvzqMV01&CS8=_6c(KH{rTY($ z_`ke@1tN)pU|%>|L+8U(hOf}%Mu}4U?y9}!$}63{`PR><;OgOL#j_{ZLhYX3 zx94a3uY3u(TI8$h#v4Np+HZV6)VP27>|v^eB`q(~`VKeXpL*A77-$nrrg=+KpYTnztT3vU>5OdHkM>s<`(g$|oE6w5mS|{9))Hj&WTw zON22(+8mUH+w*$`9F}p;sX$2teGO*;&wL!GBsp;c7Gkqqp-~+$BQ}Hz;>ZfKMsi9f zLdl^I-a?=m9k^YWGXtyc-ak)yYz6I z+7u;aQeg#oX>5cELWQnis0{WQ_MDp+eB^15Y+87a;1O6ci;6I*vSF8V&XPsjB%7Sq zsvX#1rodSwIVIcO0bU~P&=i;uoS8Gq3MB!NGO#CY6&|l+RfhS@Fv*fe1%FD24rGY? 
z9xBO?9$g(4a9rBa$<&E1PS;Li#dSHMRyMD^{d!)Fus-d!!-J1MSxt>&!iVekpZ;XX z?Fl#A5$dvd`u)dn_MkqRJ-mNah#c)zu3MpNKKbm2F7?g)wjJF1aut-M~U6$O4XNDo#J-!V%(1Fg6ctGBdH3t3iN43mZcj)6$S;@dY zQm*C)>2qH{Z%&RUZeQr?;(6^Tee(V`^UU^>s2n@^^xAfGX8TR34*ACM!IPgnd~s>d zUGb!$4@DmUuB&{y+Gdd83pd6&#Hvh3b9d0>Ni)uZF=s%GR4lE0VB@Lf(SwaSbmSC8 z5ls;Q9cHJ_n<*-Q8?r04l1g_bBxg*XohK@g3wAYPKqvqxTDR5JHZ9h=SjndCxBbvZ zi3s9U8Ixfg$5!&HA0Kw(qc&|~HpnCf0!fh&$e>2M<#~^Jf>4oF=;|8h4(DDP=jxzA z2(x7Z1ak@rSwYYw<=CdY5?zU`lXV7zX;kV>vLj>r|ATpUp=dxQBr;6Ih)M;3suf5L zXiMSY#rXQZyHWWjB>@{;xJY9|yy|3|{C-t!IL|>BV``;?rg*yQin5*-6%3n4k3Va; z*bXf_@KH8vvLVknA#Ys{57V-hOp~&j)t@|BR%hF~AhYtbv!(WO8t+-@%*w_2_9u^j z7$)`c-jSyE$3MC6Tm#0RJ$Q6bhLlI)1Z^xFDMJ=Zh(qdnr%^|bAX!wbE{jx_Mafss z7hF1~;Gh`OP!{N&V?)!iE(3Fs73MH=2Y1R}`iu2%J^Y^?nT+la*KKbF3leEo6$;!A zKQle}(xl+fwGsAvd#BJ>?_Q&?e@zY!WVEx7KK56u^9P?c?PJOF&PYd5u%2Yb(iq&wt_91Gfc>5?E?PF6}D3etHl8hlCY?8ms z{?k65lMF^I<_sjO8MC1Z0xL3E1xDx+N>5N5D4Hk%8!-a!yrv$5cbm(pMy*B{i4qHACe2d|8!5tXZrsN2j zLa@R?6}a3{*>=Wl$OsBxs#&7OJa#el$y&>u;MQz39YM2bQc$;3)*{eLKH3g@#{#CL zz+eu=5oE}g$e|;`)kvSLb5$3&0%Voa@wBL*?K{Yk;VRi;#i^dDcALQr3`fwE%H`Do zL5Q2z4w{S0R8|1sMl8o6ZHi{G8c|s*ciwiBUK*#hFMH9&HVu$~Sef?fV(hxGwaONA7@hX>ZzWOHI{NnVTcdypZ?>&5ageOh0Y|sC+w)5O??8D!Mr0(F( zcDlEo929T7R^PbW)PX?v(@loq{Nv@`B5(iTzd88xfBDISkN^I&tA9a$8GMb#riO7j z#Ln|F=1-H(x#nS9KK|6&7mcI+Vg0qH22N;Qz&+#|CaJt`v!+8IHHellVjdJag$44H zUY{?z3n5{2rf87CAR{WE5&|GAvN2f`;`eBSYfeH&TDcA^knbK(*pRdzpOq-;Tg%orr^Z{h1BgVUgcZlB##Q_PS zQ=k@@6g?Vc%R5PZwlPXpGXm=^s&*+aqh5@%h;k*+3-ky{VHD_96+j&u=u0M|A+VCX zq?RfWK_TFbGLbMMB+gJ8WDm=PA6=3=Y-ZJ1;^4;&6^7dNT^2=l?peRcmeqSO3f?V~ zR6g`nLTcN^ifX=0(gbSr=&NFzVNsJ(%E04{sv>N*eejWjYYi1?s@RXiCNDSJ^_&YU zakFi|^TGRfHWv+%b0Lrrhk+#p$k|lWz{*A0S9Ou15N{LjHAh)uuEsu^4#OZ~ih=4J z7oqV)$hk=lqxRbr59W=7dfW^?1j#~aarTG*_S)>$jbHd%FHRrd#nZogv-u3S@3;Nq zdQu?W&0%W2Svqcq>kO0FzkBD7z4^uZFa8BMtj@pn;lXRK9vr_1`_=g8|NZiLH?pR7 z;XnP!@`i;kOzC9O&%(ZU1=`uu?a9q+%l99RA3wi5{g*dC{^N_!KKhr-@sFt3Xx(sM zjx=U7+Pb>Q!7IdQKB?_3``c~4xwJIC#&?Lwxo<3O^5iN>(}z zOaVwykD70w~5f?-l|OlY>FH^a`(sxZPB$0V{9*>>q7$Mcw$$<~4iFq$QSgb)>x5S-FW z*~?D+2?+Yq)D63R12KAqkwMI{3Q2AN9q`2lzJu__A>R(hSuwEVJeyJ+1aj*GWapCM za+4z?fNsTBBvt8?I5xDZD*(m7M2NNHjO54EIS-wB_HN+<<`^he&cz&h&BJQxi|TAJ z+DZXfQ}y77kMAB&>Pe+zLd{)@j-5k*rfE0ZQpt*Vwds8klZ}gZV516B(;TgX>3Sse z1I2n$4dW2POj#pXkKlc?9&Nt{S7OpOX?@Vty!hnm&wlI9U;n#@zwv+g@W1)L%q||^ z4e4g-9u33NXw6~ixLM~tfPLCOy7S83ulyzX{1=DA!>1qr==CqW<&R%^{-@vZmrq@_ zC)Bjk1~Y1^yZ^z@!WX_`sw?I3S>z&(KmO>WZ~Wf#Z~WfPt9#-6-bJtfjOu@g<2gcM zU70M24lD#bw(%gg@9w!n&Iek&TZMOziX#YdSS)FJy4a-zrXU7tA^@O{d5`dQ zSKRUL$b}-LjOxwED4Mzg0rS6?$tnnB&6SP$d{^@3APzJQ$>1d2?P`* z#B2`D?V_LrFU63!vjM=4cWzm;2X=;l5(L1kH(wBDhN}T^N#!BU9ROIZ-~&}bN>7Ha z9TGa5&Wfu(l7LZ8=yOV`tS{q`2dFT`EW9(`WrrdYa5!r6d2nNvYAQueG1pRnLs!Or z&`FWFH`87itk^%qtCXv_EW-pHlvZ||A$iV~D*gWvj1 zz5m(qdRtZI5OwTtM!YlMf9Dr|EgrpAeEq9&c09%{#K!Nv@%&%^$?D0&y-5Az4z30$ zcyfH`Cx;;U{`)@>Umm`4tGIr1{RiKK55IM7{CK?gugj~mM9shMiodtsoVzKJG$1r&L{dU0j@Uc)P9ll3-UwYu&4wjZNPhKz(LASfftdWaV>ou%02Y^F|B1?45Vi4Q#A0ev^{LB$gX zgqTNIg0W@^G$$fREUIJ{MRSfh4>1iXcPXvA^djqdl1qhEh7m9Wn5peZnTR03MBwZt zxeH(?S+O0!MZf^k?jHdefJ{9ivyBRhY0TcU2E>b1{-LJV_v#}}9AFfkl+KHg#_62X zC_aq!l(rjmiUKgWuw5@F1&&xaa0nP}r_QBG;9vxbTidP&nCDGNA^VV2_7O-id(Pun zQZ5Tx#C(;nLU4@5GQkuhFbodr{k_Z860)ETBeHb8k*lV_PK-OLtv+~bwd%P)-FRY1t` zejdL(tG;gIdk2%c%=4lgiJq)4$6$I?Y|FfW1pQt(e;VcdY4>ToOkKd zXXlq2KrPty85GIXPEoQkj%`K3CU0e4NT0?8Hsbo48B0;LMR)@d(PE`X9i^U z1RQ1TY`aNUD_N|E$Ozk9fYo2=d-u>2V;?&7eI)Q$Wa8 zmPH>Ea-P?iv#Jf8;NZmf8C3nMZ-HInDki0XhG~R?$PyL7#;*(L4UqV3oHw7&U;8xMYP z(VZ<`na+zbCGHMy-C1u=$(xl0PKy-dsLstMvP`txELY2Ve&FxiJ$t&S7vETa_QKVr zeC5mS=ihzf*S~W3*M9ZY=ZoRfA1^=t@tvDT_uv0$0mapiez3atc>3PE$6xyysBf<> 
z|M%*81Z?v0hzJx`2GM<4xxqtEdXX_i+mya*Xa`m}`qt$lhN4r=RpZMv2 zvueLbB3TN{lM-K%^3DD3i_^oSs=Qm`RlB_k+vgfP2XfPi!w9-5(h}1MXfbT9)C66d zHMvoIby$NR-+#P)yb|ppQwRt=g7OXwXxY%2nUO{U+XWAd2%VVsh+x3rz=8wE;226z zMW}0kP*r;c%?V3KL<&GDiwv>va@WSL9r~0;6Cg7-MHp0C0mJ}_lcI<0%hRQ{ zpuc#74+K0xCTf+84K#<+DKeL8MkcWgQkO(M#$lW{HjXL8bP!5kP&W)BIt-aWF-*WY z0uNo>!d>*eNkIe`g-=-^X*UyPn=@c+qPe=*667y&iLW_ zlmB9Q=Rf;9cy> zr%#@|eZ0SSZTk3!mrwFR^X}Kd2{*;9FW~00A8zkGPRj=eipAo^YPkNG{i9aG^v0vr z<}7Znd%EWAI(6?HS9f7Jc0fn|S(l!c#V~EQvMDESV2Dn(mO7O!a>^JJi%Q?eBxX#_ zL0!@G{$jXyy1m@E5ksLVWI>dHK>?kiDuXFxOQ4iZjlp0CG&ELWzyg_(f>8;+f@(@( z9?I&ZD)s|R$Q8jeqer7L%X++8%W@l6ZESPyEhkh35RftHLgTX`p7v=u$|wpMU^f;8 zL!*Fr0JH}bARuB^@nZe1s_4;(cOnW=l^A1`bxP|YZ&JS=WF2LdwN+RPYz?ECymZO| zBO8GN?UMhBB*fs*+)fdJh&w|Po3Rn%%b}{E0<*A!fmaFw1!Dv<<$^Ps1C!CaM?HS) z%6{dzsK)5fd`T%s05ZwubR;BD0#67Lps1~6msPAGFr+jF#;W0*#00k)lO^M04?v5DvF z=d1eU9UCu`tV`D(bK{0Q7H;Hvb=A7fWqmSz`*UwLaf27Yi*-Mal2d!OYNpML)nyY7 z!m}sm|MmZCa`VlLV!wXnwQKKvWxTrheBaLBdh64F_1Si~g64R5rI^>NgN~p7stNR~af9u_+=g*#P&Yo`1gIQosdAZ`IH@G@qs#1Nq*>YE7`fGc7;M@eD z3Z>M%tftQ=M{a(jEH=1)0(@CG9at{SwjiTnHr|S}5yph717_T-t64dJaYf%cfAMgW zleu78zLa5sB4!{|OGc~+qESHwQ5_f&q5%>qGodqfp359+s4AH>IN9ULHvEWDk6|9czn4^K|I4}P3J{83lxv+Wf|4Up*S#pA1o z7r8I}~iA-4D*SD;|WN)|0oIGdab$hnjW4^_YluNZ7&~m z@Vs}r0nSDX14%_2vl>Xun#2fZ43pWUPtA8fYk&M~yaX{~PuUEN#Q-uP8n6JVB~=p; z1Ja%L#}Gsan1P+K2k#(wD=IE0TJ1r70M-0((%hbu$F7(JS9@mXJh67MTgK%wpD)H2 zgRF=}gZdpZ4iu%h{mDoCIxBKWDq}`_zP!WlK>cHB$+lhd1Cs0PmFFo-)OUjNeZwSB! zh~y9eNWl4>kPtK@l367HW-U3ROhFncj>p(}{r*zD7Qeh-Oq|p?myQz=qjRW+AOyao zK)%myZwbhmkwHeVY^F*I9zm-D>w*KZkkwQ>$UHa#UCgVooWerC13TRo&pk)PL{U z`h%;}%NE#J34(}EBD*((j0Mfq4Acw^B$L^WC5S|7gpBAI(W!G33M>mOOPSR$+o$Q? zbUr_xmv`&(MjejGl@4l#kYj44-ww-lf3+ExN!lC-5&%dnD;=-0EV{HFQ;r$3qV5dP z07k?%2RsgPgZz=_X~2RkAcURJdpDt1mSkc%CW%q|A#M9{Gsg9(E7Pso2pAE?OkgKV zgrAD+5fKqokbpp$4T#XdDeWY2WSGH-5i$}$MpR^z%;Z$fc;|v=1~Ud^&ni%X^o|6~ zr@j_C6nTc>+vkun{lY7BqVY1RFA7jC3(PrZFbobLv2&8u8LAghB4;Envm18$cSJ?t z)JIc4pC4Yoz8C77i>vnX?DA@`lt&R%Dhe}6L}Uu;Tn29*=7nFkUE!;2xE!|Rpg@o$ zVnyeo#OxLu#Pdbo&vY9UxJX}x&to7p0*oCAzP zHiT-jO{-z-0i`ABA}`D9%VIJf9!+lFn9olt(pT;ji^aVsi|0Ri{%qm~c2zFiI;JxW zn~X^{Yf{Mpjn%5s&vep3`1DG@cXocZ1-D1MYR$kOp)h|N6h;UY6ncfh6IKPZl|IL#gw8( z$=2rFkGhTHVvu#xmFO0r6G#BE3qk-B8W>fCM1P)I4y6Xi;1u9toN(0D$fG;r)-e^w%bPli9xLrfZX>l>5=5 z?Cn)pxXp5$cz?Ny5fRhqs$!|6S@2ndmKd42 z7<<(*u2@6XaojpkR~ON+8;rem>o%LWZS112_&`Y6#!KG-YPLt3Qn%(@=KN<2scQ*wQ+P&fkD46XlUIsBwwE;oV$gL$uN zlEo7%2!OJAQ;jI3S-}~wQr-b|cDOg>49WydQUq|`yE6F+)Dvl@G(C{nQ8C}&-=E%^ zHP_4f*cVgwv$C8Ta+mt$xL9sqtcHtrT;$vX3QHngc0GJBr2ok29`Z)~>q)O`07lRb+5)Rop zu$Z1*b!rgU8N*b>$?P{_kfJqIRJnRy@;d%AW&FE zG3}r>hPE`mdvkJYkFYR-t~i{&dos}hXuX0x-@ECj7ySc*Pq+D=P)&#XVYn{Pp0zo& z&8)c?hcW~Zt=UBzHG(q@JXsCVh`3+`TC^MA%(-aVc2JZAO*P%yrf5Z@cLIb=GfqV` zUbwK?Ws_}_v<2%GM*}fT zfVLA{6TkG{0_jUT78#Ne0Ry5Lh(JOIN``qSu74?aG%b}FNf3>Jaw2rjKtK(ML|F*| zMAUh#)th3ZC=zNXbp{@gP?X>Yi*&~HR}Kyi{j$kvCYnnUV3df&2^ekHK;~vn9zyIj ziKMI;qo$lI59T~+A!G*3kr29+ks!%3kSkmrW4c_gV}v~iIKnhQ>v0s1TrGz*YGkV$v#+}N#W&=&Cl~h? 
z{r67!)9uyfp!?#@yKgnk`uy~GT1*daplsQO^JSA9an9C5yvZ$ z7OW##YiiZnQO2Al8^})EywfovAQT3TvqRtmqME^j1tX`1s7SceFeMNK$qGrdkJ6^t z#nfhPv-E0%X;MgtSW<^b&kRS9)eQ z3EnsL!H|bU2?+giNG*(qhlTH~%TgBv#AzH+VklrbE1c=gXAUOxxuo~Xa?@_qufQWn_Qy4nm{2b% z*@zXPb4mmV5rk?p?r&Z@d3btubhJOXi+N3hY~~&Vk`AEGS!sjlpwt0(O2Wl5!FouS zB0a#EM=VG=1=Pm-#!pOVXC0r#VVSfCG{g!ap%f};VxDq>F=!S*0|f+C0Wf%}8Q;0Y zGaz!#3iCHC^sXzwM-MRdw480_(2=~C|h&5UTSeg+E5FrQ% z8G+`M#ndc`3^}(cZe!l2ycOvbdxOz%G|d3AvpXB?xa@#CFD1UT;NlKgZG=XcRFYXF zck>tnNL`{QEI60!AUWwd2(V<#l!1Xk0X!IIP~54L0R*XJ&YH83qm&~l*JM8H$XQ}Q zROj;tT_2z6U%q?I!`cs5s6Ee1%8oH=g<}U*QH~v^O3P3eu8deOHw))??rIHOq9rHE zkVYWF;Dc4jDUZo`90xV6eGyZi`nD(?QOMmkjW8u@unrY$lU}9K1I{a7$ylMUi!zWl zHWn5gl`&dX)o6&Tt;Vs$1J7*J;~|Sb-tX&q3)lB@a%Zcro!j)ui~skBkM6<#jW2!v zgf7{4nR~TTXcPg^NXeWf3x-AH(vTc`Zy^S7+jMbs+ZkQ5L@zFoYP2Yjv+I{8IEkR6 z(z45)^V?)?OaeGEh)WK1Rux6Dm(%nr+2fWLYCYsg8OqU7%A+7+!KN89keFwVQ9OZW zG_Q=Rwo_RO?5wx~#%Kl{0348GLMJW&78EM+6P!+RGl$8(&JXzT@cQB2>wB|1Rduth z4vKoeE{dkuC#hOkuZOG6c-5uF*ezq;K;DwBqHHDiqLvlZAOr23>^l{^0wV^5DZ(6S zPBud>3_Jnj?xt>;1P#bQB}bK%CB+nD8ng7GgV_Mq115kBs6UM~?B+d&z_dFA0T3#~ zZnw326%?as@-~Qj>$P^`1sxy&dw?;efnvvjpj-yFNT5Kld%+kzNn|R=tis5i8Xcz7 zx=rISD1gnatc^Mope(}7DKUL~)=QUu^~wON?Xujmq7;=R4GGX#$n98#NZjxB^n_>GO4|_3Y(aZCNmevmRnNw);o2Fj?G4P zOgun@{p*w5S{}!%D51piaCT51jAvU@H?AHwleuq}CJp*#diNFBZ?u0l4D-&Yo@`;l zuNLk%hVdVKn!od8td7GS-)=^Q32mc^qe01W8OpucNwv6m&e)mP;DW}%As_|k2&n1S zxE@%uMKv=&q?{vlk^~^?kPOz=W*?x&=a!|WfzDF3SPN}xla0HQ0;@QW5o(_8ywVuL3z_hLvX&l=+0=={n<3^)g#AlyGl{V&3fYY zBo+{P({IQA`9XPu%NvbqVJmG%nb-(liLw0hSIYm*G5o^^`Olx7UG@FzuTPSL>A}(1 z$zEAdX3Xdnj;7X+i&3J1Cvsgs_Ga0TsOVZ85wlRInv;yt46~5|WdVy$fxII(4HG{p zvlWXTA8z8J9|i}^&ImnZiV<EwDc#pgI~v(Gg(;%LYnhJALKud`Ac`&9^(`Gb!#SpBW^AB!m&Nuts&o zd(O%tW=c_p72p&$n~uxQbPV-o z9?`^%QGm&!k&VKYcP}o7|L)uOe)iqFUp{%G>Yra-ZFqv!v}pUhjQ|PsqAhEmvJRWH zw&lUR@x@*l!m!??6dkxwvRRT0hQb6PfF>QrcBeN$BrK=fQ5K6uM|%C*iI<$xMu(J1 z>bf|Xu#WKL{1QlugYuJ2OKX^2ZKtC)$sf&6#PzBMJ3+AZsvFi=z@7>&ew$Vs}MmwiUolm`RcbyPL= zN+@ayd7F&`X;5KhP$f{T3<}etAKYgvc>3vhHS`_;9EBM;#{eB7WXzfe$w1&NJ54F4 zfN}TPF<`*7OD|>+R8}_It^NU=m>{6CF&UHqPMu@+P?qcig+{{!nmN{cI@@dZ=0^wf z*QV2(lWvfn`^?vOS1A9@~r1fPyTg1~%JnQ6YhzrSU)2--`C5xEs5~c=b24)Im z;E)``#GnDJ4QjLkfyjuJhzJ1zvw;d_70V(qN69J5n6%H@oAp^+gRMdYNQ$zXT$&=F zArTVqrae0^A0dJPsNv2H7*R%pey$=+&h1E)=LTdf@z6}K|jEohMEJ2`M=!z@R{4aixS{p`@}% zzIIKWL+$GWt7mSy*X&PkPG{H3`Y2TUp`11Kd{WH|Z4#zV)~k5AP0N0}==$XtH@3qr zNSh?9ff^g284!pW7_tEZc+eWCLTnJH0HuKgbAS#VqcMtsnu&p`XwF$B=bQ)8!K61B zO?tBtAejl0ArS4RSudMiyGgzQ2th_51l>WOHG@UVi6oPamPgX0t~+;awAg7L2!YI@ zDnQn%U>|^;QYM!z^F$>ojg8MI1UD*b5M3=p+OE>LHz~UzZPW_)f=$|Z>|KaCvpX@nqmz%3qa82-e8FeL?y7Ab_9IJ{j^+QRBSEbc7^ncR8DwAcmLAQ z{rck%zMod(I>#tc-!fa?P`$AsH@cwTie7OBV@>h zs`=;{3k4@5P*0hZNEw79L<0sE%$6-t&IvP#We`R%(;Wv{U#3~$r2rAYn9LIyGe{F^ zM}djQG8j~`EJmzlQiCh=q=B-AYBHJ4=kw;MZjLIyUzb%=HPgDDSCbkX!8pWi+byp0 z+12oT5udmDw9n5?F0-s9XW3<{5nig|6%63G`@5b?@~sfmd!?!h!RnH)zKuQ zWpnElT&5vyI@c$sMj?S$?Tsan>@Fc007s0Kk0V%V@WymoKVU|4VC5{E7jU?4K`6wo-4l{}7J0tOdq&$Vd)c0h^0+sAC? 
z(5z3<43fBQEA`Y*%gJWUTi8aWpimhD#*#vd4yxcTo{q!z==g5GUJVz|K)LU8En>6z zYy$53mzQG?ll-%nn{WKE`{3EL?-j*dix_kumk=F6pK+}?0>;FgA;ENv7!}*LTgTjO zW0zpqrlf!ZfU0Un;7Kdky_38A3#cdBZ7eeqs~DjPFqza zJ`ke1U80@Xc>%1ed6B7ts`Adl9I=GDOw?@4@`^b%bF9OptR{Pt8};5n<&TPT8p5QY zSxu9WOH&-Q@A7gTm)riT>lZ^_jKf7t3y@9H(M(lIfz%XC%>a-UJYa!V0o4FCVvSG} zRHy|~aO6oC71hjCOd+Z!k(@PWNt(c*qGeiW{?z z5jEyPH06{sb2c?lv#iXJ2|SWAAVXF~Bm+e>Fh<&iS6Gn$e@y*XvnAP< zB|%^{BqT#)bHK#TsbdO@+93HZ)~0ujYx<5)nznImy9v!}Xzru8HZ1z)tZU9&zlwg< zhBh=!Xc}-ju|VEWb=cVbuH5d+?IFJ%rngycMDI-}(V6A|dSoqt0-7?!PCI-h_4$jwGL!9mX1jr7+5UYU@DOd#n z5CCDzrpcsW%_ap)R?7%zfacPRWPx)Gj1JgUYp1O}Nzj+!1gu-JH(<0A>RWM3?0d?2 z|0YL;+64`#-d~={yZfC~k&}3z=#1S51+MS*7=nT*2qFhi6(EcSz`TJfcuY1vzbWr- zcP}2Ue)QnMv$IvRKiD{=+hQkt1GarzPxFfEaNeImNy+fVyF*ipTXlz%^W{k#)>wAg z*)AjG3{Z>#dWQm-0Jca5reqk9D4JC7)DbChLYk78U;_1?b^?*w{XTircG=dkY)$gr zI=b(%Uz{{2@bCoAK9(k0K7ITB+YkTfVmLY9{`T&7r-JqX$CN^~SfVycEVZ)0a0E}|is0@8azlK?WSsi}d8IRv$utLm(6R8&M& z<{4`*&PYuPfdf-icJqJ?K;RLJBM4r4@B0;pzJb1jF0_j!Eth@Ubhx14Aa=nWobN+_ z5^%|`b1pa+oR4HyBv;v@l%(k}ZFl45Fx;nfUGue+JJm^5O-Kw33DxEk7=XYWXao!Z z5iB4CgaF_GJb|Z~MQ#9UYM`Q3YN=9G=D!S7tg5Nah1YYYm)Hc2l$2o(v=|a9ff^`U znK!{|Xw@XEWsq!AsAililF=!7aDRNAjH2R>S=f_f3VV5A`fy**v|h9=(&Wg3D~m|B zUpt&e0?pu?d>SzswcW{wy~)Yt-)wGvbF*EeecI!@>faR2RT#+-1=z7vL(gK$N|nW- zcz}>l{`0Ht->s+5SN@~(#iu9W`+do1lUl}&t?>Z+)7(1*;YX{J>${t`>+$XW`Z4A6 z)6-S&?rv|Kr4!G)!*mGX(3wCctx71UwIZqm=R`%b7_FOfVM!n0bjnzyiQbfLINZ!33C& zUmyV$gQQwil9d8Afn-n>V&y3 z{*)IFdo8=?Z}w%~oPMw#L$g|4JpJekczgY;|Bxg-?E4=*I2+&GZ@nuy3lT9^Ga^** z3`!7`ssSP-MRW)<=x-$dYCSyh{@KOyA^Jt@8YffUyw!A|x@dy)i(x!i?4o!lWg~oj zH+GpXPai$%E}fPB^zQcl_4Y6}22(+A;2o==%3Nwu4M7C}2%&;ubmUHOH%+1dLA>Cp zYalDRQnL6K2ZOM@8vW&Wd+$H}#DDNdwHBPAnM0c1hJjPpdTP-(V%As`W$ zVnqQ}1fC1u2vJez(g#8#LKlF&`3OxbKIXoo_B6DMMO=2&Ho^4-PPxmvBxhjiqjMY_ zH4V4X7^@~5*(FUl**N9hFx~79ugBrdB)3KPAjK4P=FFNPq8Ta>phJy-9xwnfphF@M zLI5t<&N|jn`DOI$JikYAQkf6>47M-&L_}=LcSj7;JOp^+b2@8UOf+?7g zWQ43bSt(fZY@aniLuVK&H|Rcn_<2+1b~=Dnwhkr*ZvE*Bl9(d`Mq_eL@Q^F@XP>`a z>(w9rH_OWifBTF2?icrX=;Gt=UuGO1G|Tn}pWJN!{@0gn-N9C$h3_ZZjeJPPW?4`~ z2H*#Qp>=ZpZCCOr@T*w7-<8D)wLNKki&S{wJ z$}ijYwATC65E2{$PY+K!dJ+e_Yj)r6)9addH-GmR>p%V1SMhGQ8x#O)em7|ebH9Tb zn5qF1m;nN+Db80yM4Xe^M*z)CJO%==8OsMm^QeSkqB#3l%qrQeMpT?j3_(dz0Tq-; zLD4|wH^S8BEQ=8sumJ-`@ey37u7kL+<*Dmd)^(vjiJ^@?E*y8nv2X!RV3VQ|dmsXE z$e>V)0NEtQqPUY`KVEO9H$!=zjRh24Q1&V+uCWC4?iDa{?Q~?U1h*UK}5JV*dFd$O0YM23wnj#=5SryGD z1tD80STjln6)-e%KoNWg!KrIL`Nu_DmP{cVM4>UsVjyU%PQ(2);X-`_QPQ4QUdf*< z31!^DMg9Dro&U+dc)k9rIlK5od4GQ=zx=nh;UE9#Y=5!{ukJ2-?UJTPT^jejH$hN# z2jshLefu_VClp|lY7C01#Doemkz+I`V;S7^=cW2X`fL?GZyr4I`P?;54&x3;l7OX2 zoeSrk-|xon=tg!M_WhD=xmW=|03?WQN>ZInWoNY*24tx0M2jPb2rAIFORS?qTmto7 zJdE4BJS_{JMZcG{3O=^2Re1B8@AoI;lgq`+zx(=fwXnPUXQ%Dv+x6h#MrA-8z!ftR zGlPmo#)hhZb1d(eOa%m+$02y1z5@g>B{DTIKobLxfC@w;3VKA=gP36<6`C02BHv<0dh1%Q^x?VdI!#tYcw|4wbt|@^xUjmciQw_@7w5`rPE$fkZQGk!3Hf6 z8X^*rsVWp>F>_VxRP~TH(|D7H=efKra$iMENoS_!{DDF;KtzWSE4FOWBmxp5+l+lM zu$i|wTO_Q4V5w^JnXg4f&BSW0m`#dV7EERZMHLiDXzsQqQzJq}6u`n*&}4oYoe@!N%BRU8lUgrEwk1(XIkzN08R3 zI6ikU{MEnP()nTe`|f}J-};~auWl!7@4p83_2%X`C$7ZKVN0JZ!eKW7Q&{Tdr|xVy zwP$FQbYIIo+^yX&U(~w;LTv`IL)MxdB^Zc{DOPk9wW3B}Nq=>}`;PXl!wJfjmw#{Lde8h; z%3G1W@+inyA<{*ohM+?9WLaX3` z7=REFKWAo` zA*?KTUWw~=VSHDgK5f=(u~HRXLBHGC`Yygcbe84%PPWA+3_(U5$U-dM4r({M{M&l_ zl|IgUI8s7c%e*b&WuxJXWhsEjH!_VJ-|2%~H#ScHc`R3(Q1vd_P z{d{-x?Y8Cv!%lgtDTOfXvnnG3n!sGLU_eTQiUwkUGj<68QIGPS!7+zoh-QjLfU1UU zV#m53Ff}7nAV38M697V+mt8o!3joCwcy4zfV?$sAhb~xPj14tCH*M`&ZkO2hZQC_X za~iwz)~}-Ledxp*qoJyR%s)3e<^aI~vKo@2U^N&m4dZ4yygTGKW4%+!s$gKIiV8po zKx$5b5L=*#*aI{OZhrI8M2*yd=9P5yW~d;l2J_H^S*nT_vs7$SIG7f-WLVHr1wj=< zz-FG*eW&&rVd$-?Z 
zN#JSnK5&c!26P184_Y?6X`2#I7n|Ut@7fx7_uv12|7AM--_l{r+zuwSvA8%b@kuHB zZ{8lhExX$Y($sp&dScRIY=We)9m>#WYvl5ocLbV~6|_ouSH?A#`#RQUKk|0%^3K1_ zJPe1!-bQhQr@f|ci~aI8owd8q&)q-z@a*xbza8^L5KmgC(J%~Cwke&46;{4|b!bjj z?F#vF8O<{aNL~xSK9qO+ve!03-QK2`uU>3U&OX||`>#%yKX3-;tMc%(kN@PK-0$(} zM<2fV#g{iPp1-?(P195lhr7+UyY0RbZanNougCEwzyNAwP#nShaGL>`RmCH6p7aP` z0s}QO6r8`}b3e(vuQdZQ6$Lj3o92tO&N7feg_&B3#~XwJ61urHOJ}QtS7X6o7$G+1 zIt?AhUc05WCp9kHb`{##y2ugOG5f~(K&)0(piC8#fjMALZ6FS+jMcyr+k63t41id1-jJ9m02*oqE2PyhL(Wh$NCvBBh{V7_LJO`#7l9m* z2L^NhAzh_hHbW(i*19BPZ%D~52}WV>dB4t5Mi2$+MnT+`b_rpmwo+(d^wSn$NLkET zincD<$-&!=F5V%P3qJjrZ80Wa`re_|A(u^C#{m6<%gPJaw15XogUS>YR=ccaT5JxF z?-A~;Db4j3bZ7e%-`=`!?#A8zFsMuYV&mYedw9OTZM|0FGHR<3TFW*%qZT_?A!BwE z_eq=U@z6|ikUO!d;-O>%pS88Jezd@+=S%9{MfCss)#WFLJKvrT@qxemLg{K_b+@a* zpMU@Lmk!GQ{(%=YB4Jy38kP`Ff?RzL0#)yClrw*IA`FHw7PG+Y#^7F z*MfOx&2rI+kgGRi?x(UEM^<>aI9XmkX^?lDYrt_m?bWwgX+4%VCPL9Q9i*B?v&!{W zhh;yU_T8i9;~&T6)9ctDLc=g=E;OWC0xTb%@#12HlR_u8Zp7t5M<6Gw3VoTTap8DX z<@LKwTS~Vtm+SQC75wbL<8^bly2p&2%?T`>b3-*wuQNCSjM1_G+UDi8m93dE}XIx?nax;%YFn`3R0z0-HDapp0nq z8Lu;{0}U7y^?kf!)=SL{X;%Dj&S)E`DN6wW24)S6J`hFd8gLDDE9@51EudLwv+_;f z#IAK+gV<6)b`{Ck0i&p?rqQGzmH@7?5C9k}f-@0C%QB_B-j>&!{Bo47$UJWlm>H6h z13^UuaB6Jigjfv#lu*^`yu3zpXx_}JFaeoDt!k!bQnf-YRg>COAOQ?w2_T!Pq0D=7 zAW&1D|2PoYA*d1wS_L&T0zngn3ABPFtd+EyA`p}F=p(owA)<@G&X@tv5YWYQ`sD0t zV#;}7Y){(g(xGl-IdK2NLG$?W5C7n!t3SBtSibq|+x(wy^OhuC*J+(YAsmzrsTP^M zD;u>G9X>m=(|-Bn^zxGjG2FGM+=STLfUw-`rE=?=mFS9U;~DBOSyIQ;K;!sCDNx~t zN*;$T6Fm!iyHASyx^X$l?W^`aeLlrM+Qj>BuD5saL{Dzk_VrtNyUle{Vh6(L7(l%V zOlF{9NwMnGB58|7m7Som7G){I1~00eckX|A-hGzV6)=#UpN7}J`DMZM-S^)=y!>3s z4k7P{G>#IxQ=q!p=M8-}U0{(cZWvSSpC6cFvm9XQ^1=ktA5Fm3KEpC9{U`;Zyv zvG#%xm1!Pv?fCxK5!;RGj6}hJ!IYTQK+ypKvNPw98}$vx1%##bz4jgUEAYK*I^V|A z*q=7x)N{+`s}i7SRRO66pxIQtvEvw=CsI&Q17pZlBn_$F?DDIj-j!Ox$dJIQ0hu|l z2n1$~zzD(2gL(vpS=cmvj~aJq9*|A23V@h`RLxRqH37}4li{FR43b&_6axh{ zM}8+N0hlSzUN^HjF$LKmgH|dDEW<1;AQ5{B4TTO}B=40O*#QHHv8lN>ppOa=96>|t zb4jO%L-&?e>3s3|KZ%b%yZrRiyEp5XzxmJd`Y&$xFRp8Ohx{7Cb?}EoH#zOKCM+Sa zo;IhKCyze8I{kQg@cZ^+w5q<1WE=sjjH83Nddz1Z7jpo$*a(`9K)n(u6m4toESvU$_-G|*c*4|7tpMz>!DrLxC-r} z?ZT>QS0OB-Z@l;F!H1(jC0vRW*t31|f6&gz&(Y63T# zcsi7`#H&qw+Whe9XaC&JKV9r_SMG zL96zokDooeh_sM_@0QDMb-7RoaDd2Nhr{)4iv8Ky*=N)~8}_vU*^k@k;|P<`ffY_x zAC`h<<*u$p_NKH93Wi=LC#_A(s^>|QKmBYcHt;Sy$>sj#v>W*5aCu!X?&u#)r(eEI zH?PX8HQo$pMJ3Vh+co=`VW^s z`9ptx`d7dB&9~qDCfvO$X+I9*b-5b{xI?*7Eargxq_oj=&|q@ZxYpIsoeQcI7uAEzuYb~-lHe$g%$z*1NVwwpMfx_{5Q^YD#v*ub4_Q3v#fi^`` zldP_NN~vzbVSRCntNZcn8t!{r*2~@7~>WeF@#SY5APsT`h;I zsoE6TExN@Lt}Yz+)?ZP(+NI?b`_Q)CqQA7#I=Uy+hH9P)`M8V^8pe_a0v?mZ_S}oh zwMm7S%f|7ll+C{0@3eH>`xqw4j$j2K2;8uem(e3+V{T*&r(Tc0%@OTXQf zS2xcl_vDTJ^sasWvhJ_Dx3Bfvds<6HXoDmrO%5YeLFI-WVhh8+xVreGKC7voxYNgH z3F}|{_C?zMKR^D_kJ@(e$qyg=^56gJ-S&m)wAl_})z?iCK#jC3Wp8c+m}?@KGpS~P z2x{+TpxF|8ls$3Y;m=$KoxSSsCnNwi49atBAp;7^%(F)X1w$18U{xS;hKTIV6CkN; zh@-Vlb#3j>ATFuvsa^QCZ{xx@p^eSb$Bv_Sfr!B)I|l$sQ0s9wt!Cztk&I2vDv%{j zQm07|hx~3Q?}oA|Qe`%F7%4h5X8=efU}9)9+gS~1Zm3f=VM7B#K>z}U3ZgI=R@GW9 zsZDAbtOyi?WS}r7C}*7?0Z``Ak(iN?kO>e_P?eyH3MiO>Wq_=bpcIrci&8`$eN-PI zwCsX11J6R4nTeUGR?tFDDs?rU-|n97mKP@<`Om)4<0_*!b{vvqkc~f7X)^U?Uu=fe5vv8dKE)yD`LR7+*zPbm57Vly@8O z2X%E2PR%KNx?1O5|0%XozerBLzn`9mk8bGo_u=brWZ1*bf}*H)El-4Di2PLP=R{rq za5(=ccHt)7ch5e(s>|K&>#zUa|8Vi~4>EV9=~ri0CGFcvyRis0=s4G@s_nhM9V@Du z@O-Gx)*gI+-sABs127~5P!yUMzQ?;QsTm?V1&1*pc@Q8q0FWcD8^GxJCJ+%bc!%i0 z5M2;r@Vx}j%>sOn@xu7jH)S0MKMz? 
zGJ#al$s`F@v^?YZ?fnhOKoA(``bI)RMg#^1AOM7^cePE60YBVySNdq-9`851ufP2Ntl!?JG~Ge}0^lXW zn##Ir>4F+e|JBj|M+D2M3%T8 zhVigbOCZxQVs?v$7NfR~0)?(Y3H6YjD%4sK`E=D^k<@8?HCY49r0M| zJGROhz+4qn@2a+k=zNgAf%b*(Ru&tzkl64`QhqEKRH?7##ev!_WN)4dRpDY-n<0?RG`o^q>(_!3U?l^ ztroxlvr!WT6+rQ*;LM9&gZFL-<$tK7ooxbQ2h{sm+&SKqm4) z3>eHas#h1#d5tZGt~N^-7tk+b?7ObLSgb==?t4?;r=`tNqk&$HO4i z01Yb?yS)72s{5Er+?1|~zH_{wX)O1H-ji7;zKu`AVli!RR2@haO=FrcX2uHa%u;0u zD!@={DYcpbYHXFFV>CZ#99n85+sB8?PsYoi^K_l)y19Na-F{78-~43xG2GIL z4+3v~xqkZLAHqjJd-S89uYdKM#q-N1{OZ~H>QDdu|8T!qliU^6;k)*r0cw(Yp}Y1)%6%u%jp|1Y*XD-~br{ zf`S&%s*tmlYD3C*soqazJ(Z~{h?pWB-)5Q@5d;Qmq)MQS#GnCn)_C>(=MAmStLlnU zRjX>PTF|PPB8biZ{#?*bGqlB!0BK$u5F*dwFCoL6e6Wg|O-w65Mx9VIO2v5*M-E&d zj_MoqPMM9_=Iw(4Xq8f^P6(4qW*yCaQqH^OL#S~5)%Vx7dnM^w1x_Eoxfm(qNi;%@iB9-iIeXYXkB+CIK}=eqOl zTff=dBCXC>7wek#w=XY0I!WbZ|7VA9zWqweJ);CZF+-+_#paFsCYP;%s*<9j;ru}0 z(XBh1;S3dLSF;gf1v6oq!D0%;1mtwF^e$8gpdeYB0DA~LRCYBs%U5qKoNzE5Q<3wD2l1p zG21xSyHRfr<$kJTR;Y>!B&xIH2h|K`mC>U`Bti@}m&wdN5UmDail7QmE0|UVt)d26 zR1Fbej&~7&0?-@_Ff=1}^AJfyfJEd72x(q>DOe`0s%izafM(1Ew3sRYBQvqr&`@mA z2XP*VjF13z=A}ykNl+)%q`6?8z&&}BMt$>(`>kpDeyG2d_>#kKX*x6(F2d<$8yb$Q z!lzyXYO}x?PEQ|paYZ}JG@y5$0ksdfVrr(EorP3z)ddglc56_7x;pVXRi zOt*#lCW*Y-y*}+eY#pdT;C#Q!byp3_vU}`d1(OBK6qlQE<9OA!zl-I$La6n%D+nBd zVXZqwnVUDYQsurFgS4mA`gj6U%iG@JwQrhfx;H6^PEItf!o{lpgS|YyMTwVBpZ#(F z_`|nj-FP1Mx7Yvq`**TEk^5nKbN~EW*-ce;wl7=iI_YW*^9VcCBxsXNO7%W=5v^UQP!Fbbt;Oq!3gEG07@5 zm$8)Xkat6!vQ!mNAT?8+Z}~j0x#@V@pXVq<#n~ExCbKJ4XDW~)s;CuE{U7N-b~I2R z(A>#w$36?g$Eg zQcx0>3DSg;@sLSW9o+8kcGLaI`QL5Jmx5n`9-3M&_=6ulK6%9*q5n+yq(C|3!AD}3RguH0l-Ajq@`E?o zXxgv3INjfP-z*v&v*f)S#~tGqeRJZ7baGx&9Zf`~OsOCt79*$ttlFRjh6Wo~l?j4h zTp-YH(6)<`kRkdK)G;PUg$~~DVsp8!?e`-rfA@KLaP{aJob{USAzIs=d=j6IuU~!h zXMZ!iyN{$V*562g$qc%dtpJwwpf@BpY83^{qY0qCCk~pyKdiwFL91glKmstTC}b#r zjwLWJ&s=k2A(|qnlLW2{O=!EWyX3gy=uKoI48&|xv8{rVMfRl&L@xLkV#Dl!Bg7U$ z^7hG!~kqa zggk4hW&)Z_3*=hoY}W*qRTK?@z4-v#ns3N^a}J4TnR(_wn2zLMI>}V%kB75Y2WszJw7J4 z%q2&Zor32W&U|_rnkU{}X>LxJXCIxOE`0y0 zT-^?DLU7JNJmzvgnniR>2bEijheLju##hFUTvOpB5}E9Bu8NC=zjC2t1J(}EnM$o= zv0U@(#t89HVFx;j6fhnHnw|Ub>%&KNF^=8Y4}X8GRKDLr4v&BQIXwCZBy1%y$bb3I z{^^&${M$Ffi^%HF{AQSjsSec*me?Mk(Dwek>#w@S1N1$4&I!8Kq9ZDKn5IexrUPU0jXCr#viAX< zy3Uw^$)g7(q`=O5atsL71S%8F{!*H6-Aw4fBNC{ZsYxnOii|n$O5IH9eysbVMUN<} z`GuL8g9J!N_JWa`AyhDjb58k~R-PN%2~D7ys(^qU&;IwTaX~~r@>`Gy2|Y@d&92{}M_rKBb>@3u#G1~6lzCPf37~f8EqhS|90|i(m`d)k$Erz!6Oqx~O-gU-l zv%Sl<6Q5ij1SjKV$%+P!<0LxN^22xZ+4JcVukQQD;pD?jjbp05jlcf8@815Kzq+_Q zZ&&wnnx4P?X3TkY{&dRw$GN2jPb762EdGi$$R3T8O(8BC7TUz03c+V(Hp5# z0duXheo-$kxjU(03DVc%7%ldE=KMw1K3T@Ardt>>F!UW(tc)Jwgl-gL0Ow;xUjrjM zLUO+4rez084xOv)z%e0^kpUJ(QxGr)FjE0UM-b3CV5ooyqD8eNIOMvW>g`x=2i;AY z>kLmn##a=P4Uu?8`_ELK*_{CZm5`3SX(TmN1O-){vvG?J z#XLeuShA{$5hI}ck4^V2tGj9s3T`0}pFKM{yNWUFr<<(z-O{*mhRRHcg@MsdJYPl! 
z3R0sD^(`1gB1CB8A_AdP9^X<`s;zNIxHVTDnA6o%N}*LOk5=a&*Bzg&+CO;spYoTlym<$c+}N!xZ0qn=K6dwcWSJigBIO66@Et>j8rL9wWoybi$U?ji)dTiptryU~ybX3Z|vO5)nwX*4@rXGF`s5i?M&5B0f25SAF;R#00cE*WnA|vsqM3iWL*VoIoe28R!2Yu{sAaK!_YXutV}dGZO$%m8yVdr~~GVQ?fEa zEhY+JpeW9R+dgf9L3>y=zJIhhy}H8I?R9bs?3PQ*18_wHz{&9_T7Z<%7sE2e)$+s8 z_qo^!+pxKw@J>b2V3uMo9NTB@sx?fFN#mAv>{=qQ`1tgLju-7o`{2Q9Z}pp-%|_gp z_b*;dU)`iv=1Lgeke5s!O~c)^e^<*MZEuz<_Z7~xgi27b5H#5^6`ew*WjmEg>$Hr( z4ub0$2P9(&4W1eWSl{pHn1Pp2o{%_QS#|Lpv7 z{q6Vs?k)n}-d_*7T<>lZQfz(Q4^^wRcD<=vB{0NU(J+G}_yh@Qw(ua3fjDEpifHWP z*{MBwYNwZFc@n}hN}%Lp;WpUHyYmp9bnQiKF9LSpn4MCEm9NJ$uw)7Ger#E1Gg4h^o_cJ#5ltT%utI-wVGKctr_zK zb)uTBDkvBcdS{I0azKW9wFu2==T0xWle4&N%v?Z#ni!o&pR<&Tr7;f6E-ZYFDXB+} zbk^gUP#5Z*g}UAU#wf??Lw(xN>U{OZVzt^%QPT(#3;0E^wd7& ztGnIwR&=Z|i4&zjHYGVE*o7d@dawA&UH+`}%Zo?h^21to^6Ks8FZ=74Z?1m$lkwfH z^k*(Pbi{-a-i`OT}lw?GxeCgmMyDhR-Yre+MRpl&q!PeiK8()CWp?50ab{NKmc_1i_VvD0|6pW*v_DE4fgCQ7sB4_i$Pz_B)W>a(q znK(~cGuCQiW~hk3jO@v=IrJ$mxI15TkFTU%Ro*EMRWkskGNR@VPXaGH*RH~2=VI5i zhoLS{T4OB56+Iy2jjl6KwGEc0^DfrX0-O_iFxSXM;Fh^NZ(vitN(iNy#`@*q?wkJ1 z-|eSy+D5()?dvYYA)K7MCmo-Zp)d9Be)o$6Z`zYGCL={p6&h?9$qT}mw5D+jI7IFP z5|E5~Hy#FZ`$~yLr9zclrolXbGbl_%mjnHD%MUbX%WgBCJbZHgM?YHI6-uMm-`dyT z|LW$}-M#+t&%ZeN!Abk8%iZ-pZ(rGTZ&_N$`>Jme-&6)uHYEThs0y=94GGW$(5Zm~ zFh&o!Y~u%y>PKJfubzeOqy^tdJkPp_+BNZ<-3sYE`g86U4fmmIQ~@AFWTawViWg2I zYyjp6q>_Q5cZ5RBNMKw6lBpO9aW$%92r2>ygk*&1*w}%OGlUtBPPJ-NuDi+ZC%M_z z`%$K>sxxiksO%gOpL3&(7|!cCwK)gFh-yHfbad_+s1X2(6*DwLFbBYh3_u8sbMKKm z+I7H=z6L}Da6pJOH)fkz1*@tFN|vgcp(K#WROh<}IS;zAP`<`0MMy|sO4&rZSv%TUZW&3SM@ zWZSW>(*a#P@uz)hLXm`TZ?(;4p!aCqQ5i?!v@H=*n71)PIm_RP}bn0 z<~)tV?fs#3r>mxyV5%;srE9JP-ShNH5V%g+Or7gnQnSHdZ#*Z*E?{+pbjxq)FjkC~09;vUxfLk0OeMQUM833<9zmqJzh$ z^yx?V*+*{m3E?>bE>#~Dw`5y%%_-wK`Gw=N(426)Y$%{HqPggq$i}^R*ILda2KES2 zEFeUKN+`&Lh)4otszO`|D_OM)#-=)3NeL01v3C@a8HfPqMs!nEO;Qed+>P}xlquC* z)dav0LBx;{$!1e4vK*J!49b9H0A_--00wLdpw7%p6~$BtWlk;{phs|MOz+_pD4?ip zpeQrB7LXC?*yu8Eg{_)Y$RY(xCY`J%6fsdmR`TjS#OPuG=K(zs<8jd;B-N~{B+QA^ zq>`mpP(uJDGG=0D&H;Mxo?WwS@bb(@3M`IF1s|7ZakcEJE7>xc<21@OINnZE$rLeA zTh{A$d+5oSoRW=8zdzKpU~c<(Gfu0kN4`0mh6yCAr@Nf0h-6WbY@VDl3lPWdNrTIr z)M~WqcEy0-ZpUA)cdud-M9#r)Ip1lW06lP=Ab|{f*s4|Mowpbe0+W$Wm{M|{BXLnl zpj^yVtb$oUF@kSmDO_qg+hDgpnVQEZpML(;i(hFPKR)ez`)tVPKApw*^uvdX*Kcp$ zynTIrzkYjvef?$~TlKD&X;%W>8^6s{HAZ73@L+0K5P^#VgAs^Cz@ics{P@H0#ShYl zpYnr8jV{g>yVq^d~3h#uL25hw$jVbxr8O7$@2^&t0$y3dkSkE@J% zO=Vz5&XwWJp3-@SF$c%1?rdCO6;*X$XjN?v=32Em;*SpJu=;%5BO;ih2#S%|Ot~eV z+dbG6&`gU#0WDUO)LOFDgqpPossfTa2Ncye8Gl;Mz%1O%vl2NOf z8W1um5|RTVBM&at;K0SpQ|e-5WHN>jIH)BV%f{}^0-*V7glU_rkOc^?IjtBC5Y&gx z4J9+pO~~vQk9IfL&MqT<=tCH%cVug+V-~T>W>KIGi_5M*gPnPL5MAsrINj{`$=6u* zyStlTRXDIF8o08Ok#nt?xw;Z^Bkd*vBJ5cy6IvCnLawq1#Lz4oW0Un%kuY-8DI!7% zN@zQTDY1oO=c6Fv5Zto&4;eh(e)BK?Papi_|3$)2H>D2J%IOuZz8_w^&FS5Rhh58~ zb|K?vv#t<5-6&X98spDc9L3NuCjl$!swego0X- z6%~}t&2Zg0CB=>q35nF^u4pw=sZ~-fLr(iCuZOzHl0>Zn&TK6JHKaMRX*j3d&5RM9 zsWTw5<7Wg6Q~-oP2^697OydN_nM#VlpbmI$1a$-k3WQZf%!E*9yC~wZ)()(qX3#7m zT11LUrkX)3&OronNM2)uAp(0uCq#3rFRGy`sA4Tvv(|!jHX5-Zl4AoVBqbJh79x@d z&*0+YlhTFafLZ(QwAN`i4U&z>nt&b+JV>@uq{H4XE`9jKR}FyP?S^ta4bK}bLvbZG zWv_ke+V(=n&0W6R95xP1u2Zqp(~@0moM$u}$#-@rDQnuxF7G#39oM!C{p256glQb= zIPuczZb&Wx7goRMorkl)J=7G4iqE<31_LsxiBGytRZOQLWe{(y5w&q*Bdy{i#^ru| z8J(`GJ{iVtZ}xaaPk(y9c1w&;^hV#T-+cdn7g~><|K;1KP-X)%|MqFjlS~}sS;}i0y z!C!>-L8LQoT@xCVhy_8jb4}MQTBU=Hhkd>u?@W_w(NbcwWOj#<(3_!vh;cFmg-n=% zvH}vlC&V!W0up=l0of}$rXz&~q(B<&KFMZZ?+&t_>NNBG=WX?zcmbSO0BUH4bLS`_ zIZ#GKQ2U<{3Q=i}^%#K8E}lqxpg^TIAWaO-0zW`q!Q_g)t(d*7i@heiE!JybE3dQL z#ET~->p|Y_VB;Led@g#R&dkA7+>~nI(bWz3U^#D7lJ34e|9HipI4H7t*W?CE=;%Io 
zw*ucC=*}RkRMB&1Bl(C`F{?Pr;t7{aI)wuBpT^zze9%qF(tro9n$yZ%us?UBuO3Hb zVmdl5eCBA+dql@YZTz+tZliq;A` zVa-qriXeg^n_~zbnrJ>S2O|fJHiMExQLAY|sWi6;%px7lS}n~DGC%}G$VVDAnSqmB zjm#^HP1}7JSo|#kC8npsMWhk_et-`6|g(t0zMS6TyJf9HT6%Xn#{Ha z0Udb-6kIe9iZ905D7a?H=At=(Wk;uvA4Jh_a3X5A*W_Nt#s21L;Z2t`4*yg;g{b&hZA4i`J|J>p^5Ig z@T*~yIaDLZ6QL*2>P#x2fCCG@Jo<=#=V$Qg&sNJ%KBVR%C71JHoO1=@=(+cB(ZnkX zt#hrnz~l%}RUHAvX5rh&u2d_vrd)s&tRm%x8p6f~ac=aLMAQHYaqdP}tw>@9Mn)iJ zh7>$Ypx{UxdImrsF%wBi4=Jw?^>$b9r+TPT)J*Z%w_?W>gS`(kkOC-x%uy<|IRQ*Y zh-!)|3<4merYbZyl+N*8M4Zw6hyp-hRjJOVAe>)=xkUhsfXylxfEBBw)LgUG!j`pE z6j4AU1PJCDb5VUT4vG$m5Rgn~K^L@`2+mwmL_=isibuTOY_ThmoSr@4 zTqe6^2Mata6m*liJ2&NF{a^|g@6xB=+%_+; z;l*#>z4_4xkK1ti;?I8BeF3kGnlGMw{Pbsc-)&bv{ATy(|NiCIzv))132Vc~d5pyd z@4uRM8|O!G4Tx6*#t|`?BC!V71^&Ua`h(wvkAJ>;@WB&+^E3wHTv;l2NP)O%;;HA8 zNT*`a9A(2`h=e6qGz>tTQK%#-(>N7Lz$uq%S1|Da0IwlQL_t)&DR?%(qOPQA$eN2~ zfJ!E4@B1Q5Q566Tg&7#kAqF#t#7Ck~6;qL%%5EyRL%vP*P^DBeIm(fCWQ`sl{UghC zHj~kD+L=+-3W}x(Mkb&FprkXD-{{y>J$KZ=ypS*g1tpm8AmkJ(8R0ypp#d0K1r?EM zrCK(fLAHfdj-Lwj>LP{)e1qtP2;7kzW~Lw&YB4P&#k7J|1VyrWKr@*c%xr7!BN+2m-ylV`o z=F+x}iMqIK(m5uc;0}q%SZa12TOT3GB7c38|1zibWERJp z#3t=RST)3EzKvZ#sYn2WKzzTdoDt}dLbq%eeRO5K-Zx?$M<_nodf$x2!**J`BA=RFh$P{pimoe&>VQ8$fArlE)=u@R(Bl0-E?YS^m{6(^OESZrp?(NS^4 zc}t5%hz@*2ZjB;3H|w4T0+Mx~%YIkx#(J}_yIe(772cEUj!p+4HaJH0Re=-$kPT=i zdg2jc%?h&tR7~7F!2bjG1#Gq;&u#EklmM#{(6O@7m>j_TEGmG2WrYlqrHU0T6*Pku z6+tsZ1n0>|a1oMDkVXoCZ09^YS1Q4zTw(v zRLr&IHuPrNhQ4-7)m+`}ls3cWjd}CXbZ{zhVNw%~u3O|>l#*!z@nkE3K5GiL9;Vko zjae_~M)d4FwU0XLot>==%l$jGKzyG$LF?~uKe6=({hjQd4Se?X_+%?5{NdvA%7+Ad zTcN-H<{RyBm;C-$&CBn;+kJEExxlnn88-L(?XW(C`+bo#Nx_vriwz4gmdd@f*Gz(s`?q;Yf(a>95K-9z6lH8(x4n!p2^<}ooxz^?-pav0IoAK&EJC2+P=n`$RH||3SE}B7WQQt{r+unxwm3$bu zjMM1u-!hwd0yK8!O-7qkhm4gJb=;LA3j3xBsMz?h^s7=|11U2`YJo#XrYbF4i(>m* z&;g;M)2+ykM(u_e`tw9TxIO%A)DPZFy6#7NbdOKui$5Y*k6&Ip+xWv6OR z%&na>G~7-Mt_cxMO@ikL)q7Y(T6$?iaxtEE=%tFUheO&y=e!qFBf)XK@eVCEiW-}~ z>?cGA1|s{C1W*-)2$4`>o_4cYH8mhbPy%xhJX2r_ggj3ORx0F_r!lWL<>rvLr6v)b z9c%B`y*8sNj(X2A%0OTa5DjS7S#*v$2!Md8K{b*2?+BHE0HIdV`5gi=L;x}cHe&>@ zG@JgB(F_=%f{9r%$*M())Qpy`R8Y~Gu7TdzqiZcT;G(iO<|A-Ml%$v@s2OU3TEH?E zIp)m3)C^~L;``s8z3&YMm>pXV1;p)(f=K5&E!g?CXOA$Xe4Xs3P$8^63bPXk}2!5B_X=y!g^ zKl(lB9^1$=j#jMkj;rayabq20Qz3d?KsZ8QV zBiyQY5j$L@>7dk1##*PWKx7J|)naT06|@qn10_>pVNgdD2_lmxN9X{}pqN;aldq0Cxz9i+E?kN?3Rbor2XEvMOEKu_S25-ScmCX$$ABo?HdmK7ac+-(LSZtzWL6zbMqyCQ7}3*!WG(93T~Q z-ZfDko|PYeE>C`#n@cz#trMrQalQgLb|j_6&;?p>yNvZgq-8)K+eP1@(YluB~ zGB(g^snl_h!;o+HvPpFkm3gp6#QD}dTb%J|-4p{-L(zHeXCwxpBgn$$U?iBSAR?;H zg#~J;W@c(;b8iQ!C?g{&12KacDzVzJKupMrX#zc{Cabv$NCqh)LTU&IHW5@UBeTMwQ%HR;~zR)KyVM>{G|D0T%>oqzZSiZaGuM76_?#I#Pd7p z7OQ8GPRcz%JnNSa4%_kO@Va<0w{j9sYco;Wk9T`mS8JRPA)M7=Uwj({${`TDEsmD#oh4q!8s1EtG%@rsC`)FaCi4k!=jCl zVew{jiwpVV#e%(Vl)vBZuBly{oUiC-zu!Fk2-4$jh`8O=5uzDFVr9N;Kkoe*gWDgZ z_0R>+HK@e4i9LB#8%mkR=CBuI~15vd5(x%NQp_y*`)%VaR;v}E>B9O-U`W=3dW6`>jw(8(mLiWQIvYo=1) zS4I&yk0DqH_p6g?Xj9E~otu{XlSx}xSfKG@Sbw~`{lR*Fey3+ugPm=g)$V*T zMOU)(Ghk^-XX&eb95vW$)#8&K3uQ0=9gFPcHawrC+kYSDDOPs=bNx5t_oT zAXPQz+6Ij=skusx5vgPqOuPMd%3H80)%}Df1G6*|%nC5Z&b65>Ps2DJL+A< zicA!XS~V3z0ILMX$`mLvw!|Kf8A4UbHBI%fFL#H$nd(qP>_})aH8z+#EfAP#Mp?~w zYM22GWXQ80XbzWvAqZGi6Edk#RpuA+cc!96_t=p=sR;p3CDr`wcHe1F+LxkK+yZGkWiDt^zH(^$7TkyGczn7VpFc0#w`F_@LwTEgJ-4yKq7(Y%9wdk^j3!hYT3hhAE|U&N`7 zS&T7Ry&dWUac#gp@Ch=Gq9CrL#>8b;hngx1Hc+cscJpazQEDRQHik}>#*$0UUGXgyzvAW6Q2_&aoo`L<6hEO40q4Z^ydH^-!c9%Pi(c>DbUWtAj@! 
z$ZQVu+I!{?!t5RfGy^MUN}>Q#P1ICo-vi99Q+&@twHXRbOlU@cOgy6>m;eYWswxz) z$!fN$QdkpcMiT(wdG7ZCd_yi)=fU~eEN%0exMq+{l1wMA*{oO#Fy*YEE|NI;=^0(R^WjJ;Xoh~rYiNgJBw{{G{`_n*I6(~IeZ z;%)QO{QFP(OMdGEPP6{ca55Kyh5B z?SR+a?MH1^-=%RkDcS!1^=OjHsq>Pl>)=7>&J9i)$oU`%7an3STC7Dd>iCY_ZT*9r z^2v{zzvTMM-f!u2H*POkweHv!WS;fZ^O;mHR0L#@eV%s74pmB`y$AH*NC!|t^PO*2 zZFiV#7!Q+-Na_QrDwyD8W-3tU)V(@EB_{R&k)UI0nHq2e{yla-PjVRLb}x5R8H<#e zxT!#3rn8Yv@fhSZmG>B51y$vFM1uG5b~FG{m8uFLB3jMBKmf>4)r^12JXRT4;6DNuz#0UWR=7odsez4?I7+YCoFB^A}AwO~z964a!% zh*l^{YV*7nkQsg8(75Ovdvxg7AsG=d8=x|JBvY&y1Wk#|W!!PJTzt)lO{Df*^dj3S z%gM#%<4bqdr)jEhZ_`&>+2z&!xO<1Wh-*^a^zm*H{We|q{Z{HbcK!X36|kiHB-g@Q zw1tec>@S0s!knR^3#+z2Cpm2|Z_@UKB%6S6xo96zQ%g8@>HtK5m-Y408um(Km?jIieksoN&?Vh z>C8;@1qU_<)74~#Lzq=lnKSN$sAxn-w(nf>2LKEtOb(FIn1~sjA<=A6Ffl8tMd#m4u?JWqc4zZzc9|anf}wKTPSXH;22dhe2)(EXmuf zqif@k_G5l?hF;;B^jB}ot2l1D{;_!nhY~1@?ka6HPtG}29rU0BqlLCh!+zMP#KXQ1 z+(cAdMt|kd?GN_?{mP##R_ABU1FAIXUJ&EhGWFc!_U8NZGq|8D~w%AWvoN)7F~bwd;isc z*+TdH^|$~2``gck^kJZjPd?sm-}E^3mLH$B+dO1oXQ4TNa{BC#x<`)>2QB%&RXywq z`xw%lA9DL4;b9n_zy08=yVJat5B*;}Q}|j);(~s7{*3 zx*qCgEQc&v3SFB_D+;&ZF z7K zpDwQC@D4jz@7{)G@BG>I>+NocpMUbREuFL%i{SQsuyV6OY5cTyknc9H;WTxAaazy# z<@f)M<|mW<=?D8?x_Ik9Z@aTgbZu!ZxR>jD=axN0p<-67Cdp$7KoB`Qt}dHw>d2>= zCAMVh*3)jvnmnPhq%<)DNF`GsA=E(DIO-k8;5`A7L$C~5Y|Q08)y-J8NwO%a9a(M) zM>^O07%m*2e4P0aU@)WH3DK!ILT5^-2r5ujE7mzIGxz=}AduPYd!w1`07OKAkck7D zn_FMsYfK6@L7h~q3TQ#iXcIZv?(mIJ}yM zq2@@92#TxX#;W^VwrEY3!OHmb^oz6AA1^Qd>tXlHYn*m|w;i|4Ep=4XdGH;dR1l^l zIt6B1Ixr4&dUEQs-X1oaJWT+ui6=c@Jh`|$neG}2r+s^`!&!eCaB;ssG}4ruB;61D z-*)-s^RHi=e=|K8_h0_ie|i&c|F7R)|N1QLe%IMm{J38|IbWSE|L6z(Kl${*KmXv- z2cd6#t)xxfD7Wn3+mweITaRrB!k zZt0zSx5aY(+dl{55$$kuvoiBb|M=;%?WQ|BIXyjps-0@PyMMWe9i6_r{=?mUzq@|& zv&FZu-eV@}r8*gID;D;3z*QP24SUo{V07xL9uS4v000$t`wDp~z@n6opJ_E=anY(4 zfefe&7?2lior9h{TjO|kl;)z-R5youH|5PFQx!e7Z_i{OGo0x>NXLR=via(2MyCHj zVvcj6C|Xb?ML<(EP_1*R<7}{*+11Eu^1d9IXil{gc`}DgF#Dgeq7~E0G>KI$dR$!< zQ=JP2h#dv5Ay9101?AbmWMn3qO*51UQo=G>Nm?ov1yxi+A|}to=ikqbAW~< zfMCD?2&`m;pr8mKW~0&0BoU}XjG=FkX`i6)_U&?&7wFEv{^q-1zy9<4Uu^&S z`d5E_G46h{Sbe%WJv)24c=T!a?CRm~oIUw3J~+ENwbRCvOet0I4VzWN@Zr@L!Rev8 z>#}?M)o)VzGM4MrRr{-7Jom+4sr%=je*s%qe*7c;=ufOnjsNQIxBtK4um058VoMOu zx=GyT@7`TMI$d7<#J2sThtI}sTAZHo$IE*kEq~Zya`;};({s|mD8k!~$lA!DRXFZ0d`RL8NKl_`DKl|#F=hN?bd3gExvrj%f z`S64OM~~sd%ab3TU0%gi=k4H1)ihANth+;5m)-X-zkc`pFW;s=t#*6% zVaWqrzk=TaKe)ZV9EbI<|3Y8A*bV!tH(W9ecJ*}WE4(Xiw_QJ4oW8mq7Sn&8;v&C& z71Zt*5I_G(|LKGM0Wa(lzFYRMe)H4A^VO&8Uw-z`$*-7-3|qNLQpJrP zLTHt3q*6=X)?y7YM#rilU{w+%8hzyiR;ujK3ZO)<9YW8+6Fc@~tY+DUv24b&9cHGd zRy9?C83zPMrZW1*1$W3lEn+{Pe>=Y+Qb~{?$8v%)4)EuHVu&3)h6S?N$uZ(aVo`*g zGoQ{7Cx#sFT*JLp&0?b{l4zm{5sj7rJ?UA}U#KSmk^q5_x>{tjL{?R^##>c)%qcRa zGx>3MGuwNwK@a9xPl+5NGa|#?Z11&xzwi6_)U!S_UhH=ACDhh|M)Rsp@*ryJSD##KjgptYN$H=5PyA>@4Buy?3BDtUesI_ zUHcMwa5y%YYMj{Fp*J9>29PpwG_q#%+zwUuV|keL zR8`H-{*)y=a5hXXHr=JM0*h&P;j|+F6DRb9AOQ>^;9+i|Zp~a{f!!`!f>^{&1n9(P zoX~~>J>(qBfqdRpsG+HAfi`2C zpcSe$)XT#T>sebXz-8Z{#)Az$v=Dg0;sI-U8CZFcwN%~>;{msYgxSVPvrkfR=>f__& zSDR;&)3YwSSCX_{4UdP>k6{&F$mg3+FCOV$c)P(@8>Z{+?pc?%-HK~7EX+M2uvjmB z2ukFpHOQXIB#xGwuiyXtFaMt>{qp)|_w?a?tlQ-)X=@#y^4;q??$i7<4=i{J#c=Z(@Q^-#RndkASReHG z^KX8?J|3Tc{l$;|@ag)8A8u)A?tr~8O`amBGzf0}Rm9e}R7x)YzL z@-5V|x%!^Q^Dn;r=AnJVD{gItO$rPj;67UD{5YYz*4FB?%g?uYcsd=0MQYctck2sE z6Iyr+dAa#MMt9s>cm46FpMCVPtww&|%KOLt`!?jd=RP72&tjhExwb=@rib_Y$1u9S zdbL{bkmU1cHv@JnI@UJll#u|*yCm%F5$1pyPLOf*)oSxJj=%cmi+_Cc?+bsW<)MAh z9ln5zr#7@#SG)P!`O}F$vh;)1vs=}>`C&5a0}hu@`zTtwElHl>{`LJQ*Td%Gr!O$S z+6k*I0Kfio7XbDa`($lxt9fQWlCZ?k)0w47j-`i?_55#wS~Qmd^U=Xrms zr=r!&>^$*U$jJm}b{#+<5e5VU>33K+0K{NKhF}OFr@#e?Vi3*4-5kKfJscuD7%38f 
z1^^NxFbfIvB)o9YkPN}W6+sowB~s){+Q8?~ zF>2vb%o?aImmNe3A`muVVRj)90iuAV`4ZrO;qhG%8o(kP-4TM&13bcs-5uD?Wz+Rh zJrR#UPcwo(WJ=8X@c4GtW7pZS9!@Q<1B0e{e7}xvv;Cd^`qkHO|9-aH?PeDR`}EQ+ zJ=}j?#u2S~pZX+;K$)Mk!8UFDA|4eZ1*Ezu}87Km5Z#eDNm_@SE8mn$H7o-IXO#+Vsb#pMm^ux2sPd?(&S) zjN6ok>u*!~=Cb_yRG`<-8~mWbWr1J(@^%08=dTQ8TfFNwFLHg7N1gWnWl45{;!fCy-5 ztOcIy35x~6mkC&);)f@Cig7FVF(20a zQjuGIwDGv^zK`^=wc62KEY-1vjnwOqN2zx%_Jmsvma!?%9m#Qf#2L?4CWBX=Qw>&1jnuKkn|@diDP4 z?gv+&(qxc$-S2uqoPZ6e2W#*!AHE#l9qhH0aa*Pzbf4VZ-2D3eFMs~+|L3Xv1tkyrui@b z?)~r?T>bc?E>U`!^Dg_F(?9&`FP^5$&wlS;?w+sn^@}H(_dohc|M=<6{x>%`y#MC? zJAZh6!G|mR!1Mv$?$fT{ULtjS2NzDX@fN{|v}-h(6~`vt)(N&LuPJ3pOy~$|R&-K3 zP4nYeA7>ra%@OT8R%gI@UcZEkJ(PmX0nRr8im+v%1F#Iu@!X8jaK$h%bvFoy2s)GA zVd>B~1QQVx(pfS<06>BSOFYiVTZEdcRreNN*&4Vb#9}%l?mTytvrEP$n+X6O=;~U* z=g`?(!B%52bSzN`Vk81eAqg^rU@!v1QqVyNL}PRccd#H|zEOdCmNf;$*g!p-#o;hv5emLC2tHD2{_QmS)@xy() z+>YJqkmsH@7qpkiA(xOlB+DRuik;wwrAyr+3sP&dj-x-$dV8pMlbxE`;&3Eh9+u1F zn9j~?2tx65%N@^S`z5!z^nXDj91T4z)TP^km4O&}x4a?yjMIdGDd&r$90bZWyi~UH!8ngyFN1Z~es6zNsO9lp99D*X0 z*%L53k~w*tg(Qwk(j&sb5M04M0K#dx!UQhIwoH_O5sVWj$vLY_D6D`G((&Qj{ppFq zNDwTV=M)j-)+R1x#jn!vLrM&As?(mkqFsX;rse^Cx5-f_Zw(|v??th;5YV(ycEd;i6weSOjolRx6@U3by1KYunn`@PTq;0MF+Ki@nr4SxG@|6jiRpMH7w zKLb2)wl;S{-H;H_fk^!3`}F+Zy!Z*e{c`{C)&1?=rL4bf<9|)?`t#N0_kZxSSKq(f zz9gbwwQ^W}b^GPdWuLCf_JZbjxACif3J&guKl28x7pgRq!H_zI%l!6jXE61l`%kVb4NL) zT_?Du^(ILtJqu#6IaF&^r@1^G>;BZnX4QRBxGYjS3`BQBq(usVff38GpTN&D?=$GZ z7k(%aav+AG8-$xHxP=EUlyqEj)39*T6Q7ysOd*K8*o_$c?EZ7LW^SMgUeVej2nHf! zp_CwZlv9Wxra+>Ft?ovu;0<)fTBx=#4UcoOZejc~AyNpP1J)EkO3O=*mczZCbsUSW zK*A zHAuE{bGiA!tLxu=A zyX*1mfBfQqzM5(G?8Ps>z5joYXR4${r;;ziTby%-~$BzW{S@wc3FPo@n}b1VQm9OBr})YX*e!$Db zVyQ|l3j_4;LKDlGIsg-rU@)LC5kV|$LI4YIrq#5zW~vw(YTyBY&IFVr^(dXE4k!f@ zE^sV2w+dFtN6;Cy;ra4S5gcb-DiRYidI}IX3FO5~vq)KSF)*F?)Io%VggtX6+OllW zFF9|RRzPdv9*_con8drYpo44L}GX^ye@<1atN@pv^~e!l%fmlfyd*PBnT zH!lZX53apti)lYkzuy1N-@p66kLByk4JwLX0b%$l*i*=Q{N5k@?*HZ4&Bu5D@vr-| zUO%+kcKVBo|L}5q_0Rv+_dfZ9^>){HD~^j-R9?#NzH8m~lhfb+12z97!#^7~Z+N45 zFxo-ry$xz<2Qe^j6%PMjmIRC!F5^h`<3J{+(|f2!H?(Nf3yF9S}n(C``f}-Oe4De|nBWkS|!DrQJdV zj6@8K#Kc0tywKgeL1?q;R>CwIx{|kug)2==$#RD&!@@OT0wP@U8Ey~;R=ibc4a@?9 zoDsvB&iVl&6mUVzU=olP9Rc71vWq|w3R*}=33xQEAW_#dFjjh$v$@6}9eep826(!2jaqJ%-zpZ8Z;pOLg^V2TYJ$I|sy6X~^_Cz5em#{_yqReYpQ+ z>b8IUyX%ji5h7Nb`D8xMj?nr|f5F=y(&t~E{`~JR1%8mzMJe~!mje!)+Ef1I4o<)5 zyta2Y+P%2`WOaBQcki3PFXpj`{`NP2^5eTdyGU0T-S@xu)BhdxKh7WjAL-*iieLOc zt`7frT@KIZ`a3K5pfDlD`el}sy0sAnlEtzWJxZcGs`CYG3jx zPGeQ{X||%K%$GF^0iUMH+CiW%cA3}Dd#IaO&ycl6z^%)*8fmK)T2Z)KU%%*ov<7!* zkEL`vbSV*YYYLcJfz_}g-zD!>w!7v*5XX5;i7)%h74?0l<6Il*?&gBaZhyRgxqW^! 
zTpjo0tUC1GxO=wUJ*Q7UzWJ={)~CoiRRK#J)Eor|iC)&Tq%Zbg{->}1Utb^p4D$>M zi93pDd;+KmQ!?(x^u2-c=l|F8i$DKd^zDaVe!+)d?yP+B@#gbSE?#aNj;Ha|+IXC% zCmB|0{mJ(GyUmUN;CJr-^2?p#oO`$a#wVtMSIFzTUOwB9+&iQ30kHu89gqF|F33CSkT2@Tw$A3a3 zrhy`%bC7{A99biGHw<&L0M~z_gkl&jTQ5MOrE$mv!hk{q0E7@>q3&*B779_x8kz=x zBOxO>hn5@!f`K}A0A_MxAz_C^ye3#P zU!!a}T_)c0up&uJ8IcqB)QMmsU?Pb`8JUQP!+m!1IZjfa#$M#1zPT$8L++5nIcHl{ z3)RCqPj|<=c6i^x#nsIpovhB|om!E8(`uW?2TVG^+Jn2ixfm|UsTv*F#_>?9NdR-F zOsQM<^zyR19P&+l7zs7HCw?|ixBjT(P2P;=d9_*hU02{mT$Gc(?@v4sP0?oUyH2v0 zRR`O2d5Mg;ch7q@kpP}68`}FwNcmM0J4}aBg3bP1COCfO5$5p~i;?*kF zU(7$R@7}I|{&&AQm=($Y7jNlP)8d}-5;m>eY>CQ zXrT*NZSkczxVu5nGV{SD|7+kFgaUwoiIABx1QQZ^3S>lZThvAl=4N5h%*+?I^4S3x z2oxLyOiau~$PCB;Ov_d|@En<#n>AH;i`GyTBEDmKc9=WM9ZDA@$mg1idqY!*8m*x= zSV(n2fD}u_&k(^}7#NT_0+1zzBuE|LmN*ftIbS5c7TI=b*L4?6IY|-(Lt$ZNMg|N) zUf{SAgaNa=8Xaaki0XKlCETw1L9h$S=;TdBkJ!1DsXjf3G-yRQ#TC#vsKBY$xyo0)o%S_#c30(_06@cQvygbWTq67FumXZ zw!E#+u0LfOD301lCwqDMv9~t2@R`%9yWGCwQen*06A5Gm@?>bzF@lX{9J*u)Tg-p| z&42v${h#>!J~3C=bSc+KJz6HvrlU>5+;V-K+5^we`(gJaEmnIRp0ah%1fZBA_faEZ z;Q{*)hyiUr%(v;NZTtM$?*7}4c>D6*W@Euv41%H?YFP{yVo1O>eKebH(M0k{Gj!A zfWs-YF(_26b8XXH?~mnfYDcqb@Ldpdxv<7L)(?QO?7WtsegptBC4i1e5E3pLKO_kB zz)%J7P&W-%U&@iV49gewD=z1LKcD>_iHVSicPT z5sNX0@RCh~rDCrYyizOP7O?X<>kxSEc>(hIZW9852~vi<1sYJc#66`o@fP?Z<;#>e zIqmwCm=`NK2mk?pryy8te%`_qrlQ9RPg8r4VGE~2AvNdP+`~|?YY7uYE2Y(zxV+e| zE{5kHz53J#y}SLofGQ|D^(jY}IEJulQjcB+YC9QpZahHW1zOw;&#$h(pJ?c)=cMBO zkRxTORuo|_I^o<7zCUR{3>C#}oW_Yf?N(W;o?7j)L{IKz8e7H)bDbjfL#lz&QRaFa zO(0frn9Aw+=8HG~`5(Ud_j~h9LKF37%^QPXN8eHRxx3Ih;ni zyhtluKg9!z5dhX82Aw zr>|On3BUi(U;gi{YrX&ZkMMp{`n%iq50($N?_R-4(~I5p%hlypY(4E-WmK$=tt2gM zCE65I(c`fkPUZ2aN7ZU!K*-^N=yu11TJQnQubJv|a)Pz<{6-4rp;s z_*g`kw?*g%I9j*-47lPyolKtQy62*IjOQUdNFu)?>K{ z>S6tI{k=~i4s+MiHDgE@-;j!$s`MyqfCPl(m zyd&u^`WJb)I-ZW0Vt;s8Z+97T8Q=cu^`9KeuRWD&kf~!K3SLo{d#+}LfKf~pIOQIV z+wE*P$D3b%_4;PhfBz@bZbe=405x|G5jSY6Oz4mBR8J>M@NnZFUhwAY`jK>>*u0wV zZ~O5d+95l%SfiF?PU%X>AJXG5{Hqtkzkb30=O=5l{pC;h?T>GdkC|_~d@9F&eRww> zZ}suVH{b7h;0_#4B0=mHW>vMdTBc(;p2~4<70{6oBOEatmlYAXFMHf&u7bD(?UAD+ zl!ycX7>rCD6cFGJ;pkxj5MYiL0AapRch8a0f9eY$B2W+lBjLh8Bmi8xgY=DWL^h19$Z0?=hWwz!Bg;o$- z$QrX?bdn_FkW)8Em$PIM0_Jc;XINUC?ig-X)GO#@YIZ7inCpFUtuf2tq?o!D@gRZJ zWuonnpW27h{O<0EGE6AdhRz1bpcBfP`0 z71;3R(qH7XQ>(T5I$sYfP6KgBE#;oIoSq&Y-|@u?*~>fw*4zCwPp8L+caka#A8DFT zx?Nws9rtscb6>HQ03)Pov0mp<{Gt5j+uL6s;%PeW`CSZoQ;xUC_{Q^*ghCnI=2}dW zAh=y}C{`!qSb?I6C!S(L5-K^SE1M2|8OuH@#XEg)IsZ^`RVPA!I ztNN3ReEM+vO4`d~|Lco(8+DEN=(m@v?K)DkYHZWxLw$PqsD8fN{BwvuK0LnpT^RqJ zZ-0BY|Mhied2M{^x7DS7kVBvDFT1{9u}TZq3bnNHR1Z(};aK+7N{jCltN`i}%j|pE zY%O~tAV+jxRIP%xuc&;Bzj5deq~f`}J)5h5bX^6V!J zpzt6yZ)T%ub+y)_)^G!i2mwxzdhdIZ-Z=qFEO~4Xa)l-5TxV!AT8ZXo#tOliTojCi z0TPaY2_gxk47823LEKOpgam~Xr^G37CI(_&w!s<>stUDQF$gzsHT5=o)plw&&3aTF zA;K7>Pb-kk)tXYY(Z%fk5OGAOwj>01_b}kS!%F2#6)>0B+&U++gWCf&(oGo``^8j4-0bMu|&N zG3ZP-kMJ5ghgDP8idHGw(xf*^n0m-Pac3z(VnD((Ms(phFAeuuYF#eh2o4Sif)POk zvD}6wkV#TU-HOsR<1^~Egc~GbUm@173eT=K0xOTcUk?LI>brr(=c(Te+sjvSdVaFz$G5kq zuP2>6JV8%wok%B{v{kGm1VK=Xn}c9RW61+k#9Te;kT$mABwgyH7jh0HFH=PX%u(ai zRNMiGbIPHUH%HO3D(`vD#y55I`zQR3P45(QzeWwi(p;EQk`**7RrlN1zbt?D>h@=! zt^YUZPdR`1Sx-N2_HYrqUE&_|x?kmv2Sg;uY+=<}vFX?zp6bJ>$7;<2JpzCn93UBZ z*qP45373wbdKe&J0y-y*5fH)95+Gu*BZ5PiEi)fCcV80!y1ZBr2*@xHD8c|FkdT+K z#8N7QC1I={Rb%$rS_5s=G~5vgSWq$y8FGiYqa;SOq}|9otU{|%3(l2VjizqPM;|a2 z8@HTOpCCdhiR7HOnXWQj6K_N=lXS=iXLlJP1+oW(hI&k1YvWn#X|jV^(=ZDI3)8vk zNo{m63kzcKUP1&|>SzH2^Dsoe;nhRjwmAB9FEb3AkA=R+6Yk4_VEg>ya+1U0sl;&? 
zcrzcWq=p=&o=(TRXU}go_##@~4m%aD*4DXa%-uS&^mo&=KR%x7#D?=>e>xn}27}xg zM!k1iOTmG04a0W0-bt?z>akkknzDErHDrLk%j+~iE0e3EON7tUG`*Xq_w{I$Q|5$) zRS2^IC`LdO10ocX@CdbJr>^|GeUqNHAKiRzwnqEnv&-Ed z(dJj{4%ZuAZSrc>(F$1z!c5h?nIEU|I3AAmRJ^%kKm@R<83bZm{;;GROaT#zI0(tv zLm-5J7?KdW5iDsPBtDC;>AU=ysT%m=z4(s$fCOmhgcumavG8?}h!KgGwpmzc^Xg5h zwK!9qA($z5kvr%+kQ7;*&Upt41#hm6+U#wDTA``8CBcurFddN%fRKZdaF)!Q+z$Y4 zK$5>5T{2&G{ETFqs7E6VMVZaj85y#B)0m9Hw0J8s9NRorJ!n)7b*tcEFjXHtba@I0 z4afmu0_J^B8qhgOj**YGJ&X@;^!Q=d-B|mb{q(Uv??1Wv{F#2dKeWl8UfpPawW99g zVz;RotKUrU-L!D3iX|g*V@1Uj;%k^r&wGaK}X+8-8 zF*oO3>b8A9?&tQXd6?=5NI50A-dqbX4T%z*8RgnyZ^`A|K-bvKdX3O zz`FqEa8ctEvTK7(SsBC39SEQqKqxjM>-8j%2e8zfF-zx1aI`YlBDM67@22}-J??nl zbryX-KHsel+x2mCC3#DE&Cmt1hL`4Zn~(M3Fg;B5P^{`XtxS5pqk^}kSOP)7ikJc* zMNrt8>b%TD4Kx}A2tYc+dzM0ogX^-i(l7wG^Ir}i2*NH$)PU&bl7b0MbzsL}ps@Lmt+k6>oFYf~`hdVnXNWP9P&Cq-FLa#EBDi zU0x@?5ZOxFVp=nHfb8KOGx%&#LWdyEO#;;1RG(TqOgL&gwm6wj?&>i|xKWK*95G!` zXCMJ3!erQNeHOL3gU%*9wh>Xo!`-|53WuLPd+`VRI6U6|!gGI!&F1El&8m~*srfWp zdh_AVi{d1ddyh5d3juKNKYB6Lhtr$mdmhrsv5W?DV2@WXpABi9+exR?T-sbj2FafK zblR&y-lUD>6?UCu=|s#?f|&AU4`vNrJCQ!`SnB=Z_E+!z?pGiFY>u~T1uX@EVNX&8 zbv`u&iOduhKtZ9#giZipB`AzC0TMF^2veP9l8V@hS8(M3rbJAE6c$LtD6kkILR0Or zcpKulDZ62iQC>aGH{QQ(HRJ9gUCcBzFzbw*VNTdOfaw@--`Jqe$>9U#PIh@C za$%4hxp>rYg))`HR1T-QpL{wWd;CoFM>KtRLZ=%XSSz-K)21&Ap;EGQ|Kx2Fd4h zzho2vVj{BTYT<^a-b%EJtwQy%SaN%~_~(fNiGYCwSx{0hc|cl$tVq@*10h48VY3)5 zPS6T~VwizBc{pe*tv^8X>kkiqbvOM+iy;5NA364gC&;-sF0;i7^NXwDFi-Q{G*E|F zadXkbB_(^bnLL$j-WpPyqxlS{5||8f|hg-F0DljqQn>kj7$(2me^!aIAAC^A_ZCkU<^hfi?C?!#e4#* z9vU7F%X2t*v4r7wupZ(}l8`wihJ0qJgIP?`ikh_srGaWh05D3%u8Xe2zL%6BF%m7% zI12?;s1>S0teK+<9p8S?-|i2;opqEh3pgQf%l4wXeqSEU8|5Jr<~0{}wWzgqk^o$5 z0U)7Jv8ZUxSx{9IOUHdmnOv)blbW?4OgHz_ zI;5%WAD_NScmm2-yMH10W1aGQ z#YXFp%We9&zkX(ga`%xyE~f)L?RV>&O$-Oxt}fzo$n%KLhk?|7{rc<2`D8Go10$Tq zr#g>KBIjY3y2#kC*Vx4xH`M0Rd2RL1j}~j)PphbxtIKun$Vn131l1}cPJp?;J4~++ zzy8JT|L>dqe}3}UMkY;C_;vT}k3ajjL%RIaFaOim<<}a39GRGT2x3_$2(Hr<#rB14^7s6Y_krBZx#aH4P*Xq!16N9zn_=K+*dC0pNJOhWm+&!wFx`^E|$t zQ@1lSmS{+>mTEMpla=`dUhmo)PMlr{?h@q8sjy%;7+_;N&8NeBI_iFmry5nm0U&_D zVaf3?TiG~sn9&gcIIweKaQ27@!|?N_03|?>6GjKf5biNsn?p-ji|`2h4v!d%BN#9M zNCXp$Bw;y~;$$}6az{4OKgf;*NBLwMF=4m@np6%A+xLWD-;h55<-+hk>F+EKW(^meQ^@ebi z`6Tnx-lbVPq0+^$x_k4e$F7J)Gy^W?bw9Qk8>$gb$NS@)he3<<3hj8< zPjWec8P8P`_ruDh>$>44t+A1}a%yE5I)_XE1Tr@pJKxWTHy^%wJO17Khrb%_olEIr z5Kj<|Q&b6^9~@MW%zy!F12Gio)_M`aWKyq)3g4=SGU7Qef#3w^U(n76CXP_xs%@$|&*L;7=XR`GG#mn#w)Zmp2f!uh2Ke2R8kmT{IfB6$ zse!vM6)0r%E>VW)5%R(|)YeR!n}HiG9m4b8H+Tu0qDd){N0~|MYO=t zUBff6M4*OSG&BdS5kVnl1mOW@ZC2M(0W*X?W}JLvMohpync_5&mH7|{NYZb&yDhJ^9-x$2ZF**u zHyJTYqC9wbguZ)vH=gD@7;m3`^EiI_=J>1o`fZR446NKtP3l8=|JSeoBe(7@Mi?k7 zAqp@fv2zqJ-e9y73b71A%x#*zflaMebHkL;N2W_w&fUd4InaTGkXmaVzz|?+G)b16 z(h$wu@rdHjK8g}u_vVx1!>31ocL`&hq#*$X%&jyx5=6q>ZF=k^XOy)eFy)S@Ia;xH z^77bDV|zI0{S=cg@iwN=8-$fHqf+>+ zR!nD&0-+8|+RVWr&}rH9A`n6%&Lo-pM43QniAq2TD`rJ&D+;ZF+9Hx9NtiqA2I^8I zCO)qUJRH5bRwxx}g<8F7=)yuj?;8RMh@E)($^tGkAyfc3zdLw_U{_SYz_18HT98DJ z3J|6mp>AdoVM{Y+DY7xq0)+Q}av%pTcwj{U2g48y6qBBQepK}9D8$Hc4yZV7sU)Syi|6|L+7frDhAL3lVc4HCjo zD|Yg}W8sw`ZP&x`G%;BeLjvMvh+wGNkZP@vfRYS{3XjzjZzx^#Z$Eth9)9t0e$1uK zCuJ?xR5fA3BqF)r?$)ck?sh}Y9i_w#qcbpVZ7zNq$CI8)9IIDH07pLy&MlmY0Ljk< zsc=AK6hvN3rie^H?oLP$34xh5B5R4Q1_Vvb-CY3`mQz)ryPw%<=h#1!xS%8^k<22D z2^nb_2&%a?YpNBspfz*=B2O8+9#W1hA;idpOVrcJ6--Hs>jbq*RW}WF@W3EK3?xE! 
zA|!OA^CBC;7Nj$fMYtKz@(zTBbjNu1+Vg^cG1Fzn9SaI;p&B6)8UX<;Xgr6-+k~{h zCt@MR5V(b`xjW88zCWCfcW?f3|Mczk$G?lb?OzR#<+eaD)Om7An38mTV$ovFHH)Y2vdfus9%xvtF1CGg zk3c%khdRuU_eGo`sRxxfJlSs^>KET0|8cM1Qitg3=tv9n($F!&*()YgBMc(2GB*WB zDnnvqOe`G_b)M_o4m1r{n+zA2*nDSJPRG4BHNQXLLApzbAm~~N$8c&(dIp%7f}k1_ zQ#ZwYAZddL?c~L$@a?ys9&!DIkJs495l@CR42h)+h%9N{4Sm;h$|MO<0?5M0YWp@l z%?}5?8{3mcWdcNO8sP2-$Umfe{4*miWF$kPA{07&r!jHAUiZb6{7G zVqV+|Eb-|80AGM72+IzQm!ppmF{VXH2hq%D^~tO>Lse)N7J%eQfI5>NbB8Gr2_f=g zTXI#cP%G&SwPLHGE!@Ha7G53DxvuQr8Ri3FNpE`qLbwxx17e^p-RAQaBNmY&yL~so z3O~P70)m#Z0WjhDRVj$EBmfu1>-m@N02m(Ta`(9RmOu{U{8s0$uU13XE?$1}v$VRt zoA*WMK5f^OIv$v^miYnpcOO1fszQadVy4R8^$9_pV*(WFGYe7YE-b`o(m6$6TJ%|% zyCmd7UB2E7yEP1iczO9U=4lP^ROFm(MdI(3#k11Oc(^!-AIc z%5u;_Uu44oP6)X8$08sg%oY~VKUtaJ9N7+LA+|+^wq&4SnH&e6^QZw7V2dgKRQ7qZ zYUAGVJz8PB8u$m#U;gy%ey`0TXaXzsexR1DgP@_muE+gp+@IQ>jo_jaNvIMaS=MXt zF7Ya@qTt+ARgsWorL*@OU3gPn(RSDKrn?A;@^reodAY`6H(d4%o9?u`-tJF_`E+=i z?uAY9aDc;`_T|I)>-+gzhq)6V3jjBZ3M7&ky;TLOEKCZB#K5K2oDdBJn3s`wyPJ;Hj0d&J89o ziFw6i+x5HcaAP^E);hP;i7>P0PB(}&aE?O9dfJa8GY_kwm&~gqkyZJklb$vbT%SM& z`Q)P);1sIcivh>9?Ye!;`*KV;TwT3DZ4afM?ET&G?V;WQwFH5|rWIT`Q*EXV08p~H z6BqzON;{ug4{xl0-&_9uKH_|%>Bw+ zbVKJ&=HrRnII9CQn-qbJt%num^(cC^KD{2FUv?L>cxQmf8r0{0wc4i3OskYR3$i5^ zZqUYt;~bND(^>V@thxt~M3^r@A0a^VrK}qOi~$mW7A%4SJ&J%6fjeYG02~qx5*-A2 zU_^CS(-Kz9)!p2~JUA97JW&7<5`qvVN-0sIOf0nAro$JMjW%e7YQv@g;J`?Nsl%K+ z3sFL5WCQ>TL-VleI>TJ4R$P1*X0+gS&oW5_Bw!}uKn!BQB`k;jotScA6$LGR7h3f2 z=*uC-!Sy`3J)iGkxmFl669wYAMR0c0oKHQA6z%*L0Y?OM1WA1^^{D(b$K!a=PcD9E z!q2f{!?=4R#c-Ru?yu`+BnVL?3#< zUJ_yw=miOZ1vBIF`&z9XeSVtjLC0g6>HrEE$ zptL9rEL;#Efeg`g*5%-Y#OW;Yb#sf>T<0hiTccJ(8>ol5I|6_&UpPug49gT~(K0P4 ze?(w}AYejbWCDO7WB@=#Gjl_52w%2a%ixHXe;z27@CFbpVpbqr?i1%*b$sV>U8)fa z^2JSZ-;eV_t%7Ms^zzv&-ve!BhN{At*H@A!T?LEb%6RS)Y6{S7*c>OO!5zB1O-x9_ ztIhN6W@{UW*{1n8Jpu`?2k_Lbb}JbQ6vQs^DjMem56647p@-C`{`R!*7_CwZZFN4( zr^oT`{o(!N>0R?ZT4iR62nUi7VNmIFrVu1hrRD`&bBaUp#EnVCMUZ>!1<0_N`w|-n zQTNF#2vZn#lB2bz>W)N6f#Apj;mU*wgmI_<-a4@rf|q%!tq~8ikDc003^{PON&`!mrDGOK1W;@qD;%{R>$soZAM3kGk3cMoqt z=om~69>Ic)gyF7nHrUa5ib%oWps3aimMw3T<&lX1yext`Sb&! z286@xR#Z(DYvDF?cpxw&mrPxt_4cLI#VrJE0}?Pr{@Md-&_cZ zXi;$kFe3T+J-~<&#K4S%i0BwV81Cn^6E0i61p~BPnU|0IA`0UG3|f>0;Rwrjhan8k zko4vHfbKy75Cp0sxq}Rn+v(}>>gGp^-H-p^_nzJSWM97d;=_9mU^AxE1am-G#A*DH z4!y8c2TW@=UiVoTg4%SNP=Wp*WS|VB<_)mA zVQm0&BBrzgYXnMUt(5RiLZ+BVB#{yK8LL4MaDY~=ppm4_+!DYL;V8}O>?JVRI^r;d z0jeJRgIxV~+J8RU{=;K8;@tb1WJNe*+2+et+OB2O5eC#nfG`9qK(pp<762IG;by9;LL7Q~CrBbGCnh4~1+#2wZH7{y zRGh1|!m4HgXRIy)GGPK_AV%asfyI%*j0*+=D438KXt@X@K>%7TKjmf5z1074p?fV4 zdPV|64(7A6FK}5G`m*U*Fi8tS?o4`g4zC^o4l)aU#;resaw7JY_)#gH#?ph zOvM9r_KK}vZ_x2nMhk1=T^g=n>oE9mQnOn2-jaG7da-dv=rVMDpK9%h(V@+g59?v@ zfhqcKmAM~wnUR@^8@bqY5r%|?l)NF`AE)Dd_x|wJ-Td1YcVUejJtWZ34H}^W%94nj zwE=NsNrKcx1{4i)HKhg}9MukW5=9~5um}_i!Wu*(G@4$8KNUn&cWT~q-&&k@r=O3r(md=R9_Qn0^Es`>YwG z*i5H+dN{s)s^2ynnL`L8K+O@C&8|n7E?ou$Kn(Q|CPWSa+Yml}mM7zA^+X*d^-{rjAk) zW)VosBEi+ILRD;y+KhD$)o6yxx_Nm+5d|X(1(6FI5)nHy5&7BXOU$Iibk6Ub4F(Vh z;1urR1mMdV9+$5O8zCVDF%dc}-2+7OGckiO&V1zkN1 zjge|A7FJ4Zx0gMxwbaoLk|?*niFz2QgEN`0B;}Ommbi<+GTB4(_usz#o7dCdp2}N{ ziVo)N;YbL~Zh{QWs@00n-}0S(-1)!;0s=n3+g ziQF5g=^R32$cb3Q&D=R*0uOKnZ*>mIc{6OhT6J~s*%W~blw0y2!I8)JuS7KANxa(m264EBW8=1=lu@8U)0i}(&+GJJ zExm<{f~*o3W=!~5yo zx9|V@{lni+_7ep4jut?ZY3?yR)b(B0?>dKUdjYS$qa;6eO_e18A|wEEb%OvR42NK3Kt~HD zWadPa5hQ{E+&q?Po`r#ffKbn90O$lx$jbt>S)r&N3Ob`XEmT}YAP~fqgprwqnE?@? 
From abb89da4de3b9196933d9a885db822d60da18cac Mon Sep 17 00:00:00 2001
From: Prathik Rao
Date: Wed, 31 May 2023 02:29:04 -0700
Subject: [PATCH 057/199] update code to reflect latest changes as of May 30th (#3616)

* update code to reflect latest changes as of May 30th

* update text to image example

* reflect changes to textual inversion

* make style

* fix typo

* Revert unnecessary readme changes

---------

Co-authored-by: root
Co-authored-by: Prathik Rao
---
 .../text_to_image/train_text_to_image.py     | 284 ++++++++++++++++--
 .../onnxruntime/textual_inversion/README.md  |  14 +-
 .../textual_inversion/textual_inversion.py   | 230 ++++++++++----
 .../unconditional_image_generation/README.md |   2 +-
 .../requirements.txt                         |   1 +
 .../train_unconditional.py                   | 158 ++++++++--
 6 files changed, 566 insertions(+), 123 deletions(-)

diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
index 61312fb3a4b3..a5bfbbb7b12a 100644
--- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
+++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py
@@ -20,6 +20,7 @@
 import random
 from pathlib import Path
 
+import accelerate
 import datasets
 import numpy as np
 import torch
@@ -28,30 +29,96 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
+from accelerate.state import AcceleratorState
 from accelerate.utils import ProjectConfiguration, set_seed
 from datasets import load_dataset
 from huggingface_hub import create_repo, upload_folder
+from onnxruntime.training.optim.fp16_optimizer import FP16_Optimizer as ORT_FP16_Optimizer
 from onnxruntime.training.ortmodule import ORTModule
+from packaging import version
 from torchvision import transforms
 from tqdm.auto import tqdm
 from transformers import CLIPTextModel, CLIPTokenizer
+from transformers.utils import ContextManagers
 
 import diffusers
 from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel
-from diffusers.utils import check_min_version
+from diffusers.utils import check_min_version, deprecate, is_wandb_available
 from diffusers.utils.import_utils import is_xformers_available
 
 
+if is_wandb_available():
+    import wandb
+
+
 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.13.0.dev0") +check_min_version("0.17.0.dev0") logger = get_logger(__name__, log_level="INFO") +DATASET_NAME_MAPPING = { + "lambdalabs/pokemon-blip-captions": ("image", "text"), +} + + +def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight_dtype, epoch): + logger.info("Running validation... ") + + pipeline = StableDiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + vae=accelerator.unwrap_model(vae), + text_encoder=accelerator.unwrap_model(text_encoder), + tokenizer=tokenizer, + unet=accelerator.unwrap_model(unet), + safety_checker=None, + revision=args.revision, + torch_dtype=weight_dtype, + ) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + if args.enable_xformers_memory_efficient_attention: + pipeline.enable_xformers_memory_efficient_attention() + + if args.seed is None: + generator = None + else: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + + images = [] + for i in range(len(args.validation_prompts)): + with torch.autocast("cuda"): + image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0] + + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + elif tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompts[i]}") + for i, image in enumerate(images) + ] + } + ) + else: + logger.warn(f"image logging not implemented for {tracker.name}") + + del pipeline + torch.cuda.empty_cache() + def parse_args(): parser = argparse.ArgumentParser(description="Simple example of a training script.") + parser.add_argument( + "--input_pertubation", type=float, default=0, help="The scale of input pretubation. Recommended 0.1." + ) parser.add_argument( "--pretrained_model_name_or_path", type=str, @@ -110,6 +177,13 @@ def parse_args(): "value if set." ), ) + parser.add_argument( + "--validation_prompts", + type=str, + default=None, + nargs="+", + help=("A set of prompts evaluated every `--validation_epochs` and logged to `--report_to`."), + ) parser.add_argument( "--output_dir", type=str, @@ -191,6 +265,13 @@ def parse_args(): parser.add_argument( "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." ) + parser.add_argument( + "--snr_gamma", + type=float, + default=None, + help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. " + "More details here: https://arxiv.org/abs/2303.09556.", + ) parser.add_argument( "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes." ) @@ -295,6 +376,22 @@ def parse_args(): parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
) + parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") + parser.add_argument( + "--validation_epochs", + type=int, + default=5, + help="Run validation every X epochs.", + ) + parser.add_argument( + "--tracker_project_name", + type=str, + default="text2image-fine-tune", + help=( + "The `project_name` argument passed to Accelerator.init_trackers for" + " more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator" + ), + ) args = parser.parse_args() env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) @@ -312,13 +409,18 @@ def parse_args(): return args -dataset_name_mapping = { - "lambdalabs/pokemon-blip-captions": ("image", "text"), -} - - def main(): args = parse_args() + + if args.non_ema_revision is not None: + deprecate( + "non_ema_revision!=None", + "0.15.0", + message=( + "Downloading 'non_ema' weights from revision branches of the Hub is deprecated. Please make sure to" + " use `--variant=non_ema` instead." + ), + ) logging_dir = os.path.join(args.output_dir, args.logging_dir) accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) @@ -366,10 +468,34 @@ def main(): tokenizer = CLIPTokenizer.from_pretrained( args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision ) - text_encoder = CLIPTextModel.from_pretrained( - args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision - ) - vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision) + + def deepspeed_zero_init_disabled_context_manager(): + """ + returns either a context list that includes one that will disable zero.Init or an empty context list + """ + deepspeed_plugin = AcceleratorState().deepspeed_plugin if accelerate.state.is_initialized() else None + if deepspeed_plugin is None: + return [] + + return [deepspeed_plugin.zero3_init_context_manager(enable=False)] + + # Currently Accelerate doesn't know how to handle multiple models under Deepspeed ZeRO stage 3. + # For this to work properly all models must be run through `accelerate.prepare`. But accelerate + # will try to assign the same optimizer with the same weights to all models during + # `deepspeed.initialize`, which of course doesn't work. + # + # For now the following workaround will partially support Deepspeed ZeRO-3, by excluding the 2 + # frozen models from being partitioned during `zero.Init` which gets called during + # `from_pretrained` So CLIPTextModel and AutoencoderKL will not enjoy the parameter sharding + # across multiple gpus and only UNet2DConditionModel will get ZeRO sharded. 
+ with ContextManagers(deepspeed_zero_init_disabled_context_manager()): + text_encoder = CLIPTextModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + ) + vae = AutoencoderKL.from_pretrained( + args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision + ) + unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, subfolder="unet", revision=args.non_ema_revision ) @@ -383,17 +509,81 @@ def main(): ema_unet = UNet2DConditionModel.from_pretrained( args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision ) - ema_unet = EMAModel(ema_unet.parameters()) + ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config) if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warn( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) unet.enable_xformers_memory_efficient_attention() else: raise ValueError("xformers is not available. Make sure it is installed correctly") + def compute_snr(timesteps): + """ + Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849 + """ + alphas_cumprod = noise_scheduler.alphas_cumprod + sqrt_alphas_cumprod = alphas_cumprod**0.5 + sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod) ** 0.5 + + # Expand the tensors. + # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026 + sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device)[timesteps].float() + while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape): + sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None] + alpha = sqrt_alphas_cumprod.expand(timesteps.shape) + + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(device=timesteps.device)[timesteps].float() + while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape): + sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[..., None] + sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape) + + # Compute SNR. 
+ snr = (alpha / sigma) ** 2 + return snr + + # `accelerate` 0.16.0 will have better support for customized saving + if version.parse(accelerate.__version__) >= version.parse("0.16.0"): + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + if args.use_ema: + ema_unet.save_pretrained(os.path.join(output_dir, "unet_ema")) + + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) + + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + def load_model_hook(models, input_dir): + if args.use_ema: + load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DConditionModel) + ema_unet.load_state_dict(load_model.state_dict()) + ema_unet.to(accelerator.device) + del load_model + + for i in range(len(models)): + # pop models so that they are not loaded again + model = models.pop() + + # load diffusers style into model + load_model = UNet2DConditionModel.from_pretrained(input_dir, subfolder="unet") + model.register_to_config(**load_model.config) + + model.load_state_dict(load_model.state_dict()) + del load_model + + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + if args.gradient_checkpointing: unet.enable_gradient_checkpointing() - vae.enable_gradient_checkpointing() # Enable TF32 for faster training on Ampere GPUs, # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices @@ -426,6 +616,8 @@ def main(): eps=args.adam_epsilon, ) + optimizer = ORT_FP16_Optimizer(optimizer) + # Get the datasets: you can either provide your own training and evaluation files (see below) # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). @@ -455,7 +647,7 @@ def main(): column_names = dataset["train"].column_names # 6. Get the column names for input/target. - dataset_columns = dataset_name_mapping.get(args.dataset_name, None) + dataset_columns = DATASET_NAME_MAPPING.get(args.dataset_name, None) if args.image_column is None: image_column = dataset_columns[0] if dataset_columns is not None else column_names[0] else: @@ -549,10 +741,10 @@ def collate_fn(examples): unet, optimizer, train_dataloader, lr_scheduler ) - unet = ORTModule(unet) - if args.use_ema: - accelerator.register_for_checkpointing(ema_unet) + ema_unet.to(accelerator.device) + + unet = ORTModule(unet) # For mixed precision training we cast the text_encoder and vae weights to half-precision # as these models are only used for inference, keeping weights in full precision is not required. @@ -565,8 +757,6 @@ def collate_fn(examples): # Move text_encode and vae to gpu and cast to weight_dtype text_encoder.to(accelerator.device, dtype=weight_dtype) vae.to(accelerator.device, dtype=weight_dtype) - if args.use_ema: - ema_unet.to(accelerator.device) # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) @@ -578,7 +768,9 @@ def collate_fn(examples): # We need to initialize the trackers we use, and also store our configuration. # The trackers initializes automatically on the main process. 
if accelerator.is_main_process: - accelerator.init_trackers("text2image-fine-tune", config=vars(args)) + tracker_config = dict(vars(args)) + tracker_config.pop("validation_prompts") + accelerator.init_trackers(args.tracker_project_name, tracker_config) # Train! total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps @@ -639,6 +831,13 @@ def collate_fn(examples): # Sample noise that we'll add to the latents noise = torch.randn_like(latents) + if args.noise_offset: + # https://www.crosslabs.org//blog/diffusion-with-offset-noise + noise += args.noise_offset * torch.randn( + (latents.shape[0], latents.shape[1], 1, 1), device=latents.device + ) + if args.input_pertubation: + new_noise = noise + args.input_pertubation * torch.randn_like(noise) bsz = latents.shape[0] # Sample a random timestep for each image timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) @@ -646,7 +845,10 @@ def collate_fn(examples): # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + if args.input_pertubation: + noisy_latents = noise_scheduler.add_noise(latents, new_noise, timesteps) + else: + noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) # Get the text embedding for conditioning encoder_hidden_states = text_encoder(batch["input_ids"])[0] @@ -660,8 +862,24 @@ def collate_fn(examples): raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") # Predict the noise residual and compute loss - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states, return_dict=False)[0] - loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + + if args.snr_gamma is None: + loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean") + else: + # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556. + # Since we predict the noise instead of x_0, the original formulation is slightly changed. + # This is discussed in Section 4.2 of the same paper. + snr = compute_snr(timesteps) + mse_loss_weights = ( + torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr + ) + # We first calculate the original loss. Then we mean over the non-batch dimensions and + # rebalance the sample-wise losses with their respective loss weights. + # Finally, we take the mean of the rebalanced loss. + loss = F.mse_loss(model_pred.float(), target.float(), reduction="none") + loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights + loss = loss.mean() # Gather the losses across all processes for logging (if we use distributed training). avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean() @@ -696,6 +914,26 @@ def collate_fn(examples): if global_step >= args.max_train_steps: break + if accelerator.is_main_process: + if args.validation_prompts is not None and epoch % args.validation_epochs == 0: + if args.use_ema: + # Store the UNet parameters temporarily and load the EMA parameters to perform inference. + ema_unet.store(unet.parameters()) + ema_unet.copy_to(unet.parameters()) + log_validation( + vae, + text_encoder, + tokenizer, + unet, + args, + accelerator, + weight_dtype, + global_step, + ) + if args.use_ema: + # Switch back to the original UNet parameters. 
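The Min-SNR weighting wired in above boils down to a per-timestep loss weight of min(SNR(t), snr_gamma) / SNR(t), where SNR(t) = alphas_cumprod[t] / (1 - alphas_cumprod[t]) is exactly what `compute_snr` returns. The following standalone sketch is illustration only (not part of the patch, assumes nothing beyond PyTorch): low-noise timesteps with large SNR are down-weighted, while high-noise timesteps keep a weight of 1.

```py
# Illustrative sketch of the Min-SNR-gamma weight used in the training loop above:
#   weight(t) = min(SNR(t), snr_gamma) / SNR(t)
import torch


def min_snr_weight(snr: torch.Tensor, snr_gamma: float = 5.0) -> torch.Tensor:
    gamma = torch.full_like(snr, snr_gamma)
    return torch.minimum(snr, gamma) / snr


# SNR = 20 (low noise) -> 0.25, SNR = 5 -> 1.0, SNR = 0.5 (high noise) -> 1.0
print(min_snr_weight(torch.tensor([20.0, 5.0, 0.5])))
```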
+ ema_unet.restore(unet.parameters()) + # Create the pipeline using the trained modules and save it. accelerator.wait_for_everyone() if accelerator.is_main_process: diff --git a/examples/research_projects/onnxruntime/textual_inversion/README.md b/examples/research_projects/onnxruntime/textual_inversion/README.md index 0ed34966e9f1..9f08983eaaad 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/README.md +++ b/examples/research_projects/onnxruntime/textual_inversion/README.md @@ -53,7 +53,19 @@ If you have already cloned the repo, then you won't need to go through these ste
-Now let's get our dataset.Download 3-4 images from [here](https://drive.google.com/drive/folders/1fmJMs25nxS_rSNqS5hTcRdLem_YQXbq5) and save them in a directory. This will be our training data. +Now let's get our dataset. For this example we will use some cat images: https://huggingface.co/datasets/diffusers/cat_toy_example . + +Let's first download it locally: + +```py +from huggingface_hub import snapshot_download + +local_dir = "./cat" +snapshot_download("diffusers/cat_toy_example", local_dir=local_dir, repo_type="dataset", ignore_patterns=".gitattributes") +``` + +This will be our training data. +Now we can launch the training using ## Use ONNXRuntime to accelerate training In order to leverage onnxruntime to accelerate training, please use textual_inversion.py diff --git a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py index a3d24066ad7a..7ff77118c38e 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py +++ b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py @@ -18,9 +18,9 @@ import math import os import random +import warnings from pathlib import Path -import datasets import numpy as np import PIL import torch @@ -31,6 +31,7 @@ from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import create_repo, upload_folder +from onnxruntime.training.optim.fp16_optimizer import FP16_Optimizer as ORT_FP16_Optimizer from onnxruntime.training.ortmodule import ORTModule # TODO: remove and import from diffusers.utils when the new version of diffusers is released @@ -55,6 +56,9 @@ from diffusers.utils.import_utils import is_xformers_available +if is_wandb_available(): + import wandb + if version.parse(version.parse(PIL.__version__).base_version) >= version.parse("9.1.0"): PIL_INTERPOLATION = { "linear": PIL.Image.Resampling.BILINEAR, @@ -75,14 +79,92 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.13.0.dev0") +check_min_version("0.17.0.dev0") logger = get_logger(__name__) -def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path): +def save_model_card(repo_id: str, images=None, base_model=str, repo_folder=None): + img_str = "" + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" + + yaml = f""" +--- +license: creativeml-openrail-m +base_model: {base_model} +tags: +- stable-diffusion +- stable-diffusion-diffusers +- text-to-image +- diffusers +- textual_inversion +inference: true +--- + """ + model_card = f""" +# Textual inversion text2image fine-tuning - {repo_id} +These are textual inversion adaption weights for {base_model}. You can find some example images in the following. \n +{img_str} +""" + with open(os.path.join(repo_folder, "README.md"), "w") as f: + f.write(yaml + model_card) + + +def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch): + logger.info( + f"Running validation... \n Generating {args.num_validation_images} images with prompt:" + f" {args.validation_prompt}." 
+ ) + # create pipeline (note: unet and vae are loaded again in float32) + pipeline = DiffusionPipeline.from_pretrained( + args.pretrained_model_name_or_path, + text_encoder=accelerator.unwrap_model(text_encoder), + tokenizer=tokenizer, + unet=unet, + vae=vae, + safety_checker=None, + revision=args.revision, + torch_dtype=weight_dtype, + ) + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + # run inference + generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed) + images = [] + for _ in range(args.num_validation_images): + with torch.autocast("cuda"): + image = pipeline(args.validation_prompt, num_inference_steps=25, generator=generator).images[0] + images.append(image) + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") for i, image in enumerate(images) + ] + } + ) + + del pipeline + torch.cuda.empty_cache() + return images + + +def save_progress(text_encoder, placeholder_token_ids, accelerator, args, save_path): logger.info("Saving embeddings") - learned_embeds = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[placeholder_token_id] + learned_embeds = ( + accelerator.unwrap_model(text_encoder) + .get_input_embeddings() + .weight[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] + ) learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()} torch.save(learned_embeds_dict, save_path) @@ -96,10 +178,15 @@ def parse_args(): help="Save learned_embeds.bin every X updates steps.", ) parser.add_argument( - "--only_save_embeds", + "--save_as_full_pipeline", action="store_true", - default=False, - help="Save only the embeddings for the new concept.", + help="Save the complete stable diffusion pipeline.", + ) + parser.add_argument( + "--num_vectors", + type=int, + default=1, + help="How many textual inversion vectors shall be used to learn the concept.", ) parser.add_argument( "--pretrained_model_name_or_path", @@ -269,12 +356,22 @@ def parse_args(): default=4, help="Number of images that should be generated during validation with `validation_prompt`.", ) + parser.add_argument( + "--validation_steps", + type=int, + default=100, + help=( + "Run validation every X steps. Validation consists of running the prompt" + " `args.validation_prompt` multiple times: `args.num_validation_images`" + " and logging the images." + ), + ) parser.add_argument( "--validation_epochs", type=int, - default=50, + default=None, help=( - "Run validation every X epochs. Validation consists of running the prompt" + "Deprecated in favor of validation_steps. Run validation every X epochs. Validation consists of running the prompt" " `args.validation_prompt` multiple times: `args.num_validation_images`" " and logging the images." ), @@ -479,7 +576,6 @@ def main(): if args.report_to == "wandb": if not is_wandb_available(): raise ImportError("Make sure to install wandb if you want to use it for logging during training.") - import wandb # Make one log on every process with the configuration for debugging. 
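The `save_progress` helper added earlier in this file stores the learned embeddings as a small dict mapping the placeholder token to a tensor of shape `[num_vectors, hidden_size]`. A rough sketch of loading such a `learned_embeds.bin` back into a pipeline is shown below; it is illustration only and not part of the patch — the model id and file path are placeholders, and the `_1`, `_2`, ... suffixes simply mirror the multi-vector naming convention this script uses.

```py
# Hypothetical consumer of a learned_embeds.bin produced by save_progress
# (model id and file path are placeholders).
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
learned_embeds = torch.load("learned_embeds.bin", map_location="cpu")

for token, embeds in learned_embeds.items():
    # one embedding row per placeholder token (the first row is the token itself)
    tokens = [token] + [f"{token}_{i}" for i in range(1, embeds.shape[0])]
    pipe.tokenizer.add_tokens(tokens)
    pipe.text_encoder.resize_token_embeddings(len(pipe.tokenizer))
    token_ids = pipe.tokenizer.convert_tokens_to_ids(tokens)
    with torch.no_grad():
        weight = pipe.text_encoder.get_input_embeddings().weight
        for token_id, embed in zip(token_ids, embeds):
            weight[token_id] = embed.to(weight.dtype)
```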
logging.basicConfig( @@ -489,11 +585,9 @@ def main(): ) logger.info(accelerator.state, main_process_only=False) if accelerator.is_local_main_process: - datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_warning() diffusers.utils.logging.set_verbosity_info() else: - datasets.utils.logging.set_verbosity_error() transformers.utils.logging.set_verbosity_error() diffusers.utils.logging.set_verbosity_error() @@ -528,8 +622,19 @@ def main(): ) # Add the placeholder token in tokenizer - num_added_tokens = tokenizer.add_tokens(args.placeholder_token) - if num_added_tokens == 0: + placeholder_tokens = [args.placeholder_token] + + if args.num_vectors < 1: + raise ValueError(f"--num_vectors has to be larger or equal to 1, but is {args.num_vectors}") + + # add dummy tokens for multi-vector + additional_tokens = [] + for i in range(1, args.num_vectors): + additional_tokens.append(f"{args.placeholder_token}_{i}") + placeholder_tokens += additional_tokens + + num_added_tokens = tokenizer.add_tokens(placeholder_tokens) + if num_added_tokens != args.num_vectors: raise ValueError( f"The tokenizer already contains the token {args.placeholder_token}. Please pass a different" " `placeholder_token` that is not already in the tokenizer." @@ -542,14 +647,16 @@ def main(): raise ValueError("The initializer token must be a single token.") initializer_token_id = token_ids[0] - placeholder_token_id = tokenizer.convert_tokens_to_ids(args.placeholder_token) + placeholder_token_ids = tokenizer.convert_tokens_to_ids(placeholder_tokens) # Resize the token embeddings as we are adding new special tokens to the tokenizer text_encoder.resize_token_embeddings(len(tokenizer)) # Initialise the newly added placeholder token with the embeddings of the initializer token token_embeds = text_encoder.get_input_embeddings().weight.data - token_embeds[placeholder_token_id] = token_embeds[initializer_token_id] + with torch.no_grad(): + for token_id in placeholder_token_ids: + token_embeds[token_id] = token_embeds[initializer_token_id].clone() # Freeze vae and unet vae.requires_grad_(False) @@ -568,6 +675,13 @@ def main(): if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warn( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." + ) unet.enable_xformers_memory_efficient_attention() else: raise ValueError("xformers is not available. Make sure it is installed correctly") @@ -591,6 +705,8 @@ def main(): eps=args.adam_epsilon, ) + optimizer = ORT_FP16_Optimizer(optimizer) + # Dataset and DataLoaders creation: train_dataset = TextualInversionDataset( data_root=args.train_data_dir, @@ -605,6 +721,15 @@ def main(): train_dataloader = torch.utils.data.DataLoader( train_dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers ) + if args.validation_epochs is not None: + warnings.warn( + f"FutureWarning: You are doing logging with validation_epochs={args.validation_epochs}." 
+ " Deprecated validation_epochs in favor of `validation_steps`" + f"Setting `args.validation_steps` to {args.validation_epochs * len(train_dataset)}", + FutureWarning, + stacklevel=2, + ) + args.validation_steps = args.validation_epochs * len(train_dataset) # Scheduler and math around the number of training steps. overrode_max_train_steps = False @@ -626,6 +751,8 @@ def main(): ) text_encoder = ORTModule(text_encoder) + unet = ORTModule(unet) + vae = ORTModule(vae) # For mixed precision training we cast the unet and vae weights to half-precision # as these models are only used for inference, keeping weights in full precision is not required. @@ -663,7 +790,6 @@ def main(): logger.info(f" Total optimization steps = {args.max_train_steps}") global_step = 0 first_epoch = 0 - # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint != "latest": @@ -744,7 +870,9 @@ def main(): optimizer.zero_grad() # Let's make sure we don't update any embedding weights besides the newly added token - index_no_updates = torch.arange(len(tokenizer)) != placeholder_token_id + index_no_updates = torch.ones((len(tokenizer),), dtype=torch.bool) + index_no_updates[min(placeholder_token_ids) : max(placeholder_token_ids) + 1] = False + with torch.no_grad(): accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[ index_no_updates @@ -752,72 +880,38 @@ def main(): # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: + images = [] progress_bar.update(1) global_step += 1 if global_step % args.save_steps == 0: save_path = os.path.join(args.output_dir, f"learned_embeds-steps-{global_step}.bin") - save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path) + save_progress(text_encoder, placeholder_token_ids, accelerator, args, save_path) - if global_step % args.checkpointing_steps == 0: - if accelerator.is_main_process: + if accelerator.is_main_process: + if global_step % args.checkpointing_steps == 0: save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") + if args.validation_prompt is not None and global_step % args.validation_steps == 0: + images = log_validation( + text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch + ) + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) accelerator.log(logs, step=global_step) if global_step >= args.max_train_steps: break - - if accelerator.is_main_process and args.validation_prompt is not None and epoch % args.validation_epochs == 0: - logger.info( - f"Running validation... \n Generating {args.num_validation_images} images with prompt:" - f" {args.validation_prompt}." 
- ) - # create pipeline (note: unet and vae are loaded again in float32) - pipeline = DiffusionPipeline.from_pretrained( - args.pretrained_model_name_or_path, - text_encoder=accelerator.unwrap_model(text_encoder), - revision=args.revision, - ) - pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config) - pipeline = pipeline.to(accelerator.device) - pipeline.set_progress_bar_config(disable=True) - - # run inference - generator = ( - None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed) - ) - prompt = args.num_validation_images * [args.validation_prompt] - images = pipeline(prompt, num_inference_steps=25, generator=generator).images - - for tracker in accelerator.trackers: - if tracker.name == "tensorboard": - np_images = np.stack([np.asarray(img) for img in images]) - tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") - if tracker.name == "wandb": - tracker.log( - { - "validation": [ - wandb.Image(image, caption=f"{i}: {args.validation_prompt}") - for i, image in enumerate(images) - ] - } - ) - - del pipeline - torch.cuda.empty_cache() - - # Create the pipeline using using the trained modules and save it. + # Create the pipeline using the trained modules and save it. accelerator.wait_for_everyone() if accelerator.is_main_process: - if args.push_to_hub and args.only_save_embeds: + if args.push_to_hub and not args.save_as_full_pipeline: logger.warn("Enabling full model saving because --push_to_hub=True was specified.") save_full_model = True else: - save_full_model = not args.only_save_embeds + save_full_model = args.save_as_full_pipeline if save_full_model: pipeline = StableDiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, @@ -829,9 +923,15 @@ def main(): pipeline.save_pretrained(args.output_dir) # Save the newly trained embeddings save_path = os.path.join(args.output_dir, "learned_embeds.bin") - save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path) + save_progress(text_encoder, placeholder_token_ids, accelerator, args, save_path) if args.push_to_hub: + save_model_card( + repo_id, + images=images, + base_model=args.pretrained_model_name_or_path, + repo_folder=args.output_dir, + ) upload_folder( repo_id=repo_id, folder_path=args.output_dir, diff --git a/examples/research_projects/onnxruntime/unconditional_image_generation/README.md b/examples/research_projects/onnxruntime/unconditional_image_generation/README.md index 621e9a2fd69a..c28ecefc9a30 100644 --- a/examples/research_projects/onnxruntime/unconditional_image_generation/README.md +++ b/examples/research_projects/onnxruntime/unconditional_image_generation/README.md @@ -34,7 +34,7 @@ In order to leverage onnxruntime to accelerate training, please use train_uncond The command to train a DDPM UNet model on the Oxford Flowers dataset with onnxruntime: ```bash -accelerate launch train_unconditional_ort.py \ +accelerate launch train_unconditional.py \ --dataset_name="huggan/flowers-102-categories" \ --resolution=64 --center_crop --random_flip \ --output_dir="ddpm-ema-flowers-64" \ diff --git a/examples/research_projects/onnxruntime/unconditional_image_generation/requirements.txt b/examples/research_projects/onnxruntime/unconditional_image_generation/requirements.txt index f366720afd11..ca21143c42d9 100644 --- a/examples/research_projects/onnxruntime/unconditional_image_generation/requirements.txt +++ b/examples/research_projects/onnxruntime/unconditional_image_generation/requirements.txt @@ -1,3 
+1,4 @@ accelerate>=0.16.0 torchvision datasets +tensorboard \ No newline at end of file diff --git a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py index 1b38036d82c0..9dc46e864ae8 100644 --- a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py +++ b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import Optional +import accelerate import datasets import torch import torch.nn.functional as F @@ -14,7 +15,9 @@ from accelerate.utils import ProjectConfiguration from datasets import load_dataset from huggingface_hub import HfFolder, Repository, create_repo, whoami +from onnxruntime.training.optim.fp16_optimizer import FP16_Optimizer as ORT_FP16_Optimizer from onnxruntime.training.ortmodule import ORTModule +from packaging import version from torchvision import transforms from tqdm.auto import tqdm @@ -22,11 +25,12 @@ from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel -from diffusers.utils import check_min_version, is_tensorboard_available, is_wandb_available +from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available +from diffusers.utils.import_utils import is_xformers_available # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.13.0.dev0") +check_min_version("0.17.0.dev0") logger = get_logger(__name__, log_level="INFO") @@ -34,6 +38,7 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape): """ Extract values from a 1-D numpy array for a batch of indices. + :param arr: the 1-D numpy array. :param timesteps: a tensor of indices into the array to extract. :param broadcast_shape: a larger shape of K dimensions with the batch @@ -66,6 +71,12 @@ def parse_args(): default=None, help="The config of the Dataset, leave as None if there's only one config.", ) + parser.add_argument( + "--model_config_name_or_path", + type=str, + default=None, + help="The config of the UNet model to train, leave as None to use standard DDPM configuration.", + ) parser.add_argument( "--train_data_dir", type=str, @@ -251,6 +262,9 @@ def parse_args(): ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' ), ) + parser.add_argument( + "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
+ ) args = parser.parse_args() env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) @@ -295,6 +309,40 @@ def main(args): raise ImportError("Make sure to install wandb if you want to use it for logging during training.") import wandb + # `accelerate` 0.16.0 will have better support for customized saving + if version.parse(accelerate.__version__) >= version.parse("0.16.0"): + # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format + def save_model_hook(models, weights, output_dir): + if args.use_ema: + ema_model.save_pretrained(os.path.join(output_dir, "unet_ema")) + + for i, model in enumerate(models): + model.save_pretrained(os.path.join(output_dir, "unet")) + + # make sure to pop weight so that corresponding model is not saved again + weights.pop() + + def load_model_hook(models, input_dir): + if args.use_ema: + load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel) + ema_model.load_state_dict(load_model.state_dict()) + ema_model.to(accelerator.device) + del load_model + + for i in range(len(models)): + # pop models so that they are not loaded again + model = models.pop() + + # load diffusers style into model + load_model = UNet2DModel.from_pretrained(input_dir, subfolder="unet") + model.register_to_config(**load_model.config) + + model.load_state_dict(load_model.state_dict()) + del load_model + + accelerator.register_save_state_pre_hook(save_model_hook) + accelerator.register_load_state_pre_hook(load_model_hook) + # Make one log on every process with the configuration for debugging. logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", @@ -328,29 +376,33 @@ def main(args): os.makedirs(args.output_dir, exist_ok=True) # Initialize the model - model = UNet2DModel( - sample_size=args.resolution, - in_channels=3, - out_channels=3, - layers_per_block=2, - block_out_channels=(128, 128, 256, 256, 512, 512), - down_block_types=( - "DownBlock2D", - "DownBlock2D", - "DownBlock2D", - "DownBlock2D", - "AttnDownBlock2D", - "DownBlock2D", - ), - up_block_types=( - "UpBlock2D", - "AttnUpBlock2D", - "UpBlock2D", - "UpBlock2D", - "UpBlock2D", - "UpBlock2D", - ), - ) + if args.model_config_name_or_path is None: + model = UNet2DModel( + sample_size=args.resolution, + in_channels=3, + out_channels=3, + layers_per_block=2, + block_out_channels=(128, 128, 256, 256, 512, 512), + down_block_types=( + "DownBlock2D", + "DownBlock2D", + "DownBlock2D", + "DownBlock2D", + "AttnDownBlock2D", + "DownBlock2D", + ), + up_block_types=( + "UpBlock2D", + "AttnUpBlock2D", + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + ), + ) + else: + config = UNet2DModel.load_config(args.model_config_name_or_path) + model = UNet2DModel.from_config(config) # Create EMA for the model. if args.use_ema: @@ -360,8 +412,23 @@ def main(args): use_ema_warmup=True, inv_gamma=args.ema_inv_gamma, power=args.ema_power, + model_cls=UNet2DModel, + model_config=model.config, ) + if args.enable_xformers_memory_efficient_attention: + if is_xformers_available(): + import xformers + + xformers_version = version.parse(xformers.__version__) + if xformers_version == version.parse("0.0.16"): + logger.warn( + "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." 
+ ) + model.enable_xformers_memory_efficient_attention() + else: + raise ValueError("xformers is not available. Make sure it is installed correctly") + # Initialize the scheduler accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys()) if accepts_prediction_type: @@ -382,6 +449,8 @@ def main(args): eps=args.adam_epsilon, ) + optimizer = ORT_FP16_Optimizer(optimizer) + # Get the datasets: you can either provide your own training and evaluation files (see below) # or specify a Dataset from the hub (the dataset will be downloaded automatically from the datasets Hub). @@ -434,10 +503,7 @@ def transform_images(examples): model, optimizer, train_dataloader, lr_scheduler ) - model = ORTModule(model) - if args.use_ema: - accelerator.register_for_checkpointing(ema_model) ema_model.to(accelerator.device) # We need to initialize the trackers we use, and also store our configuration. @@ -446,6 +512,8 @@ def transform_images(examples): run = os.path.split(__file__)[-1].split(".")[0] accelerator.init_trackers(run) + model = ORTModule(model) + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) max_train_steps = args.num_epochs * num_update_steps_per_epoch @@ -552,7 +620,7 @@ def transform_images(examples): logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step} if args.use_ema: - logs["ema_decay"] = ema_model.decay + logs["ema_decay"] = ema_model.cur_decay_value progress_bar.set_postfix(**logs) accelerator.log(logs, step=global_step) progress_bar.close() @@ -563,8 +631,11 @@ def transform_images(examples): if accelerator.is_main_process: if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1: unet = accelerator.unwrap_model(model) + if args.use_ema: + ema_model.store(unet.parameters()) ema_model.copy_to(unet.parameters()) + pipeline = DDPMPipeline( unet=unet, scheduler=noise_scheduler, @@ -575,18 +646,24 @@ def transform_images(examples): images = pipeline( generator=generator, batch_size=args.eval_batch_size, - output_type="numpy", num_inference_steps=args.ddpm_num_inference_steps, + output_type="numpy", ).images + if args.use_ema: + ema_model.restore(unet.parameters()) + # denormalize the images and save to tensorboard images_processed = (images * 255).round().astype("uint8") if args.logger == "tensorboard": - accelerator.get_tracker("tensorboard").add_images( - "test_samples", images_processed.transpose(0, 3, 1, 2), epoch - ) + if is_accelerate_version(">=", "0.17.0.dev0"): + tracker = accelerator.get_tracker("tensorboard", unwrap=True) + else: + tracker = accelerator.get_tracker("tensorboard") + tracker.add_images("test_samples", images_processed.transpose(0, 3, 1, 2), epoch) elif args.logger == "wandb": + # Upcoming `log_images` helper coming in https://github.com/huggingface/accelerate/pull/962/files accelerator.get_tracker("wandb").log( {"test_samples": [wandb.Image(img) for img in images_processed], "epoch": epoch}, step=global_step, @@ -594,7 +671,22 @@ def transform_images(examples): if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1: # save the model + unet = accelerator.unwrap_model(model) + + if args.use_ema: + ema_model.store(unet.parameters()) + ema_model.copy_to(unet.parameters()) + + pipeline = DDPMPipeline( + unet=unet, + scheduler=noise_scheduler, + ) + pipeline.save_pretrained(args.output_dir) + + if 
args.use_ema: + ema_model.restore(unet.parameters()) + if args.push_to_hub: repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False) From f751b8844ebd73bdd9cfd609ea03db10e8fe0f5a Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 31 May 2023 09:39:03 -0700 Subject: [PATCH 058/199] update dreambooth lora to work with IF stage II (#3560) --- examples/dreambooth/train_dreambooth_lora.py | 49 +++++++++++++++++-- .../pipeline_if_img2img_superresolution.py | 6 ++- .../pipeline_if_inpainting_superresolution.py | 6 ++- .../pipeline_if_superresolution.py | 6 ++- 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 4ff759dcd6d4..12b09089186d 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -60,6 +60,7 @@ from diffusers.optimization import get_scheduler from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import randn_tensor # Will error if the minimal version of diffusers is not installed. Remove at your own risks. @@ -425,6 +426,19 @@ def parse_args(input_args=None): required=False, help="Whether to use attention mask for the text encoder", ) + parser.add_argument( + "--validation_images", + required=False, + default=None, + nargs="+", + help="Optional set of images to use for validation. Used when the target pipeline takes an initial image as input such as when training image variation or superresolution.", + ) + parser.add_argument( + "--class_labels_conditioning", + required=False, + default=None, + help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.", + ) if input_args is not None: args = parser.parse_args(input_args) @@ -1121,7 +1135,7 @@ def compute_text_embeddings(prompt): # Sample noise that we'll add to the latents noise = torch.randn_like(model_input) - bsz = model_input.shape[0] + bsz, channels, height, width = model_input.shape # Sample a random timestep for each image timesteps = torch.randint( 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device @@ -1143,8 +1157,24 @@ def compute_text_embeddings(prompt): text_encoder_use_attention_mask=args.text_encoder_use_attention_mask, ) + if unet.config.in_channels > channels: + needed_additional_channels = unet.config.in_channels - channels + additional_latents = randn_tensor( + (bsz, needed_additional_channels, height, width), + device=noisy_model_input.device, + dtype=noisy_model_input.dtype, + ) + noisy_model_input = torch.cat([additional_latents, noisy_model_input], dim=1) + + if args.class_labels_conditioning == "timesteps": + class_labels = timesteps + else: + class_labels = None + # Predict the noise residual - model_pred = unet(noisy_model_input, timesteps, encoder_hidden_states).sample + model_pred = unet( + noisy_model_input, timesteps, encoder_hidden_states, class_labels=class_labels + ).sample # if model predicts variance, throw away the prediction. we will only train on the # simplified training objective. 
This means that all schedulers using the fine tuned @@ -1248,9 +1278,18 @@ def compute_text_embeddings(prompt): } else: pipeline_args = {"prompt": args.validation_prompt} - images = [ - pipeline(**pipeline_args, generator=generator).images[0] for _ in range(args.num_validation_images) - ] + + if args.validation_images is None: + images = [ + pipeline(**pipeline_args, generator=generator).images[0] + for _ in range(args.num_validation_images) + ] + else: + images = [] + for image in args.validation_images: + image = Image.open(image) + image = pipeline(**pipeline_args, image=image, generator=generator).images[0] + images.append(image) for tracker in accelerator.trackers: if tracker.name == "tensorboard": diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py index a49d25137b2a..0ee9c6ba331d 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_img2img_superresolution.py @@ -10,6 +10,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer +from ...loaders import LoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -112,7 +113,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: """ -class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline): +class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): tokenizer: T5Tokenizer text_encoder: T5EncoderModel @@ -1047,6 +1048,9 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + if self.scheduler.config.variance_type not in ["learned", "learned_range"]: + noise_pred, _ = noise_pred.split(intermediate_images.shape[1], dim=1) + # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False diff --git a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py index f255948dc74b..6a90f2b765d4 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_inpainting_superresolution.py @@ -10,6 +10,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer +from ...loaders import LoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -114,7 +115,7 @@ def resize(images: PIL.Image.Image, img_size: int) -> PIL.Image.Image: """ -class IFInpaintingSuperResolutionPipeline(DiffusionPipeline): +class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): tokenizer: T5Tokenizer text_encoder: T5EncoderModel @@ -1154,6 +1155,9 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + if self.scheduler.config.variance_type not in ["learned", "learned_range"]: + noise_pred, _ = noise_pred.split(intermediate_images.shape[1], dim=1) + # compute the previous noisy sample x_t -> x_t-1 prev_intermediate_images = intermediate_images diff --git 
a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py index 7a8de51579b7..86d9574b97e1 100644 --- a/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py +++ b/src/diffusers/pipelines/deepfloyd_if/pipeline_if_superresolution.py @@ -10,6 +10,7 @@ import torch.nn.functional as F from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer +from ...loaders import LoraLoaderMixin from ...models import UNet2DConditionModel from ...schedulers import DDPMScheduler from ...utils import ( @@ -70,7 +71,7 @@ """ -class IFSuperResolutionPipeline(DiffusionPipeline): +class IFSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin): tokenizer: T5Tokenizer text_encoder: T5EncoderModel @@ -903,6 +904,9 @@ def __call__( noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) noise_pred = torch.cat([noise_pred, predicted_variance], dim=1) + if self.scheduler.config.variance_type not in ["learned", "learned_range"]: + noise_pred, _ = noise_pred.split(intermediate_images.shape[1], dim=1) + # compute the previous noisy sample x_t -> x_t-1 intermediate_images = self.scheduler.step( noise_pred, t, intermediate_images, **extra_step_kwargs, return_dict=False From 4f14b363297cf8deac3e88a3bf31f59880ac8a96 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 31 May 2023 09:39:31 -0700 Subject: [PATCH 059/199] Full Dreambooth IF stage II upscaling (#3561) * update dreambooth lora to work with IF stage II * Update dreambooth script for IF stage II upscaler --- examples/dreambooth/train_dreambooth.py | 55 +++++++++++++++++++++---- 1 file changed, 46 insertions(+), 9 deletions(-) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 37b06acb6977..e4ab6b2ae014 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -52,6 +52,7 @@ from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import randn_tensor if is_wandb_available(): @@ -114,16 +115,17 @@ def log_validation( pipeline_args = {} - if text_encoder is not None: - pipeline_args["text_encoder"] = accelerator.unwrap_model(text_encoder) - if vae is not None: pipeline_args["vae"] = vae + if text_encoder is not None: + text_encoder = accelerator.unwrap_model(text_encoder) + # create pipeline (note: unet and vae are loaded again in float32) pipeline = DiffusionPipeline.from_pretrained( args.pretrained_model_name_or_path, tokenizer=tokenizer, + text_encoder=text_encoder, unet=accelerator.unwrap_model(unet), revision=args.revision, torch_dtype=weight_dtype, @@ -156,10 +158,16 @@ def log_validation( # run inference generator = None if args.seed is None else torch.Generator(device=accelerator.device).manual_seed(args.seed) images = [] - for _ in range(args.num_validation_images): - with torch.autocast("cuda"): - image = pipeline(**pipeline_args, num_inference_steps=25, generator=generator).images[0] - images.append(image) + if args.validation_images is None: + for _ in range(args.num_validation_images): + with torch.autocast("cuda"): + image = pipeline(**pipeline_args, num_inference_steps=25, generator=generator).images[0] + images.append(image) + else: + for image in args.validation_images: + image = Image.open(image) + image = pipeline(**pipeline_args, image=image, 
generator=generator).images[0] + images.append(image) for tracker in accelerator.trackers: if tracker.name == "tensorboard": @@ -525,6 +533,19 @@ def parse_args(input_args=None): parser.add_argument( "--skip_save_text_encoder", action="store_true", required=False, help="Set to not save text encoder" ) + parser.add_argument( + "--validation_images", + required=False, + default=None, + nargs="+", + help="Optional set of images to use for validation. Used when the target pipeline takes an initial image as input such as when training image variation or superresolution.", + ) + parser.add_argument( + "--class_labels_conditioning", + required=False, + default=None, + help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.", + ) if input_args is not None: args = parser.parse_args(input_args) @@ -1169,7 +1190,7 @@ def compute_text_embeddings(prompt): ) else: noise = torch.randn_like(model_input) - bsz = model_input.shape[0] + bsz, channels, height, width = model_input.shape # Sample a random timestep for each image timesteps = torch.randint( 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device @@ -1191,8 +1212,24 @@ def compute_text_embeddings(prompt): text_encoder_use_attention_mask=args.text_encoder_use_attention_mask, ) + if unet.config.in_channels > channels: + needed_additional_channels = unet.config.in_channels - channels + additional_latents = randn_tensor( + (bsz, needed_additional_channels, height, width), + device=noisy_model_input.device, + dtype=noisy_model_input.dtype, + ) + noisy_model_input = torch.cat([additional_latents, noisy_model_input], dim=1) + + if args.class_labels_conditioning == "timesteps": + class_labels = timesteps + else: + class_labels = None + # Predict the noise residual - model_pred = unet(noisy_model_input, timesteps, encoder_hidden_states).sample + model_pred = unet( + noisy_model_input, timesteps, encoder_hidden_states, class_labels=class_labels + ).sample if model_pred.shape[1] == 6: model_pred, _ = torch.chunk(model_pred, 2, dim=1) From 55dbfa0229e82c3f7ec2f9cd82a59c1220e457a7 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 2 Jun 2023 08:04:35 +0530 Subject: [PATCH 060/199] [Docs] include the instruction-tuning blog link in the InstructPix2Pix docs (#3644) include the instruction-tuning blog link. --- docs/source/en/training/instructpix2pix.mdx | 2 ++ examples/instruct_pix2pix/README.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/source/en/training/instructpix2pix.mdx b/docs/source/en/training/instructpix2pix.mdx index 64d97ecd6c83..03ba8f5635d6 100644 --- a/docs/source/en/training/instructpix2pix.mdx +++ b/docs/source/en/training/instructpix2pix.mdx @@ -207,3 +207,5 @@ speed and quality during performance: Particularly, `image_guidance_scale` and `guidance_scale` can have a profound impact on the generated ("edited") image (see [here](https://twitter.com/RisingSayak/status/1628392199196151808?s=20) for an example). + +If you're looking for some interesting ways to use the InstructPix2Pix training methodology, we welcome you to check out this blog post: [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd). 
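For reference, a minimal sketch of how the two scales discussed above are passed at inference time. This is illustrative only and not part of the patch; the checkpoint id and input image are placeholders, so substitute the model you actually trained or downloaded.

```py
import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline

# Example checkpoint; replace with your own fine-tuned model if needed.
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16
).to("cuda")

init_image = Image.open("input.png").convert("RGB")  # hypothetical local image

# Higher `image_guidance_scale` keeps the output closer to the input image;
# higher `guidance_scale` pushes it closer to the edit instruction.
edited_image = pipe(
    "make the mountains snowy",
    image=init_image,
    num_inference_steps=20,
    image_guidance_scale=1.5,
    guidance_scale=7.5,
).images[0]
edited_image.save("edited.png")
```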
\ No newline at end of file diff --git a/examples/instruct_pix2pix/README.md b/examples/instruct_pix2pix/README.md index 94a7bd2a98f6..355d48193634 100644 --- a/examples/instruct_pix2pix/README.md +++ b/examples/instruct_pix2pix/README.md @@ -185,3 +185,5 @@ speed and quality during performance: Particularly, `image_guidance_scale` and `guidance_scale` can have a profound impact on the generated ("edited") image (see [here](https://twitter.com/RisingSayak/status/1628392199196151808?s=20) for an example). + +If you're looking for some interesting ways to use the InstructPix2Pix training methodology, we welcome you to check out this blog post: [Instruction-tuning Stable Diffusion with InstructPix2Pix](https://huggingface.co/blog/instruction-tuning-sd). \ No newline at end of file From 32ea2142c056fae722b0cabaa799697a861cd039 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 2 Jun 2023 08:57:20 +0100 Subject: [PATCH 061/199] [Kandinsky] Improve kandinsky API a bit (#3636) * Improve docs * up * Update docs/source/en/api/pipelines/kandinsky.mdx * up * up * correct more * further improve * Update docs/source/en/api/pipelines/kandinsky.mdx Co-authored-by: YiYi Xu --------- Co-authored-by: YiYi Xu --- docs/source/en/api/pipelines/kandinsky.mdx | 193 +++++++++++------- .../pipelines/kandinsky/pipeline_kandinsky.py | 11 +- .../kandinsky/pipeline_kandinsky_img2img.py | 11 +- .../kandinsky/pipeline_kandinsky_inpaint.py | 11 +- .../kandinsky/pipeline_kandinsky_prior.py | 53 +++-- tests/pipelines/kandinsky/test_kandinsky.py | 6 +- .../kandinsky/test_kandinsky_img2img.py | 6 +- .../kandinsky/test_kandinsky_inpaint.py | 6 +- .../kandinsky/test_kandinsky_prior.py | 2 +- tests/pipelines/test_pipelines_common.py | 2 +- 10 files changed, 182 insertions(+), 119 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index b5b4f0f06400..b94937e4af85 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -19,81 +19,78 @@ The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene55 ## Available Pipelines: -| Pipeline | Tasks | Colab -|---|---|:---:| -| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | - | -| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | - | -| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | - | +| Pipeline | Tasks | +|---|---| +| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | +| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | +| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | ## Usage example -In the following, we will walk you through some cool examples of using the Kandinsky pipelines to create some visually aesthetic artwork. 
+In the following, we will walk you through some examples of how to use the Kandinsky pipelines to create some visually aesthetic artwork. ### Text-to-Image Generation -For text-to-image generation, we need to use both [`KandinskyPriorPipeline`] and [`KandinskyPipeline`]. The first step is to encode text prompts with CLIP and then diffuse the CLIP text embeddings to CLIP image embeddings, as first proposed in [DALL-E 2](https://cdn.openai.com/papers/dall-e-2.pdf). Let's throw a fun prompt at Kandinsky to see what it comes up with :) +For text-to-image generation, we need to use both [`KandinskyPriorPipeline`] and [`KandinskyPipeline`]. +The first step is to encode text prompts with CLIP and then diffuse the CLIP text embeddings to CLIP image embeddings, +as first proposed in [DALL-E 2](https://cdn.openai.com/papers/dall-e-2.pdf). +Let's throw a fun prompt at Kandinsky to see what it comes up with. -```python +```py prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" -negative_prompt = "low quality, bad quality" ``` -We will pass both the `prompt` and `negative_prompt` to our prior diffusion pipeline. In contrast to other diffusion pipelines, such as Stable Diffusion, the `prompt` and `negative_prompt` shall be passed separately so that we can retrieve a CLIP image embedding for each prompt input. You can use `guidance_scale`, and `num_inference_steps` arguments to guide this process, just like how you would normally do with all other pipelines in diffusers. +First, let's instantiate the prior pipeline and the text-to-image pipeline. Both +pipelines are diffusion models. -```python -from diffusers import KandinskyPriorPipeline + +```py +from diffusers import DiffusionPipeline import torch -# create prior -pipe_prior = KandinskyPriorPipeline.from_pretrained( - "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 -) +pipe_prior = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16) pipe_prior.to("cuda") -generator = torch.Generator(device="cuda").manual_seed(12) -image_emb = pipe_prior( - prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt -).images +t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) +t2i_pipe.to("cuda") +``` -zero_image_emb = pipe_prior( - negative_prompt, guidance_scale=1.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt -).images +Now we pass the prompt through the prior to generate image embeddings. The prior +returns both the image embeddings corresponding to the prompt and negative/unconditional image +embeddings corresponding to an empty string. + +```py +generator = torch.Generator(device="cuda").manual_seed(12) +image_embeds, negative_image_embeds = pipe_prior(prompt, generator=generator).to_tuple() ``` -Once we create the image embedding, we can use [`KandinskyPipeline`] to generate images. + -```python -from PIL import Image -from diffusers import KandinskyPipeline +The text-to-image pipeline expects both `image_embeds`, `negative_image_embeds` and the original +`prompt` as the text-to-image pipeline uses another text encoder to better guide the second diffusion +process of `t2i_pipe`. +By default, the prior returns unconditioned negative image embeddings corresponding to the negative prompt of `""`. +For better results, you can also pass a `negative_prompt` to the prior. 
This will increase the effective batch size +of the prior by a factor of 2. -def image_grid(imgs, rows, cols): - assert len(imgs) == rows * cols +```py +prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" +negative_prompt = "low quality, bad quality" - w, h = imgs[0].size - grid = Image.new("RGB", size=(cols * w, rows * h)) - grid_w, grid_h = grid.size +image_embeds, negative_image_embeds = pipe_prior(prompt, negative_prompt, generator=generator).to_tuple() +``` - for i, img in enumerate(imgs): - grid.paste(img, box=(i % cols * w, i // cols * h)) - return grid + -# create diffuser pipeline -pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) -pipe.to("cuda") +Next, we can pass the embeddings as well as the prompt to the text-to-image pipeline. Remember that +in case you are using a customized negative prompt, that you should pass this one also to the text-to-image pipelines +with `negative_prompt=negative_prompt`: -images = pipe( - prompt, - image_embeds=image_emb, - negative_image_embeds=zero_image_emb, - num_images_per_prompt=2, - height=768, - width=768, - num_inference_steps=100, - guidance_scale=4.0, - generator=generator, -).images +```py +image = t2i_pipe(prompt, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds).images[0] +image.save("cheeseburger_monster.png") ``` One cheeseburger monster coming up! Enjoy! @@ -164,22 +161,15 @@ prompt = "A fantasy landscape, Cinematic lighting" negative_prompt = "low quality, bad quality" generator = torch.Generator(device="cuda").manual_seed(30) -image_emb = pipe_prior( - prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt -).images - -zero_image_emb = pipe_prior( - negative_prompt, guidance_scale=4.0, num_inference_steps=25, generator=generator, negative_prompt=negative_prompt -).images +image_embeds, negative_image_embeds = pipe_prior(prompt, negative_prompt, generator=generator).to_tuple() out = pipe( prompt, image=original_image, - image_embeds=image_emb, - negative_image_embeds=zero_image_emb, + image_embeds=image_embeds, + negative_image_embeds=negative_image_embeds, height=768, width=768, - num_inference_steps=500, strength=0.3, ) @@ -193,7 +183,7 @@ out.images[0].save("fantasy_land.png") You can use [`KandinskyInpaintPipeline`] to edit images. In this example, we will add a hat to the portrait of a cat. 
-```python +```py from diffusers import KandinskyInpaintPipeline, KandinskyPriorPipeline from diffusers.utils import load_image import torch @@ -205,7 +195,7 @@ pipe_prior = KandinskyPriorPipeline.from_pretrained( pipe_prior.to("cuda") prompt = "a hat" -image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) +prior_output = pipe_prior(prompt) pipe = KandinskyInpaintPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16) pipe.to("cuda") @@ -222,8 +212,7 @@ out = pipe( prompt, image=init_image, mask_image=mask, - image_embeds=image_emb, - negative_image_embeds=zero_image_emb, + **prior_output, height=768, width=768, num_inference_steps=150, @@ -246,7 +235,6 @@ from diffusers.utils import load_image import PIL import torch -from torchvision import transforms pipe_prior = KandinskyPriorPipeline.from_pretrained( "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16 @@ -263,22 +251,80 @@ img2 = load_image( # add all the conditions we want to interpolate, can be either text or image images_texts = ["a cat", img1, img2] + # specify the weights for each condition in images_texts weights = [0.3, 0.3, 0.4] -image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) + +# We can leave the prompt empty +prompt = "" +prior_out = pipe_prior.interpolate(images_texts, weights) pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) pipe.to("cuda") -image = pipe( - "", image_embeds=image_emb, negative_image_embeds=zero_image_emb, height=768, width=768, num_inference_steps=150 -).images[0] +image = pipe(prompt, **prior_out, height=768, width=768).images[0] image.save("starry_cat.png") ``` ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/starry_cat.png) +## Optimization + +Running Kandinsky in inference requires running both a first prior pipeline: [`KandinskyPriorPipeline`] +and a second image decoding pipeline which is one of [`KandinskyPipeline`], [`KandinskyImg2ImgPipeline`], or [`KandinskyInpaintPipeline`]. + +The bulk of the computation time will always be the second image decoding pipeline, so when looking +into optimizing the model, one should look into the second image decoding pipeline. + +When running with PyTorch < 2.0, we strongly recommend making use of [`xformers`](https://github.com/facebookresearch/xformers) +to speed-up the optimization. This can be done by simply running: + +```py +from diffusers import DiffusionPipeline +import torch + +t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16) +t2i_pipe.enable_xformers_memory_efficient_attention() +``` + +When running on PyTorch >= 2.0, PyTorch's SDPA attention will automatically be used. For more information on +PyTorch's SDPA, feel free to have a look at [this blog post](https://pytorch.org/blog/accelerated-diffusers-pt-20/). + +To have explicit control , you can also manually set the pipeline to use PyTorch's 2.0 efficient attention: + +```py +from diffusers.models.attention_processor import AttnAddedKVProcessor2_0 + +t2i_pipe.unet.set_attn_processor(AttnAddedKVProcessor2_0()) +``` + +The slowest and most memory intense attention processor is the default `AttnAddedKVProcessor` processor. +We do **not** recommend using it except for testing purposes or cases where very high determistic behaviour is desired. 
+You can set it with: + +```py +from diffusers.models.attention_processor import AttnAddedKVProcessor + +t2i_pipe.unet.set_attn_processor(AttnAddedKVProcessor()) +``` + +With PyTorch >= 2.0, you can also use Kandinsky with `torch.compile` which depending +on your hardware can signficantly speed-up your inference time once the model is compiled. +To use Kandinsksy with `torch.compile`, you can do: + +```py +t2i_pipe.unet.to(memory_format=torch.channels_last) +t2i_pipe.unet = torch.compile(t2i_pipe.unet, mode="reduce-overhead", fullgraph=True) +``` + +After compilation you should see a very fast inference time. For more information, +feel free to have a look at [Our PyTorch 2.0 benchmark](https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0). + + + + + ## KandinskyPriorPipeline [[autodoc]] KandinskyPriorPipeline @@ -292,15 +338,14 @@ image.save("starry_cat.png") - all - __call__ -## KandinskyInpaintPipeline - -[[autodoc]] KandinskyInpaintPipeline - - all - - __call__ - ## KandinskyImg2ImgPipeline [[autodoc]] KandinskyImg2ImgPipeline - all - __call__ +## KandinskyInpaintPipeline + +[[autodoc]] KandinskyInpaintPipeline + - all + - __call__ diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 29545bd88dc2..0da9d205f8e0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -304,12 +304,12 @@ def __call__( prompt: Union[str, List[str]], image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, - negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", @@ -325,6 +325,9 @@ def __call__( The clip image embeddings for text prompt, that will be used to condition the image generation. negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. width (`int`, *optional*, defaults to 512): @@ -340,9 +343,6 @@ def __call__( usually at the expense of lower image quality. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
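As the updated Kandinsky docs above explain, passing a `negative_prompt` to the prior doubles its effective batch size: both prompts are encoded in a single forward pass and the resulting embeddings are split afterwards. Below is a minimal, stand-alone tensor sketch of that split with made-up shapes; it is illustrative only, not the pipeline code.

```py
import torch

# Stand-in for the prior's output when a prompt and a negative prompt
# are diffused together as one batch of two.
batched_image_embeds = torch.randn(2, 768)

# The pipeline splits the batch back into conditional and negative embeddings.
image_embeds, negative_image_embeds = batched_image_embeds.chunk(2)

print(image_embeds.shape)           # torch.Size([1, 768])
print(negative_image_embeds.shape)  # torch.Size([1, 768])
```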
@@ -418,7 +418,8 @@ def __call__( timestep=t, encoder_hidden_states=text_encoder_hidden_states, added_cond_kwargs=added_cond_kwargs, - ).sample + return_dict=False, + )[0] if do_classifier_free_guidance: noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py index 470fa606af1a..f32528617e5a 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py @@ -368,13 +368,13 @@ def __call__( image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], image_embeds: torch.FloatTensor, negative_image_embeds: torch.FloatTensor, + negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, num_inference_steps: int = 100, strength: float = 0.3, guidance_scale: float = 7.0, num_images_per_prompt: int = 1, - negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, output_type: Optional[str] = "pil", return_dict: bool = True, @@ -392,6 +392,9 @@ def __call__( The clip image embeddings for text prompt, that will be used to condition the image generation. negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. width (`int`, *optional*, defaults to 512): @@ -413,9 +416,6 @@ def __call__( usually at the expense of lower image quality. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. 
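The `return_dict=False` change in the hunk above (repeated in the other Kandinsky pipelines below) only changes how the UNet output is unpacked, not the computation. A self-contained sketch with a tiny randomly initialized UNet, assuming a current `diffusers` install; the config values are arbitrary and chosen only to keep the model small.

```py
import torch
from diffusers import UNet2DConditionModel

# Tiny randomly initialized UNet, used purely to show the two call styles.
unet = UNet2DConditionModel(
    block_out_channels=(32, 64),
    layers_per_block=1,
    sample_size=32,
    in_channels=4,
    out_channels=4,
    down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
    up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
    cross_attention_dim=32,
).eval()

sample = torch.randn(1, 4, 32, 32)
timestep = torch.tensor([1])
encoder_hidden_states = torch.randn(1, 77, 32)

with torch.no_grad():
    # Default: an output dataclass exposing `.sample`.
    out_dataclass = unet(sample, timestep, encoder_hidden_states).sample
    # `return_dict=False`: a plain tuple, indexed with `[0]` as in this patch.
    out_tuple = unet(sample, timestep, encoder_hidden_states, return_dict=False)[0]

assert torch.allclose(out_dataclass, out_tuple)
```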
@@ -512,7 +512,8 @@ def __call__( timestep=t, encoder_hidden_states=text_encoder_hidden_states, added_cond_kwargs=added_cond_kwargs, - ).sample + return_dict=False, + )[0] if do_classifier_free_guidance: noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index cc9a35e580b3..04810ddb6e0a 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -466,12 +466,12 @@ def __call__( mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], image_embeds: torch.FloatTensor, negative_image_embeds: torch.FloatTensor, + negative_prompt: Optional[Union[str, List[str]]] = None, height: int = 512, width: int = 512, num_inference_steps: int = 100, guidance_scale: float = 4.0, num_images_per_prompt: int = 1, - negative_prompt: Optional[Union[str, List[str]]] = None, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, output_type: Optional[str] = "pil", @@ -498,6 +498,9 @@ def __call__( The clip image embeddings for text prompt, that will be used to condition the image generation. negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): The clip image embeddings for negative text prompt, will be used to condition the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). height (`int`, *optional*, defaults to 512): The height in pixels of the generated image. width (`int`, *optional*, defaults to 512): @@ -513,9 +516,6 @@ def __call__( usually at the expense of lower image quality. num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). generator (`torch.Generator` or `List[torch.Generator]`, *optional*): One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation deterministic. @@ -629,7 +629,8 @@ def __call__( timestep=t, encoder_hidden_states=text_encoder_hidden_states, added_cond_kwargs=added_cond_kwargs, - ).sample + return_dict=False, + )[0] if do_classifier_free_guidance: noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index d9474b43da54..0c262c57abc0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -116,14 +116,14 @@ class KandinskyPriorPipelineOutput(BaseOutput): Output class for KandinskyPriorPipeline. 
Args: - images (`torch.FloatTensor`) + image_embeds (`torch.FloatTensor`) clip image embeddings for text prompt - zero_embeds (`List[PIL.Image.Image]` or `np.ndarray`) + negative_image_embeds (`List[PIL.Image.Image]` or `np.ndarray`) clip image embeddings for unconditional tokens """ - images: Union[torch.FloatTensor, np.ndarray] - zero_embeds: Union[torch.FloatTensor, np.ndarray] + image_embeds: Union[torch.FloatTensor, np.ndarray] + negative_image_embeds: Union[torch.FloatTensor, np.ndarray] class KandinskyPriorPipeline(DiffusionPipeline): @@ -231,7 +231,7 @@ def interpolate( image_embeddings = [] for cond, weight in zip(images_and_prompts, weights): if isinstance(cond, str): - image_emb = self.__call__( + image_emb = self( cond, num_inference_steps=num_inference_steps, num_images_per_prompt=num_images_per_prompt, @@ -239,7 +239,7 @@ def interpolate( latents=latents, negative_prompt=negative_prior_prompt, guidance_scale=guidance_scale, - ).images + ).image_embeds elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): if isinstance(cond, PIL.Image.Image): @@ -261,7 +261,7 @@ def interpolate( image_emb = torch.cat(image_embeddings).sum(dim=0, keepdim=True) - out_zero = self.__call__( + out_zero = self( negative_prompt, num_inference_steps=num_inference_steps, num_images_per_prompt=num_images_per_prompt, @@ -270,9 +270,9 @@ def interpolate( negative_prompt=negative_prior_prompt, guidance_scale=guidance_scale, ) - zero_image_emb = out_zero.zero_embeds if negative_prompt == "" else out_zero.images + zero_image_emb = out_zero.negative_image_embeds if negative_prompt == "" else out_zero.image_embeds - return image_emb, zero_image_emb + return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb) def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: @@ -435,11 +435,11 @@ def _encode_prompt( def __call__( self, prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, num_inference_steps: int = 25, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, guidance_scale: float = 4.0, output_type: Optional[str] = "pt", # pt only return_dict: bool = True, @@ -450,6 +450,9 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). num_images_per_prompt (`int`, *optional*, defaults to 1): The number of images to generate per prompt. num_inference_steps (`int`, *optional*, defaults to 100): @@ -462,9 +465,6 @@ def __call__( Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image generation. Can be used to tweak the same generation with different prompts. If not provided, a latents tensor will ge generated by sampling using the supplied random `generator`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). guidance_scale (`float`, *optional*, defaults to 4.0): Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). 
`guidance_scale` is defined as `w` of equation 2. of [Imagen @@ -484,14 +484,24 @@ def __call__( """ if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: + prompt = [prompt] + elif not isinstance(prompt, list): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif not isinstance(negative_prompt, list) and negative_prompt is not None: + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + # if the negative prompt is defined we double the batch size to + # directly retrieve the negative prompt embedding + if negative_prompt is not None: + prompt = prompt + negative_prompt + negative_prompt = 2 * negative_prompt + device = self._execution_device + batch_size = len(prompt) batch_size = batch_size * num_images_per_prompt do_classifier_free_guidance = guidance_scale > 1.0 @@ -548,7 +558,12 @@ def __call__( latents = self.prior.post_process_latents(latents) image_embeddings = latents - zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + + # if negative prompt has been defined, we retrieve split the image embedding into two + if negative_prompt is None: + zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + else: + image_embeddings, zero_embeds = image_embeddings.chunk(2) if output_type not in ["pt", "np"]: raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") @@ -560,4 +575,4 @@ def __call__( if not return_dict: return (image_embeddings, zero_embeds) - return KandinskyPriorPipelineOutput(images=image_embeddings, zero_embeds=zero_embeds) + return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) diff --git a/tests/pipelines/kandinsky/test_kandinsky.py b/tests/pipelines/kandinsky/test_kandinsky.py index 8f7d5ae2019c..239433910b45 100644 --- a/tests/pipelines/kandinsky/test_kandinsky.py +++ b/tests/pipelines/kandinsky/test_kandinsky.py @@ -258,12 +258,12 @@ def test_kandinsky_text2img(self): prompt = "red cat, 4k photo" generator = torch.Generator(device="cuda").manual_seed(0) - image_emb = pipe_prior( + image_emb, zero_image_emb = pipe_prior( prompt, generator=generator, num_inference_steps=5, - ).images - zero_image_emb = pipe_prior("", num_inference_steps=5).images + negative_prompt="", + ).to_tuple() generator = torch.Generator(device="cuda").manual_seed(0) output = pipeline( diff --git a/tests/pipelines/kandinsky/test_kandinsky_img2img.py b/tests/pipelines/kandinsky/test_kandinsky_img2img.py index 6958403ae11c..94817b3eed4b 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_img2img.py +++ b/tests/pipelines/kandinsky/test_kandinsky_img2img.py @@ -276,12 +276,12 @@ def test_kandinsky_img2img(self): pipeline.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) - image_emb = pipe_prior( + image_emb, zero_image_emb = pipe_prior( prompt, generator=generator, num_inference_steps=5, - ).images - zero_image_emb = pipe_prior("", num_inference_steps=5).images + negative_prompt="", + ).to_tuple() output = pipeline( prompt, diff --git a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py index 1bca753bec18..46926479ae06 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_inpaint.py +++ b/tests/pipelines/kandinsky/test_kandinsky_inpaint.py @@ 
-286,12 +286,12 @@ def test_kandinsky_inpaint(self): pipeline.set_progress_bar_config(disable=None) generator = torch.Generator(device="cpu").manual_seed(0) - image_emb = pipe_prior( + image_emb, zero_image_emb = pipe_prior( prompt, generator=generator, num_inference_steps=5, - ).images - zero_image_emb = pipe_prior("").images + negative_prompt="", + ).to_tuple() output = pipeline( prompt, diff --git a/tests/pipelines/kandinsky/test_kandinsky_prior.py b/tests/pipelines/kandinsky/test_kandinsky_prior.py index 5ed1f2ac984d..d9c260eabc06 100644 --- a/tests/pipelines/kandinsky/test_kandinsky_prior.py +++ b/tests/pipelines/kandinsky/test_kandinsky_prior.py @@ -194,7 +194,7 @@ def test_kandinsky_prior(self): pipe.set_progress_bar_config(disable=None) output = pipe(**self.get_dummy_inputs(device)) - image = output.images + image = output.image_embeds image_from_tuple = pipe( **self.get_dummy_inputs(device), diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 3ddfd35defb7..8ce0a0f283d7 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -650,7 +650,7 @@ def test_num_images_per_prompt(self): if key in self.batch_params: inputs[key] = batch_size * [inputs[key]] - images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt).images + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0] assert images.shape[0] == batch_size * num_images_per_prompt From 8e552bb4fe33363762864c62a60d456b1cf1e973 Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Fri, 2 Jun 2023 21:10:24 +0900 Subject: [PATCH 062/199] Support Kohya-ss style LoRA file format (in a limited capacity) (#3437) * add _convert_kohya_lora_to_diffusers * make style * add scaffold * match result: unet attention only * fix monkey-patch for text_encoder * with CLIPAttention While the terrible images are no longer produced, the results do not match those from the hook ver. This may be due to not setting the network_alpha value. * add to support network_alpha * generate diff image * fix monkey-patch for text_encoder * add test_text_encoder_lora_monkey_patch() * verify that it's okay to release the attn_procs * fix closure version * add comment * Revert "fix monkey-patch for text_encoder" This reverts commit bb9c61e6faecc1935c9c4319c77065837655d616. * Fix to reuse utility functions * make LoRAAttnProcessor targets to self_attn * fix LoRAAttnProcessor target * make style * fix split key * Update src/diffusers/loaders.py * remove TEXT_ENCODER_TARGET_MODULES loop * add print memory usage * remove test_kohya_loras_scaffold.py * add: doc on LoRA civitai * remove print statement and refactor in the doc. 
* fix state_dict test for kohya-ss style lora * Apply suggestions from code review Co-authored-by: Takuma Mori --------- Co-authored-by: Sayak Paul --- docs/source/en/training/lora.mdx | 73 ++++++++++++++- examples/dreambooth/train_dreambooth_lora.py | 6 +- src/diffusers/loaders.py | 95 +++++++++++++++++--- src/diffusers/models/attention_processor.py | 45 ++++++---- src/diffusers/utils/__init__.py | 1 + src/diffusers/utils/constants.py | 1 + tests/models/test_lora_layers.py | 87 +++++++++++++++++- 7 files changed, 272 insertions(+), 36 deletions(-) diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx index 748d99d5020d..484b08ce950a 100644 --- a/docs/source/en/training/lora.mdx +++ b/docs/source/en/training/lora.mdx @@ -272,4 +272,75 @@ Note that the use of [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] is * LoRA parameters that have separate identifiers for the UNet and the text encoder such as: [`"sayakpaul/dreambooth"`](https://huggingface.co/sayakpaul/dreambooth). **Note** that it is possible to provide a local directory path to [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] as well as [`~diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs`]. To know about the supported inputs, -refer to the respective docstrings. \ No newline at end of file +refer to the respective docstrings. + +## Supporting A1111 themed LoRA checkpoints from Diffusers + +To provide seamless interoperability with A1111 to our users, we support loading A1111 formatted +LoRA checkpoints using [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] in a limited capacity. +In this section, we explain how to load an A1111 formatted LoRA checkpoint from [CivitAI](https://civitai.com/) +in Diffusers and perform inference with it. + +First, download a checkpoint. We'll use +[this one](https://civitai.com/models/13239/light-and-shadow) for demonstration purposes. + +```bash +wget https://civitai.com/api/download/models/15603 -O light_and_shadow.safetensors +``` + +Next, we initialize a [`~DiffusionPipeline`]: + +```python +import torch + +from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler + +pipeline = StableDiffusionPipeline.from_pretrained( + "gsdf/Counterfeit-V2.5", torch_dtype=torch.float16, safety_checker=None +).to("cuda") +pipeline.scheduler = DPMSolverMultistepScheduler.from_config( + pipeline.scheduler.config, use_karras_sigmas=True +) +``` + +We then load the checkpoint downloaded from CivitAI: + +```python +pipeline.load_lora_weights(".", weight_name="light_and_shadow.safetensors") +``` + + + +If you're loading a checkpoint in the `safetensors` format, please ensure you have `safetensors` installed. 
+ + + +And then it's time for running inference: + +```python +prompt = "masterpiece, best quality, 1girl, at dusk" +negative_prompt = ("(low quality, worst quality:1.4), (bad anatomy), (inaccurate limb:1.2), " + "bad composition, inaccurate eyes, extra digit, fewer digits, (extra arms:1.2), large breasts") + +images = pipeline(prompt=prompt, + negative_prompt=negative_prompt, + width=512, + height=768, + num_inference_steps=15, + num_images_per_prompt=4, + generator=torch.manual_seed(0) +).images +``` + +Below is a comparison between the LoRA and the non-LoRA results: + +![lora_non_lora](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lora_non_lora_comparison.png) + +You have a similar checkpoint stored on the Hugging Face Hub, you can load it +directly with [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] like so: + +```python +lora_model_id = "sayakpaul/civitai-light-shadow-lora" +lora_filename = "light_and_shadow.safetensors" +pipeline.load_lora_weights(lora_model_id, weight_name=lora_filename) +``` \ No newline at end of file diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 12b09089186d..319348bd40bb 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -58,7 +58,7 @@ SlicedAttnAddedKVProcessor, ) from diffusers.optimization import get_scheduler -from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, check_min_version, is_wandb_available +from diffusers.utils import TEXT_ENCODER_ATTN_MODULE, check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.torch_utils import randn_tensor @@ -861,9 +861,9 @@ def main(args): if args.train_text_encoder: text_lora_attn_procs = {} for name, module in text_encoder.named_modules(): - if any(x in name for x in TEXT_ENCODER_TARGET_MODULES): + if name.endswith(TEXT_ENCODER_ATTN_MODULE): text_lora_attn_procs[name] = LoRAAttnProcessor( - hidden_size=module.out_features, cross_attention_dim=None + hidden_size=module.out_proj.out_features, cross_attention_dim=None ) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) temp_pipeline = DiffusionPipeline.from_pretrained( diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 84e6b4e61f0f..42625270c12e 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -72,8 +72,8 @@ def __init__(self, state_dict: Dict[str, torch.Tensor]): self.mapping = dict(enumerate(state_dict.keys())) self.rev_mapping = {v: k for k, v in enumerate(state_dict.keys())} - # .processor for unet, .k_proj, ".q_proj", ".v_proj", and ".out_proj" for text encoder - self.split_keys = [".processor", ".k_proj", ".q_proj", ".v_proj", ".out_proj"] + # .processor for unet, .self_attn for text encoder + self.split_keys = [".processor", ".self_attn"] # we add a hook to state_dict() and load_state_dict() so that the # naming fits with `unet.attn_processors` @@ -182,6 +182,9 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict subfolder = kwargs.pop("subfolder", None) weight_name = kwargs.pop("weight_name", None) use_safetensors = kwargs.pop("use_safetensors", None) + # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. 
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning + network_alpha = kwargs.pop("network_alpha", None) if use_safetensors and not is_safetensors_available(): raise ValueError( @@ -287,7 +290,10 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict attn_processor_class = LoRAAttnProcessor attn_processors[key] = attn_processor_class( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + rank=rank, + network_alpha=network_alpha, ) attn_processors[key].load_state_dict(value_dict) elif is_custom_diffusion: @@ -774,6 +780,8 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di + We support loading A1111 formatted LoRA checkpoints in a limited capacity. + This function is experimental and might change in the future. @@ -898,6 +906,11 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di else: state_dict = pretrained_model_name_or_path_or_dict + # Convert kohya-ss Style LoRA attn procs to diffusers attn procs + network_alpha = None + if all((k.startswith("lora_te_") or k.startswith("lora_unet_")) for k in state_dict.keys()): + state_dict, network_alpha = self._convert_kohya_lora_to_diffusers(state_dict) + # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918), # then the `state_dict` keys should have `self.unet_name` and/or `self.text_encoder_name` as # their prefixes. @@ -909,7 +922,7 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di unet_lora_state_dict = { k.replace(f"{self.unet_name}.", ""): v for k, v in state_dict.items() if k in unet_keys } - self.unet.load_attn_procs(unet_lora_state_dict) + self.unet.load_attn_procs(unet_lora_state_dict, network_alpha=network_alpha) # Load the layers corresponding to text encoder and make necessary adjustments. text_encoder_keys = [k for k in keys if k.startswith(self.text_encoder_name)] @@ -918,7 +931,9 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di k.replace(f"{self.text_encoder_name}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys } if len(text_encoder_lora_state_dict) > 0: - attn_procs_text_encoder = self._load_text_encoder_attn_procs(text_encoder_lora_state_dict) + attn_procs_text_encoder = self._load_text_encoder_attn_procs( + text_encoder_lora_state_dict, network_alpha=network_alpha + ) self._modify_text_encoder(attn_procs_text_encoder) # save lora attn procs of text encoder so that it can be easily retrieved @@ -954,14 +969,20 @@ def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): module = self.text_encoder.get_submodule(name) # Construct a new function that performs the LoRA merging. We will monkey patch # this forward pass. 
- lora_layer = getattr(attn_processors[name], self._get_lora_layer_attribute(name)) + attn_processor_name = ".".join(name.split(".")[:-1]) + lora_layer = getattr(attn_processors[attn_processor_name], self._get_lora_layer_attribute(name)) old_forward = module.forward - def new_forward(x): - return old_forward(x) + lora_layer(x) + # create a new scope that locks in the old_forward, lora_layer value for each new_forward function + # for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060 + def make_new_forward(old_forward, lora_layer): + def new_forward(x): + return old_forward(x) + lora_layer(x) + + return new_forward # Monkey-patch. - module.forward = new_forward + module.forward = make_new_forward(old_forward, lora_layer) def _get_lora_layer_attribute(self, name: str) -> str: if "q_proj" in name: @@ -1048,6 +1069,7 @@ def _load_text_encoder_attn_procs( subfolder = kwargs.pop("subfolder", None) weight_name = kwargs.pop("weight_name", None) use_safetensors = kwargs.pop("use_safetensors", None) + network_alpha = kwargs.pop("network_alpha", None) if use_safetensors and not is_safetensors_available(): raise ValueError( @@ -1125,7 +1147,10 @@ def _load_text_encoder_attn_procs( hidden_size = value_dict["to_k_lora.up.weight"].shape[0] attn_processors[key] = LoRAAttnProcessor( - hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + rank=rank, + network_alpha=network_alpha, ) attn_processors[key].load_state_dict(value_dict) @@ -1219,6 +1244,56 @@ def save_function(weights, filename): save_function(state_dict, os.path.join(save_directory, weight_name)) logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}") + def _convert_kohya_lora_to_diffusers(self, state_dict): + unet_state_dict = {} + te_state_dict = {} + network_alpha = None + + for key, value in state_dict.items(): + if "lora_down" in key: + lora_name = key.split(".")[0] + lora_name_up = lora_name + ".lora_up.weight" + lora_name_alpha = lora_name + ".alpha" + if lora_name_alpha in state_dict: + alpha = state_dict[lora_name_alpha].item() + if network_alpha is None: + network_alpha = alpha + elif network_alpha != alpha: + raise ValueError("Network alpha is not consistent") + + if lora_name.startswith("lora_unet_"): + diffusers_name = key.replace("lora_unet_", "").replace("_", ".") + diffusers_name = diffusers_name.replace("down.blocks", "down_blocks") + diffusers_name = diffusers_name.replace("mid.block", "mid_block") + diffusers_name = diffusers_name.replace("up.blocks", "up_blocks") + diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks") + diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora") + if "transformer_blocks" in diffusers_name: + if "attn1" in diffusers_name or "attn2" in diffusers_name: + diffusers_name = diffusers_name.replace("attn1", "attn1.processor") + diffusers_name = diffusers_name.replace("attn2", "attn2.processor") + unet_state_dict[diffusers_name] = value + unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up] + elif lora_name.startswith("lora_te_"): + diffusers_name = key.replace("lora_te_", "").replace("_", ".") + diffusers_name = diffusers_name.replace("text.model", "text_model") + 
diffusers_name = diffusers_name.replace("self.attn", "self_attn") + diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora") + diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora") + diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora") + diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora") + if "self_attn" in diffusers_name: + te_state_dict[diffusers_name] = value + te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up] + + unet_state_dict = {f"{UNET_NAME}.{module_name}": params for module_name, params in unet_state_dict.items()} + te_state_dict = {f"{TEXT_ENCODER_NAME}.{module_name}": params for module_name, params in te_state_dict.items()} + new_state_dict = {**unet_state_dict, **te_state_dict} + return new_state_dict, network_alpha + class FromCkptMixin: """This helper class allows to directly load .ckpt stable diffusion file_extension diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index e39bdc0429c1..61a1faea07f4 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -508,7 +508,7 @@ def __call__( class LoRALinearLayer(nn.Module): - def __init__(self, in_features, out_features, rank=4): + def __init__(self, in_features, out_features, rank=4, network_alpha=None): super().__init__() if rank > min(in_features, out_features): @@ -516,6 +516,10 @@ def __init__(self, in_features, out_features, rank=4): self.down = nn.Linear(in_features, rank, bias=False) self.up = nn.Linear(rank, out_features, bias=False) + # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script. + # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning + self.network_alpha = network_alpha + self.rank = rank nn.init.normal_(self.down.weight, std=1 / rank) nn.init.zeros_(self.up.weight) @@ -527,6 +531,9 @@ def forward(self, hidden_states): down_hidden_states = self.down(hidden_states.to(dtype)) up_hidden_states = self.up(down_hidden_states) + if self.network_alpha is not None: + up_hidden_states *= self.network_alpha / self.rank + return up_hidden_states.to(orig_dtype) @@ -543,17 +550,17 @@ class LoRAAttnProcessor(nn.Module): The dimension of the LoRA update matrices. 
""" - def __init__(self, hidden_size, cross_attention_dim=None, rank=4): + def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None): super().__init__() self.hidden_size = hidden_size self.cross_attention_dim = cross_attention_dim self.rank = rank - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank) - self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank) - self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank) + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) def __call__( self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None @@ -838,19 +845,19 @@ class LoRAAttnAddedKVProcessor(nn.Module): The dimension of the LoRA update matrices. """ - def __init__(self, hidden_size, cross_attention_dim=None, rank=4): + def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None): super().__init__() self.hidden_size = hidden_size self.cross_attention_dim = cross_attention_dim self.rank = rank - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank) - self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank) - self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank) - self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank) - self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank) + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0): residual = hidden_states @@ -1157,7 +1164,9 @@ class LoRAXFormersAttnProcessor(nn.Module): operator. 
""" - def __init__(self, hidden_size, cross_attention_dim, rank=4, attention_op: Optional[Callable] = None): + def __init__( + self, hidden_size, cross_attention_dim, rank=4, attention_op: Optional[Callable] = None, network_alpha=None + ): super().__init__() self.hidden_size = hidden_size @@ -1165,10 +1174,10 @@ def __init__(self, hidden_size, cross_attention_dim, rank=4, attention_op: Optio self.rank = rank self.attention_op = attention_op - self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank) - self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank) - self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank) - self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank) + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) def __call__( self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0, temb=None diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index cd3a1b8f3dd4..772c36b1177b 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -30,6 +30,7 @@ ONNX_EXTERNAL_WEIGHTS_NAME, ONNX_WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME, + TEXT_ENCODER_ATTN_MODULE, TEXT_ENCODER_TARGET_MODULES, WEIGHTS_NAME, ) diff --git a/src/diffusers/utils/constants.py b/src/diffusers/utils/constants.py index 1134ba6fb656..93d5c8cc42cd 100644 --- a/src/diffusers/utils/constants.py +++ b/src/diffusers/utils/constants.py @@ -31,3 +31,4 @@ HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules")) DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"] TEXT_ENCODER_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "out_proj"] +TEXT_ENCODER_ATTN_MODULE = ".self_attn" diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 64e30ba4057d..d04d87e08b7a 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import gc import os import tempfile import unittest @@ -30,7 +31,7 @@ LoRAXFormersAttnProcessor, XFormersAttnProcessor, ) -from diffusers.utils import TEXT_ENCODER_TARGET_MODULES, floats_tensor, torch_device +from diffusers.utils import TEXT_ENCODER_ATTN_MODULE, floats_tensor, torch_device def create_unet_lora_layers(unet: nn.Module): @@ -50,15 +51,35 @@ def create_unet_lora_layers(unet: nn.Module): return lora_attn_procs, unet_lora_layers -def create_text_encoder_lora_layers(text_encoder: nn.Module): +def create_text_encoder_lora_attn_procs(text_encoder: nn.Module): text_lora_attn_procs = {} for name, module in text_encoder.named_modules(): - if any(x in name for x in TEXT_ENCODER_TARGET_MODULES): - text_lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=module.out_features, cross_attention_dim=None) + if name.endswith(TEXT_ENCODER_ATTN_MODULE): + text_lora_attn_procs[name] = LoRAAttnProcessor( + hidden_size=module.out_proj.out_features, cross_attention_dim=None + ) + return text_lora_attn_procs + + +def create_text_encoder_lora_layers(text_encoder: nn.Module): + text_lora_attn_procs = create_text_encoder_lora_attn_procs(text_encoder) text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs) return text_encoder_lora_layers +def set_lora_up_weights(text_lora_attn_procs, randn_weight=False): + for _, attn_proc in text_lora_attn_procs.items(): + # set up.weights + for layer_name, layer_module in attn_proc.named_modules(): + if layer_name.endswith("_lora"): + weight = ( + torch.randn_like(layer_module.up.weight) + if randn_weight + else torch.zeros_like(layer_module.up.weight) + ) + layer_module.up.weight = torch.nn.Parameter(weight) + + class LoraLoaderMixinTests(unittest.TestCase): def get_dummy_components(self): torch.manual_seed(0) @@ -220,6 +241,64 @@ def test_lora_save_load_legacy(self): # Outputs shouldn't match. self.assertFalse(torch.allclose(torch.from_numpy(orig_image_slice), torch.from_numpy(lora_image_slice))) + # copied from: https://colab.research.google.com/gist/sayakpaul/df2ef6e1ae6d8c10a49d859883b10860/scratchpad.ipynb + def get_dummy_tokens(self): + max_seq_length = 77 + + inputs = torch.randint(2, 56, size=(1, max_seq_length), generator=torch.manual_seed(0)) + + prepared_inputs = {} + prepared_inputs["input_ids"] = inputs + return prepared_inputs + + def test_text_encoder_lora_monkey_patch(self): + pipeline_components, _ = self.get_dummy_components() + pipe = StableDiffusionPipeline(**pipeline_components) + + dummy_tokens = self.get_dummy_tokens() + + # inference without lora + outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0] + assert outputs_without_lora.shape == (1, 77, 32) + + # create lora_attn_procs with zeroed out up.weights + text_attn_procs = create_text_encoder_lora_attn_procs(pipe.text_encoder) + set_lora_up_weights(text_attn_procs, randn_weight=False) + + # monkey patch + pipe._modify_text_encoder(text_attn_procs) + + # verify that it's okay to release the text_attn_procs which holds the LoRAAttnProcessor. 
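+        # (the monkey-patched forwards keep their own references to the LoRA layers, so the patched text encoder should keep working after this dict is released)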
+ del text_attn_procs + gc.collect() + + # inference with lora + outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0] + assert outputs_with_lora.shape == (1, 77, 32) + + assert torch.allclose( + outputs_without_lora, outputs_with_lora + ), "lora_up_weights are all zero, so the lora outputs should be the same as the outputs without lora" + + # create lora_attn_procs with randn up.weights + text_attn_procs = create_text_encoder_lora_attn_procs(pipe.text_encoder) + set_lora_up_weights(text_attn_procs, randn_weight=True) + + # monkey patch + pipe._modify_text_encoder(text_attn_procs) + + # verify that it's okay to release the text_attn_procs which holds the LoRAAttnProcessor. + del text_attn_procs + gc.collect() + + # inference with lora + outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0] + assert outputs_with_lora.shape == (1, 77, 32) + + assert not torch.allclose( + outputs_without_lora, outputs_with_lora + ), "lora_up_weights are not zero, so the lora outputs should be different from the outputs without lora" + def create_lora_weight_file(self, tmpdirname): _, lora_components = self.get_dummy_components() LoraLoaderMixin.save_lora_weights( From a6c7b5b6b7a80f37e8cb940fb58bf245476358b2 Mon Sep 17 00:00:00 2001 From: Lachlan Nicholson <57090563+lachlan-nicholson@users.noreply.github.com> Date: Sat, 3 Jun 2023 01:10:22 +1000 Subject: [PATCH 063/199] Iterate over unique tokens to avoid duplicate replacements for multivector embeddings (#3588) * iterate over unique tokens to avoid duplicate replacements * added test for multiple references to multi embedding * adhere to black formatting * reorder test post-rebase --- src/diffusers/loaders.py | 3 ++- tests/pipelines/test_pipelines.py | 12 ++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 42625270c12e..e657406912f2 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -462,7 +462,8 @@ def _maybe_convert_prompt(self, prompt: str, tokenizer: "PreTrainedTokenizer"): `str`: The converted prompt """ tokens = tokenizer.tokenize(prompt) - for token in tokens: + unique_tokens = set(tokens) + for token in unique_tokens: if token in tokenizer.added_tokens_encoder: replacement = token i = 1 diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index bb7c980875ef..5af3a6c16b40 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -722,6 +722,18 @@ def test_text_inversion_download(self): out = pipe(prompt, num_inference_steps=1, output_type="numpy").images assert out.shape == (1, 128, 128, 3) + + # multiple references to multi embedding + ten = {"": torch.ones(3, 32)} + pipe.load_textual_inversion(ten) + + assert ( + pipe._maybe_convert_prompt(" ", pipe.tokenizer) == " _1 _2 _1 _2" + ) + + prompt = "hey " + out = pipe(prompt, num_inference_steps=1, output_type="numpy").images + assert out.shape == (1, 128, 128, 3) + def test_download_ignore_files(self): # Check https://huggingface.co/hf-internal-testing/tiny-stable-diffusion-pipe-ignore-files/blob/72f58636e5508a218c6b3f60550dc96445547817/model_index.json#L4 with tempfile.TemporaryDirectory() as tmpdirname: From f1d47433946210fa6837c79268eb1f65bf767ea4 Mon Sep 17 00:00:00 2001 From: Kashif Rasul Date: Fri, 2 Jun 2023 17:24:54 +0200 Subject: [PATCH 064/199] fixed typo in example train_text_to_image.py (#3608) fixed typo --- examples/text_to_image/train_text_to_image.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git
a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 82370fc4e2dd..bbf7bf9b85bb 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -115,7 +115,7 @@ def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight def parse_args(): parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( - "--input_pertubation", type=float, default=0, help="The scale of input pretubation. Recommended 0.1." + "--input_perturbation", type=float, default=0, help="The scale of input perturbation. Recommended 0.1." ) parser.add_argument( "--pretrained_model_name_or_path", @@ -830,8 +830,8 @@ def collate_fn(examples): noise += args.noise_offset * torch.randn( (latents.shape[0], latents.shape[1], 1, 1), device=latents.device ) - if args.input_pertubation: - new_noise = noise + args.input_pertubation * torch.randn_like(noise) + if args.input_perturbation: + new_noise = noise + args.input_perturbation * torch.randn_like(noise) bsz = latents.shape[0] # Sample a random timestep for each image timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) @@ -839,7 +839,7 @@ def collate_fn(examples): # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) - if args.input_pertubation: + if args.input_perturbation: noisy_latents = noise_scheduler.add_noise(latents, new_noise, timesteps) else: noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) From 0e8688113a9a1def80be33efc4665b8b719efe40 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 2 Jun 2023 06:03:15 -1000 Subject: [PATCH 065/199] fix inpainting pipeline when providing initial latents (#3641) * fix latents * fix copies --------- Co-authored-by: yiyixuxu --- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 4 ++-- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 821a93028c5d..8ebcac2589a3 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -868,8 +868,8 @@ def prepare_latents( # if pure noise then scale the initial latents by the Scheduler's init sigma latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents else: - latents = latents.to(device) - latents = latents * self.scheduler.init_noise_sigma + noise = latents.to(device) + latents = noise * self.scheduler.init_noise_sigma outputs = (latents,) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 534748c35363..9a6f5dbf9480 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -653,8 +653,8 @@ def prepare_latents( # if pure noise then scale the initial latents by the Scheduler's init sigma latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents else: - latents = latents.to(device) - latents = latents * self.scheduler.init_noise_sigma + noise = latents.to(device) + latents = noise * 
self.scheduler.init_noise_sigma outputs = (latents,) From 0dbdc0cbae466a10df146bf61db489fb447029b3 Mon Sep 17 00:00:00 2001 From: Kadir Nar Date: Fri, 2 Jun 2023 19:53:09 +0300 Subject: [PATCH 066/199] [Community Doc] Updated the filename and readme file. (#3634) * Updated the filename and readme file. * reformatter * reformetter --- examples/community/README.md | 9 +- examples/community/mixture_tiling.py | 407 +++++++++++++++++++++++++++ 2 files changed, 412 insertions(+), 4 deletions(-) create mode 100644 examples/community/mixture_tiling.py diff --git a/examples/community/README.md b/examples/community/README.md index 8afd3aed18a6..1c4a36614d80 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -1605,13 +1605,12 @@ pipe_images = mixing_pipeline( This pipeline uses the Mixture. Refer to the [Mixture](https://arxiv.org/abs/2302.02412) paper for more details. ```python -from diffusers import LMSDiscreteScheduler -from mixdiff import StableDiffusionTilingPipeline +from diffusers import LMSDiscreteScheduler, DiffusionPipeline # Creater scheduler and model (similar to StableDiffusionPipeline) scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) -pipeline = StableDiffusionTilingPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler) -pipeline.to("cuda:0") +pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler, custom_pipeline="mixture_tiling") +pipeline.to("cuda") # Mixture of Diffusers generation image = pipeline( @@ -1629,3 +1628,5 @@ image = pipeline( num_inference_steps=50, )["images"][0] ``` +![mixture_tiling_results](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/mixture_tiling.png) + diff --git a/examples/community/mixture_tiling.py b/examples/community/mixture_tiling.py new file mode 100644 index 000000000000..83389887455e --- /dev/null +++ b/examples/community/mixture_tiling.py @@ -0,0 +1,407 @@ +import inspect +from copy import deepcopy +from enum import Enum +from typing import List, Optional, Tuple, Union + +import torch +from tqdm.auto import tqdm + +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker +from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler +from diffusers.utils import logging + + +try: + from ligo.segments import segment + from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer +except ImportError: + raise ImportError("Please install transformers and ligo-segments to use the mixture pipeline") + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import LMSDiscreteScheduler, DiffusionPipeline + + >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000) + >>> pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler, custom_pipeline="mixture_tiling") + >>> pipeline.to("cuda") + + >>> image = pipeline( + >>> prompt=[[ + >>> "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + >>> "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, 
highly detailed, smooth, sharp focus, artstation, stunning masterpiece", + >>> "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece" + >>> ]], + >>> tile_height=640, + >>> tile_width=640, + >>> tile_row_overlap=0, + >>> tile_col_overlap=256, + >>> guidance_scale=8, + >>> seed=7178915308, + >>> num_inference_steps=50, + >>> )["images"][0] + ``` +""" + + +def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): + """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image + + Returns a tuple with: + - Starting coordinates of rows in pixel space + - Ending coordinates of rows in pixel space + - Starting coordinates of columns in pixel space + - Ending coordinates of columns in pixel space + """ + px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap) + px_row_end = px_row_init + tile_height + px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap) + px_col_end = px_col_init + tile_width + return px_row_init, px_row_end, px_col_init, px_col_end + + +def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end): + """Translates coordinates in pixel space to coordinates in latent space""" + return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8 + + +def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): + """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image + + Returns a tuple with: + - Starting coordinates of rows in latent space + - Ending coordinates of rows in latent space + - Starting coordinates of columns in latent space + - Ending coordinates of columns in latent space + """ + px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices( + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end) + + +def _tile2latent_exclusive_indices( + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap, rows, columns +): + """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image + + Returns a tuple with: + - Starting coordinates of rows in latent space + - Ending coordinates of rows in latent space + - Starting coordinates of columns in latent space + - Ending coordinates of columns in latent space + """ + row_init, row_end, col_init, col_end = _tile2latent_indices( + tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + row_segment = segment(row_init, row_end) + col_segment = segment(col_init, col_end) + # Iterate over the rest of tiles, clipping the region for the current tile + for row in range(rows): + for column in range(columns): + if row != tile_row and column != tile_col: + clip_row_init, clip_row_end, clip_col_init, clip_col_end = _tile2latent_indices( + row, column, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + row_segment = row_segment - segment(clip_row_init, clip_row_end) + col_segment = col_segment - segment(clip_col_init, clip_col_end) + # return row_init, row_end, col_init, col_end + return row_segment[0], row_segment[1], col_segment[0], col_segment[1] + + +class StableDiffusionExtrasMixin: + """Mixin providing additional 
convenience method for Stable Diffusion pipelines""" + + def decode_latents(self, latents, cpu_vae=False): + """Decodes a given array of latents into pixel space""" + # scale and decode the image latents with vae + if cpu_vae: + lat = deepcopy(latents).cpu() + vae = deepcopy(self.vae).cpu() + else: + lat = latents + vae = self.vae + + lat = 1 / 0.18215 * lat + image = vae.decode(lat).sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + + return self.numpy_to_pil(image) + + +class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixin): + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, PNDMScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): + super().__init__() + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + + class SeedTilesMode(Enum): + """Modes in which the latents of a particular tile can be re-seeded""" + + FULL = "full" + EXCLUSIVE = "exclusive" + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[List[str]]], + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + eta: Optional[float] = 0.0, + seed: Optional[int] = None, + tile_height: Optional[int] = 512, + tile_width: Optional[int] = 512, + tile_row_overlap: Optional[int] = 256, + tile_col_overlap: Optional[int] = 256, + guidance_scale_tiles: Optional[List[List[float]]] = None, + seed_tiles: Optional[List[List[int]]] = None, + seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full", + seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None, + cpu_vae: Optional[bool] = False, + ): + r""" + Function to run the diffusion pipeline with tiling support. + + Args: + prompt: either a single string (no tiling) or a list of lists with all the prompts to use (one list for each row of tiles). This will also define the tiling structure. + num_inference_steps: number of diffusion steps. + guidance_scale: classifier-free guidance. + seed: general random seed to initialize latents. + tile_height: height in pixels of each grid tile. + tile_width: width in pixels of each grid tile. + tile_row_overlap: number of overlap pixels between tiles in consecutive rows. + tile_col_overlap: number of overlap pixels between tiles in consecutive columns. + guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used. + seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter. + seed_tiles_mode: either "full" or "exclusive". If "full", all the latents affected by the tile will be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden. + seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overridden using the given seed. Takes priority over seed_tiles. + cpu_vae: the decoder from latent space to pixel space can require too much GPU RAM for large images.
If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues. + + Examples: + + Returns: + A PIL image with the generated image. + + """ + if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt): + raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}") + grid_rows = len(prompt) + grid_cols = len(prompt[0]) + if not all(len(row) == grid_cols for row in prompt): + raise ValueError("All prompt rows must have the same number of prompt columns") + if not isinstance(seed_tiles_mode, str) and ( + not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode) + ): + raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") + if isinstance(seed_tiles_mode, str): + seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] + if any( + mode not in (modes := [mode.value for mode in self.SeedTilesMode]) + for row in seed_tiles_mode + for mode in row + ): + raise ValueError(f"Seed tiles mode must be one of {modes}") + if seed_reroll_regions is None: + seed_reroll_regions = [] + batch_size = 1 + + # create original noisy latents using the timesteps + height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap) + width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap) + latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8) + generator = torch.Generator("cuda").manual_seed(seed) + latents = torch.randn(latents_shape, generator=generator, device=self.device) + + # overwrite latents for specific tiles if provided + if seed_tiles is not None: + for row in range(grid_rows): + for col in range(grid_cols): + if (seed_tile := seed_tiles[row][col]) is not None: + mode = seed_tiles_mode[row][col] + if mode == self.SeedTilesMode.FULL.value: + row_init, row_end, col_init, col_end = _tile2latent_indices( + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + else: + row_init, row_end, col_init, col_end = _tile2latent_exclusive_indices( + row, + col, + tile_width, + tile_height, + tile_row_overlap, + tile_col_overlap, + grid_rows, + grid_cols, + ) + tile_generator = torch.Generator("cuda").manual_seed(seed_tile) + tile_shape = (latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init) + latents[:, :, row_init:row_end, col_init:col_end] = torch.randn( + tile_shape, generator=tile_generator, device=self.device + ) + + # overwrite again for seed reroll regions + for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions: + row_init, row_end, col_init, col_end = _pixel2latent_indices( + row_init, row_end, col_init, col_end + ) # to latent space coordinates + reroll_generator = torch.Generator("cuda").manual_seed(seed_reroll) + region_shape = (latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init) + latents[:, :, row_init:row_end, col_init:col_end] = torch.randn( + region_shape, generator=reroll_generator, device=self.device + ) + + # Prepare scheduler + accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) + extra_set_kwargs = {} + if accepts_offset: + extra_set_kwargs["offset"] = 1 + self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) + # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas + if isinstance(self.scheduler, 
LMSDiscreteScheduler): + latents = latents * self.scheduler.sigmas[0] + + # get prompts text embeddings + text_input = [ + [ + self.tokenizer( + col, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + for col in row + ] + for row in prompt + ] + text_embeddings = [[self.text_encoder(col.input_ids.to(self.device))[0] for col in row] for row in text_input] + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 # TODO: also active if any tile has guidance scale + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance: + for i in range(grid_rows): + for j in range(grid_cols): + max_length = text_input[i][j].input_ids.shape[-1] + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt" + ) + uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0] + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + text_embeddings[i][j] = torch.cat([uncond_embeddings, text_embeddings[i][j]]) + + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # Mask for tile weights strenght + tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size) + + # Diffusion timesteps + for i, t in tqdm(enumerate(self.scheduler.timesteps)): + # Diffuse each tile + noise_preds = [] + for row in range(grid_rows): + noise_preds_row = [] + for col in range(grid_cols): + px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end] + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([tile_latents] * 2) if do_classifier_free_guidance else tile_latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings[row][col])[ + "sample" + ] + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + guidance = ( + guidance_scale + if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None + else guidance_scale_tiles[row][col] + ) + noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) + noise_preds_row.append(noise_pred_tile) + noise_preds.append(noise_preds_row) + # Stitch noise predictions for all tiles + noise_pred = torch.zeros(latents.shape, device=self.device) + contributors = torch.zeros(latents.shape, device=self.device) + # Add each tile contribution to overall latents + for row in range(grid_rows): + for col in range(grid_cols): + 
px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( + row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap + ) + noise_pred[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += ( + noise_preds[row][col] * tile_weights + ) + contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights + # Average overlapping areas with more than 1 contributor + noise_pred /= contributors + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + # scale and decode the image latents with vae + image = self.decode_latents(latents, cpu_vae) + + return {"images": image} + + def _gaussian_weights(self, tile_width, tile_height, nbatches): + """Generates a gaussian mask of weights for tile contributions""" + import numpy as np + from numpy import exp, pi, sqrt + + latent_width = tile_width // 8 + latent_height = tile_height // 8 + + var = 0.01 + midpoint = (latent_width - 1) / 2 # -1 because index goes from 0 to latent_width - 1 + x_probs = [ + exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var) + for x in range(latent_width) + ] + midpoint = latent_height / 2 + y_probs = [ + exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var) + for y in range(latent_height) + ] + + weights = np.outer(y_probs, x_probs) + return torch.tile(torch.tensor(weights, device=self.device), (nbatches, self.unet.config.in_channels, 1, 1)) From d3717e63681a83c618aaf9f19cc6ed45df6a7967 Mon Sep 17 00:00:00 2001 From: asfiyab-nvidia <117682710+asfiyab-nvidia@users.noreply.github.com> Date: Fri, 2 Jun 2023 10:14:31 -0700 Subject: [PATCH 067/199] add Stable Diffusion TensorRT Inpainting pipeline (#3642) * add tensorrt inpaint pipeline Signed-off-by: Asfiya Baig * run make style Signed-off-by: Asfiya Baig --------- Signed-off-by: Asfiya Baig Co-authored-by: Patrick von Platen --- examples/community/README.md | 43 + .../stable_diffusion_tensorrt_inpaint.py | 1088 +++++++++++++++++ 2 files changed, 1131 insertions(+) create mode 100755 examples/community/stable_diffusion_tensorrt_inpaint.py diff --git a/examples/community/README.md b/examples/community/README.md index 1c4a36614d80..065b46f5410c 100755 --- a/examples/community/README.md +++ b/examples/community/README.md @@ -37,6 +37,7 @@ If a community doesn't work as expected, please open an issue and ping the autho | TensorRT Stable Diffusion Image to Image Pipeline | Accelerates the Stable Diffusion Image2Image Pipeline using TensorRT | [TensorRT Stable Diffusion Image to Image Pipeline](#tensorrt-image2image-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) | | Stable Diffusion IPEX Pipeline | Accelerate Stable Diffusion inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion on IPEX](#stable-diffusion-on-ipex) | - | [Yingjie Han](https://github.com/yingjie-han/) | | CLIP Guided Images Mixing Stable Diffusion Pipeline | Сombine images using usual diffusion models. 
| [CLIP Guided Images Mixing Using Stable Diffusion](#clip-guided-images-mixing-with-stable-diffusion) | - | [Karachev Denis](https://github.com/TheDenk) | +| TensorRT Stable Diffusion Inpainting Pipeline | Accelerates the Stable Diffusion Inpainting Pipeline using TensorRT | [TensorRT Stable Diffusion Inpainting Pipeline](#tensorrt-inpainting-stable-diffusion-pipeline) | - | [Asfiya Baig](https://github.com/asfiyab-nvidia) | To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly. ```py @@ -1630,3 +1631,45 @@ image = pipeline( ``` ![mixture_tiling_results](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/mixture_tiling.png) +### TensorRT Inpainting Stable Diffusion Pipeline + +The TensorRT Pipeline can be used to accelerate the Inpainting Stable Diffusion Inference run. + +NOTE: The ONNX conversions and TensorRT engine build may take up to 30 minutes. + +```python +import requests +from io import BytesIO +from PIL import Image +import torch +from diffusers import PNDMScheduler +from diffusers.pipelines.stable_diffusion import StableDiffusionImg2ImgPipeline + +# Use the PNDMScheduler scheduler here instead +scheduler = PNDMScheduler.from_pretrained("stabilityai/stable-diffusion-2-inpainting", subfolder="scheduler") + + +pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-inpainting", + custom_pipeline="stable_diffusion_tensorrt_inpaint", + revision='fp16', + torch_dtype=torch.float16, + scheduler=scheduler, + ) + +# re-use cached folder to save ONNX models and TensorRT Engines +pipe.set_cached_folder("stabilityai/stable-diffusion-2-inpainting", revision='fp16',) + +pipe = pipe.to("cuda") + +url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" +response = requests.get(url) +input_image = Image.open(BytesIO(response.content)).convert("RGB") + +mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" +response = requests.get(mask_url) +mask_image = Image.open(BytesIO(response.content)).convert("RGB") + +prompt = "a mecha robot sitting on a bench" +image = pipe(prompt, image=input_image, mask_image=mask_image, strength=0.75,).images[0] +image.save('tensorrt_inpaint_mecha_robot.png') +``` \ No newline at end of file diff --git a/examples/community/stable_diffusion_tensorrt_inpaint.py b/examples/community/stable_diffusion_tensorrt_inpaint.py new file mode 100755 index 000000000000..44f3bf5049b8 --- /dev/null +++ b/examples/community/stable_diffusion_tensorrt_inpaint.py @@ -0,0 +1,1088 @@ +# +# Copyright 2023 The HuggingFace Inc. team. +# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import os +from collections import OrderedDict +from copy import copy +from typing import List, Optional, Union + +import numpy as np +import onnx +import onnx_graphsurgeon as gs +import PIL +import tensorrt as trt +import torch +from huggingface_hub import snapshot_download +from onnx import shape_inference +from polygraphy import cuda +from polygraphy.backend.common import bytes_from_path +from polygraphy.backend.onnx.loader import fold_constants +from polygraphy.backend.trt import ( + CreateConfig, + Profile, + engine_from_bytes, + engine_from_network, + network_from_onnx_path, + save_engine, +) +from polygraphy.backend.trt import util as trt_util +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.stable_diffusion import ( + StableDiffusionInpaintPipeline, + StableDiffusionPipelineOutput, + StableDiffusionSafetyChecker, +) +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image +from diffusers.schedulers import DDIMScheduler +from diffusers.utils import DIFFUSERS_CACHE, logging + + +""" +Installation instructions +python3 -m pip install --upgrade transformers diffusers>=0.16.0 +python3 -m pip install --upgrade tensorrt>=8.6.1 +python3 -m pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com +python3 -m pip install onnxruntime +""" + +TRT_LOGGER = trt.Logger(trt.Logger.ERROR) +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# Map of numpy dtype -> torch dtype +numpy_to_torch_dtype_dict = { + np.uint8: torch.uint8, + np.int8: torch.int8, + np.int16: torch.int16, + np.int32: torch.int32, + np.int64: torch.int64, + np.float16: torch.float16, + np.float32: torch.float32, + np.float64: torch.float64, + np.complex64: torch.complex64, + np.complex128: torch.complex128, +} +if np.version.full_version >= "1.24.0": + numpy_to_torch_dtype_dict[np.bool_] = torch.bool +else: + numpy_to_torch_dtype_dict[np.bool] = torch.bool + +# Map of torch dtype -> numpy dtype +torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()} + + +def device_view(t): + return cuda.DeviceView(ptr=t.data_ptr(), shape=t.shape, dtype=torch_to_numpy_dtype_dict[t.dtype]) + + +def preprocess_image(image): + """ + image: torch.Tensor + """ + w, h = image.size + w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h)) + image = np.array(image).astype(np.float32) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + image = torch.from_numpy(image).contiguous() + return 2.0 * image - 1.0 + + +class Engine: + def __init__(self, engine_path): + self.engine_path = engine_path + self.engine = None + self.context = None + self.buffers = OrderedDict() + self.tensors = OrderedDict() + + def __del__(self): + [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray)] + del self.engine + del self.context + del self.buffers + del self.tensors + + def build( + self, + onnx_path, + fp16, + input_profile=None, + enable_preview=False, + enable_all_tactics=False, + timing_cache=None, + workspace_size=0, + ): + logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}") + p = Profile() + if input_profile: + for name, dims in input_profile.items(): + assert 
len(dims) == 3 + p.add(name, min=dims[0], opt=dims[1], max=dims[2]) + + config_kwargs = {} + + config_kwargs["preview_features"] = [trt.PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805] + if enable_preview: + # Faster dynamic shapes made optional since it increases engine build time. + config_kwargs["preview_features"].append(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805) + if workspace_size > 0: + config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size} + if not enable_all_tactics: + config_kwargs["tactic_sources"] = [] + + engine = engine_from_network( + network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]), + config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **config_kwargs), + save_timing_cache=timing_cache, + ) + save_engine(engine, path=self.engine_path) + + def load(self): + logger.warning(f"Loading TensorRT engine: {self.engine_path}") + self.engine = engine_from_bytes(bytes_from_path(self.engine_path)) + + def activate(self): + self.context = self.engine.create_execution_context() + + def allocate_buffers(self, shape_dict=None, device="cuda"): + for idx in range(trt_util.get_bindings_per_profile(self.engine)): + binding = self.engine[idx] + if shape_dict and binding in shape_dict: + shape = shape_dict[binding] + else: + shape = self.engine.get_binding_shape(binding) + dtype = trt.nptype(self.engine.get_binding_dtype(binding)) + if self.engine.binding_is_input(binding): + self.context.set_binding_shape(idx, shape) + tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device) + self.tensors[binding] = tensor + self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype) + + def infer(self, feed_dict, stream): + start_binding, end_binding = trt_util.get_active_profile_bindings(self.context) + # shallow copy of ordered dict + device_buffers = copy(self.buffers) + for name, buf in feed_dict.items(): + assert isinstance(buf, cuda.DeviceView) + device_buffers[name] = buf + bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()] + noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr) + if not noerror: + raise ValueError("ERROR: inference failed.") + + return self.tensors + + +class Optimizer: + def __init__(self, onnx_graph): + self.graph = gs.import_onnx(onnx_graph) + + def cleanup(self, return_onnx=False): + self.graph.cleanup().toposort() + if return_onnx: + return gs.export_onnx(self.graph) + + def select_outputs(self, keep, names=None): + self.graph.outputs = [self.graph.outputs[o] for o in keep] + if names: + for i, name in enumerate(names): + self.graph.outputs[i].name = name + + def fold_constants(self, return_onnx=False): + onnx_graph = fold_constants(gs.export_onnx(self.graph), allow_onnxruntime_shape_inference=True) + self.graph = gs.import_onnx(onnx_graph) + if return_onnx: + return onnx_graph + + def infer_shapes(self, return_onnx=False): + onnx_graph = gs.export_onnx(self.graph) + if onnx_graph.ByteSize() > 2147483648: + raise TypeError("ERROR: model size exceeds supported 2GB limit") + else: + onnx_graph = shape_inference.infer_shapes(onnx_graph) + + self.graph = gs.import_onnx(onnx_graph) + if return_onnx: + return onnx_graph + + +class BaseModel: + def __init__(self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77): + self.model = model + self.name = "SD Model" + self.fp16 = fp16 + self.device = device + + self.min_batch = 1 + 
self.max_batch = max_batch_size + self.min_image_shape = 256 # min image resolution: 256x256 + self.max_image_shape = 1024 # max image resolution: 1024x1024 + self.min_latent_shape = self.min_image_shape // 8 + self.max_latent_shape = self.max_image_shape // 8 + + self.embedding_dim = embedding_dim + self.text_maxlen = text_maxlen + + def get_model(self): + return self.model + + def get_input_names(self): + pass + + def get_output_names(self): + pass + + def get_dynamic_axes(self): + return None + + def get_sample_input(self, batch_size, image_height, image_width): + pass + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + return None + + def get_shape_dict(self, batch_size, image_height, image_width): + return None + + def optimize(self, onnx_graph): + opt = Optimizer(onnx_graph) + opt.cleanup() + opt.fold_constants() + opt.infer_shapes() + onnx_opt_graph = opt.cleanup(return_onnx=True) + return onnx_opt_graph + + def check_dims(self, batch_size, image_height, image_width): + assert batch_size >= self.min_batch and batch_size <= self.max_batch + assert image_height % 8 == 0 or image_width % 8 == 0 + latent_height = image_height // 8 + latent_width = image_width // 8 + assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape + assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape + return (latent_height, latent_width) + + def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape): + min_batch = batch_size if static_batch else self.min_batch + max_batch = batch_size if static_batch else self.max_batch + latent_height = image_height // 8 + latent_width = image_width // 8 + min_image_height = image_height if static_shape else self.min_image_shape + max_image_height = image_height if static_shape else self.max_image_shape + min_image_width = image_width if static_shape else self.min_image_shape + max_image_width = image_width if static_shape else self.max_image_shape + min_latent_height = latent_height if static_shape else self.min_latent_shape + max_latent_height = latent_height if static_shape else self.max_latent_shape + min_latent_width = latent_width if static_shape else self.min_latent_shape + max_latent_width = latent_width if static_shape else self.max_latent_shape + return ( + min_batch, + max_batch, + min_image_height, + max_image_height, + min_image_width, + max_image_width, + min_latent_height, + max_latent_height, + min_latent_width, + max_latent_width, + ) + + +def getOnnxPath(model_name, onnx_dir, opt=True): + return os.path.join(onnx_dir, model_name + (".opt" if opt else "") + ".onnx") + + +def getEnginePath(model_name, engine_dir): + return os.path.join(engine_dir, model_name + ".plan") + + +def build_engines( + models: dict, + engine_dir, + onnx_dir, + onnx_opset, + opt_image_height, + opt_image_width, + opt_batch_size=1, + force_engine_rebuild=False, + static_batch=False, + static_shape=True, + enable_preview=False, + enable_all_tactics=False, + timing_cache=None, + max_workspace_size=0, +): + built_engines = {} + if not os.path.isdir(onnx_dir): + os.makedirs(onnx_dir) + if not os.path.isdir(engine_dir): + os.makedirs(engine_dir) + + # Export models to ONNX + for model_name, model_obj in models.items(): + engine_path = getEnginePath(model_name, engine_dir) + if force_engine_rebuild or not os.path.exists(engine_path): + logger.warning("Building Engines...") + logger.warning("Engine build can take a while to complete") + onnx_path 
= getOnnxPath(model_name, onnx_dir, opt=False) + onnx_opt_path = getOnnxPath(model_name, onnx_dir) + if force_engine_rebuild or not os.path.exists(onnx_opt_path): + if force_engine_rebuild or not os.path.exists(onnx_path): + logger.warning(f"Exporting model: {onnx_path}") + model = model_obj.get_model() + with torch.inference_mode(), torch.autocast("cuda"): + inputs = model_obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width) + torch.onnx.export( + model, + inputs, + onnx_path, + export_params=True, + opset_version=onnx_opset, + do_constant_folding=True, + input_names=model_obj.get_input_names(), + output_names=model_obj.get_output_names(), + dynamic_axes=model_obj.get_dynamic_axes(), + ) + del model + torch.cuda.empty_cache() + gc.collect() + else: + logger.warning(f"Found cached model: {onnx_path}") + + # Optimize onnx + if force_engine_rebuild or not os.path.exists(onnx_opt_path): + logger.warning(f"Generating optimizing model: {onnx_opt_path}") + onnx_opt_graph = model_obj.optimize(onnx.load(onnx_path)) + onnx.save(onnx_opt_graph, onnx_opt_path) + else: + logger.warning(f"Found cached optimized model: {onnx_opt_path} ") + + # Build TensorRT engines + for model_name, model_obj in models.items(): + engine_path = getEnginePath(model_name, engine_dir) + engine = Engine(engine_path) + onnx_path = getOnnxPath(model_name, onnx_dir, opt=False) + onnx_opt_path = getOnnxPath(model_name, onnx_dir) + + if force_engine_rebuild or not os.path.exists(engine.engine_path): + engine.build( + onnx_opt_path, + fp16=True, + input_profile=model_obj.get_input_profile( + opt_batch_size, + opt_image_height, + opt_image_width, + static_batch=static_batch, + static_shape=static_shape, + ), + enable_preview=enable_preview, + timing_cache=timing_cache, + workspace_size=max_workspace_size, + ) + built_engines[model_name] = engine + + # Load and activate TensorRT engines + for model_name, model_obj in models.items(): + engine = built_engines[model_name] + engine.load() + engine.activate() + + return built_engines + + +def runEngine(engine, feed_dict, stream): + return engine.infer(feed_dict, stream) + + +class CLIP(BaseModel): + def __init__(self, model, device, max_batch_size, embedding_dim): + super(CLIP, self).__init__( + model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim + ) + self.name = "CLIP" + + def get_input_names(self): + return ["input_ids"] + + def get_output_names(self): + return ["text_embeddings", "pooler_output"] + + def get_dynamic_axes(self): + return {"input_ids": {0: "B"}, "text_embeddings": {0: "B"}} + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + self.check_dims(batch_size, image_height, image_width) + min_batch, max_batch, _, _, _, _, _, _, _, _ = self.get_minmax_dims( + batch_size, image_height, image_width, static_batch, static_shape + ) + return { + "input_ids": [(min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen)] + } + + def get_shape_dict(self, batch_size, image_height, image_width): + self.check_dims(batch_size, image_height, image_width) + return { + "input_ids": (batch_size, self.text_maxlen), + "text_embeddings": (batch_size, self.text_maxlen, self.embedding_dim), + } + + def get_sample_input(self, batch_size, image_height, image_width): + self.check_dims(batch_size, image_height, image_width) + return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device) + + def optimize(self, onnx_graph): + opt = 
Optimizer(onnx_graph) + opt.select_outputs([0]) # delete graph output#1 + opt.cleanup() + opt.fold_constants() + opt.infer_shapes() + opt.select_outputs([0], names=["text_embeddings"]) # rename network output + opt_onnx_graph = opt.cleanup(return_onnx=True) + return opt_onnx_graph + + +def make_CLIP(model, device, max_batch_size, embedding_dim, inpaint=False): + return CLIP(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim) + + +class UNet(BaseModel): + def __init__( + self, model, fp16=False, device="cuda", max_batch_size=16, embedding_dim=768, text_maxlen=77, unet_dim=4 + ): + super(UNet, self).__init__( + model=model, + fp16=fp16, + device=device, + max_batch_size=max_batch_size, + embedding_dim=embedding_dim, + text_maxlen=text_maxlen, + ) + self.unet_dim = unet_dim + self.name = "UNet" + + def get_input_names(self): + return ["sample", "timestep", "encoder_hidden_states"] + + def get_output_names(self): + return ["latent"] + + def get_dynamic_axes(self): + return { + "sample": {0: "2B", 2: "H", 3: "W"}, + "encoder_hidden_states": {0: "2B"}, + "latent": {0: "2B", 2: "H", 3: "W"}, + } + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + ( + min_batch, + max_batch, + _, + _, + _, + _, + min_latent_height, + max_latent_height, + min_latent_width, + max_latent_width, + ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) + return { + "sample": [ + (2 * min_batch, self.unet_dim, min_latent_height, min_latent_width), + (2 * batch_size, self.unet_dim, latent_height, latent_width), + (2 * max_batch, self.unet_dim, max_latent_height, max_latent_width), + ], + "encoder_hidden_states": [ + (2 * min_batch, self.text_maxlen, self.embedding_dim), + (2 * batch_size, self.text_maxlen, self.embedding_dim), + (2 * max_batch, self.text_maxlen, self.embedding_dim), + ], + } + + def get_shape_dict(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return { + "sample": (2 * batch_size, self.unet_dim, latent_height, latent_width), + "encoder_hidden_states": (2 * batch_size, self.text_maxlen, self.embedding_dim), + "latent": (2 * batch_size, 4, latent_height, latent_width), + } + + def get_sample_input(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + dtype = torch.float16 if self.fp16 else torch.float32 + return ( + torch.randn( + 2 * batch_size, self.unet_dim, latent_height, latent_width, dtype=torch.float32, device=self.device + ), + torch.tensor([1.0], dtype=torch.float32, device=self.device), + torch.randn(2 * batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device), + ) + + +def make_UNet(model, device, max_batch_size, embedding_dim, inpaint=False, unet_dim=4): + return UNet( + model, + fp16=True, + device=device, + max_batch_size=max_batch_size, + embedding_dim=embedding_dim, + unet_dim=unet_dim, + ) + + +class VAE(BaseModel): + def __init__(self, model, device, max_batch_size, embedding_dim): + super(VAE, self).__init__( + model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim + ) + self.name = "VAE decoder" + + def get_input_names(self): + return ["latent"] + + def get_output_names(self): + return ["images"] + + def get_dynamic_axes(self): + return {"latent": {0: "B", 2: "H", 
3: "W"}, "images": {0: "B", 2: "8H", 3: "8W"}} + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + ( + min_batch, + max_batch, + _, + _, + _, + _, + min_latent_height, + max_latent_height, + min_latent_width, + max_latent_width, + ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) + return { + "latent": [ + (min_batch, 4, min_latent_height, min_latent_width), + (batch_size, 4, latent_height, latent_width), + (max_batch, 4, max_latent_height, max_latent_width), + ] + } + + def get_shape_dict(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return { + "latent": (batch_size, 4, latent_height, latent_width), + "images": (batch_size, 3, image_height, image_width), + } + + def get_sample_input(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device) + + +def make_VAE(model, device, max_batch_size, embedding_dim, inpaint=False): + return VAE(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim) + + +class TorchVAEEncoder(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.vae_encoder = model + + def forward(self, x): + return self.vae_encoder.encode(x).latent_dist.sample() + + +class VAEEncoder(BaseModel): + def __init__(self, model, device, max_batch_size, embedding_dim): + super(VAEEncoder, self).__init__( + model=model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim + ) + self.name = "VAE encoder" + + def get_model(self): + vae_encoder = TorchVAEEncoder(self.model) + return vae_encoder + + def get_input_names(self): + return ["images"] + + def get_output_names(self): + return ["latent"] + + def get_dynamic_axes(self): + return {"images": {0: "B", 2: "8H", 3: "8W"}, "latent": {0: "B", 2: "H", 3: "W"}} + + def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape): + assert batch_size >= self.min_batch and batch_size <= self.max_batch + min_batch = batch_size if static_batch else self.min_batch + max_batch = batch_size if static_batch else self.max_batch + self.check_dims(batch_size, image_height, image_width) + ( + min_batch, + max_batch, + min_image_height, + max_image_height, + min_image_width, + max_image_width, + _, + _, + _, + _, + ) = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape) + + return { + "images": [ + (min_batch, 3, min_image_height, min_image_width), + (batch_size, 3, image_height, image_width), + (max_batch, 3, max_image_height, max_image_width), + ] + } + + def get_shape_dict(self, batch_size, image_height, image_width): + latent_height, latent_width = self.check_dims(batch_size, image_height, image_width) + return { + "images": (batch_size, 3, image_height, image_width), + "latent": (batch_size, 4, latent_height, latent_width), + } + + def get_sample_input(self, batch_size, image_height, image_width): + self.check_dims(batch_size, image_height, image_width) + return torch.randn(batch_size, 3, image_height, image_width, dtype=torch.float32, device=self.device) + + +def make_VAEEncoder(model, device, max_batch_size, embedding_dim, inpaint=False): + return VAEEncoder(model, device=device, 
max_batch_size=max_batch_size, embedding_dim=embedding_dim) + + +class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline): + r""" + Pipeline for inpainting using TensorRT accelerated Stable Diffusion. + + This model inherits from [`StableDiffusionInpaintPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPFeatureExtractor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: DDIMScheduler, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + requires_safety_checker: bool = True, + stages=["clip", "unet", "vae", "vae_encoder"], + image_height: int = 512, + image_width: int = 512, + max_batch_size: int = 16, + # ONNX export parameters + onnx_opset: int = 17, + onnx_dir: str = "onnx", + # TensorRT engine build parameters + engine_dir: str = "engine", + build_preview_features: bool = True, + force_engine_rebuild: bool = False, + timing_cache: str = "timing_cache", + ): + super().__init__( + vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker + ) + + self.vae.forward = self.vae.decode + + self.stages = stages + self.image_height, self.image_width = image_height, image_width + self.inpaint = True + self.onnx_opset = onnx_opset + self.onnx_dir = onnx_dir + self.engine_dir = engine_dir + self.force_engine_rebuild = force_engine_rebuild + self.timing_cache = timing_cache + self.build_static_batch = False + self.build_dynamic_shape = False + self.build_preview_features = build_preview_features + + self.max_batch_size = max_batch_size + # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation. 
+ if self.build_dynamic_shape or self.image_height > 512 or self.image_width > 512: + self.max_batch_size = 4 + + self.stream = None # loaded in loadResources() + self.models = {} # loaded in __loadModels() + self.engine = {} # loaded in build_engines() + + def __loadModels(self): + # Load pipeline models + self.embedding_dim = self.text_encoder.config.hidden_size + models_args = { + "device": self.torch_device, + "max_batch_size": self.max_batch_size, + "embedding_dim": self.embedding_dim, + "inpaint": self.inpaint, + } + if "clip" in self.stages: + self.models["clip"] = make_CLIP(self.text_encoder, **models_args) + if "unet" in self.stages: + self.models["unet"] = make_UNet(self.unet, **models_args, unet_dim=self.unet.config.in_channels) + if "vae" in self.stages: + self.models["vae"] = make_VAE(self.vae, **models_args) + if "vae_encoder" in self.stages: + self.models["vae_encoder"] = make_VAEEncoder(self.vae, **models_args) + + @classmethod + def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + + cls.cached_folder = ( + pretrained_model_name_or_path + if os.path.isdir(pretrained_model_name_or_path) + else snapshot_download( + pretrained_model_name_or_path, + cache_dir=cache_dir, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + ) + ) + + def to(self, torch_device: Optional[Union[str, torch.device]] = None, silence_dtype_warnings: bool = False): + super().to(torch_device, silence_dtype_warnings=silence_dtype_warnings) + + self.onnx_dir = os.path.join(self.cached_folder, self.onnx_dir) + self.engine_dir = os.path.join(self.cached_folder, self.engine_dir) + self.timing_cache = os.path.join(self.cached_folder, self.timing_cache) + + # set device + self.torch_device = self._execution_device + logger.warning(f"Running inference on device: {self.torch_device}") + + # load models + self.__loadModels() + + # build engines + self.engine = build_engines( + self.models, + self.engine_dir, + self.onnx_dir, + self.onnx_opset, + opt_image_height=self.image_height, + opt_image_width=self.image_width, + force_engine_rebuild=self.force_engine_rebuild, + static_batch=self.build_static_batch, + static_shape=not self.build_dynamic_shape, + enable_preview=self.build_preview_features, + timing_cache=self.timing_cache, + ) + + return self + + def __initialize_timesteps(self, timesteps, strength): + self.scheduler.set_timesteps(timesteps) + offset = self.scheduler.steps_offset if hasattr(self.scheduler, "steps_offset") else 0 + init_timestep = int(timesteps * strength) + offset + init_timestep = min(init_timestep, timesteps) + t_start = max(timesteps - init_timestep + offset, 0) + timesteps = self.scheduler.timesteps[t_start:].to(self.torch_device) + return timesteps, t_start + + def __preprocess_images(self, batch_size, images=()): + init_images = [] + for image in images: + image = image.to(self.torch_device).float() + image = image.repeat(batch_size, 1, 1, 1) + init_images.append(image) + return tuple(init_images) + + def __encode_image(self, init_image): + init_latents = runEngine(self.engine["vae_encoder"], {"images": device_view(init_image)}, self.stream)[ 
+ "latent" + ] + init_latents = 0.18215 * init_latents + return init_latents + + def __encode_prompt(self, prompt, negative_prompt): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + """ + # Tokenize prompt + text_input_ids = ( + self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + .input_ids.type(torch.int32) + .to(self.torch_device) + ) + + text_input_ids_inp = device_view(text_input_ids) + # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt + text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids_inp}, self.stream)[ + "text_embeddings" + ].clone() + + # Tokenize negative prompt + uncond_input_ids = ( + self.tokenizer( + negative_prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + .input_ids.type(torch.int32) + .to(self.torch_device) + ) + uncond_input_ids_inp = device_view(uncond_input_ids) + uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids_inp}, self.stream)[ + "text_embeddings" + ] + + # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance + text_embeddings = torch.cat([uncond_embeddings, text_embeddings]).to(dtype=torch.float16) + + return text_embeddings + + def __denoise_latent( + self, latents, text_embeddings, timesteps=None, step_offset=0, mask=None, masked_image_latents=None + ): + if not isinstance(timesteps, torch.Tensor): + timesteps = self.scheduler.timesteps + for step_index, timestep in enumerate(timesteps): + # Expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, timestep) + if isinstance(mask, torch.Tensor): + latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1) + + # Predict the noise residual + timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep + + sample_inp = device_view(latent_model_input) + timestep_inp = device_view(timestep_float) + embeddings_inp = device_view(text_embeddings) + noise_pred = runEngine( + self.engine["unet"], + {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp}, + self.stream, + )["latent"] + + # Perform guidance + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample + + latents = 1.0 / 0.18215 * latents + return latents + + def __decode_latent(self, latents): + images = runEngine(self.engine["vae"], {"latent": device_view(latents)}, self.stream)["images"] + images = (images / 2 + 0.5).clamp(0, 1) + return images.cpu().permute(0, 2, 3, 1).float().numpy() + + def __loadResources(self, image_height, image_width, batch_size): + self.stream = cuda.Stream() + + # 
Allocate buffers for TensorRT engine bindings + for model_name, obj in self.models.items(): + self.engine[model_name].allocate_buffers( + shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.torch_device + ) + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + strength: float = 0.75, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will + be masked out with `mask_image` and repainted according to `prompt`. + mask_image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be + repainted, while black pixels will be preserved. If `mask_image` is a PIL image, it will be converted + to a single channel (luminance) before use. If it's a tensor, it should contain one color channel (L) + instead of 3, so the expected shape would be `(B, H, W, 1)`. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead. + Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`). + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
+ + """ + self.generator = generator + self.denoising_steps = num_inference_steps + self.guidance_scale = guidance_scale + + # Pre-compute latent input scales and linear multistep coefficients + self.scheduler.set_timesteps(self.denoising_steps, device=self.torch_device) + + # Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + prompt = [prompt] + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + raise ValueError(f"Expected prompt to be of type list or str but got {type(prompt)}") + + if negative_prompt is None: + negative_prompt = [""] * batch_size + + if negative_prompt is not None and isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + + assert len(prompt) == len(negative_prompt) + + if batch_size > self.max_batch_size: + raise ValueError( + f"Batch size {len(prompt)} is larger than allowed {self.max_batch_size}. If dynamic shape is used, then maximum batch size is 4" + ) + + # Validate image dimensions + mask_width, mask_height = mask_image.size + if mask_height != self.image_height or mask_width != self.image_width: + raise ValueError( + f"Input image height and width {self.image_height} and {self.image_width} are not equal to " + f"the respective dimensions of the mask image {mask_height} and {mask_width}" + ) + + # load resources + self.__loadResources(self.image_height, self.image_width, batch_size) + + with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER): + # Spatial dimensions of latent tensor + latent_height = self.image_height // 8 + latent_width = self.image_width // 8 + + # Pre-initialize latents + num_channels_latents = self.vae.config.latent_channels + latents = self.prepare_latents( + batch_size, + num_channels_latents, + self.image_height, + self.image_width, + torch.float32, + self.torch_device, + generator, + ) + + # Pre-process input images + mask, masked_image = self.__preprocess_images(batch_size, prepare_mask_and_masked_image(image, mask_image)) + # print(mask) + mask = torch.nn.functional.interpolate(mask, size=(latent_height, latent_width)) + mask = torch.cat([mask] * 2) + + # Initialize timesteps + timesteps, t_start = self.__initialize_timesteps(self.denoising_steps, strength) + + # VAE encode masked image + masked_latents = self.__encode_image(masked_image) + masked_latents = torch.cat([masked_latents] * 2) + + # CLIP text encoder + text_embeddings = self.__encode_prompt(prompt, negative_prompt) + + # UNet denoiser + latents = self.__denoise_latent( + latents, + text_embeddings, + timesteps=timesteps, + step_offset=t_start, + mask=mask, + masked_image_latents=masked_latents, + ) + + # VAE decode latent + images = self.__decode_latent(latents) + + images = self.numpy_to_pil(images) + return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=None) From b7af94613816e590e09eb536897de37dd2a07e10 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Fri, 2 Jun 2023 10:26:41 -0700 Subject: [PATCH 068/199] set config from original module but set compiled module on class (#3650) * set config from original module but set compiled module on class * add test --- src/diffusers/pipelines/pipeline_utils.py | 16 +++++++++------- tests/pipelines/test_pipelines.py | 6 ++++++ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index d5fa22548a15..4bdae21907da 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ 
b/src/diffusers/pipelines/pipeline_utils.py @@ -485,17 +485,19 @@ def register_modules(self, **kwargs): if module is None: register_dict = {name: (None, None)} else: - # register the original module, not the dynamo compiled one + # register the config from the original module, not the dynamo compiled one if is_compiled_module(module): - module = module._orig_mod + not_compiled_module = module._orig_mod + else: + not_compiled_module = module - library = module.__module__.split(".")[0] + library = not_compiled_module.__module__.split(".")[0] # check if the module is a pipeline module - module_path_items = module.__module__.split(".") + module_path_items = not_compiled_module.__module__.split(".") pipeline_dir = module_path_items[-2] if len(module_path_items) > 2 else None - path = module.__module__.split(".") + path = not_compiled_module.__module__.split(".") is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir) # if library is not in LOADABLE_CLASSES, then it is a custom module. @@ -504,10 +506,10 @@ def register_modules(self, **kwargs): if is_pipeline_module: library = pipeline_dir elif library not in LOADABLE_CLASSES: - library = module.__module__ + library = not_compiled_module.__module__ # retrieve class_name - class_name = module.__class__.__name__ + class_name = not_compiled_module.__class__.__name__ register_dict = {name: (library, class_name)} diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index 5af3a6c16b40..cd3700d0ccdf 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -61,6 +61,7 @@ CONFIG_NAME, WEIGHTS_NAME, floats_tensor, + is_compiled_module, nightly, require_torch_2, slow, @@ -99,6 +100,11 @@ def _test_from_save_pretrained_dynamo(in_queue, out_queue, timeout): scheduler = DDPMScheduler(num_train_timesteps=10) ddpm = DDPMPipeline(model, scheduler) + + # previous diffusers versions stripped compilation off + # compiled modules + assert is_compiled_module(ddpm.unet) + ddpm.to(torch_device) ddpm.set_progress_bar_config(disable=None) From 5911a3aa4767a605513dfaaea60d5812590e6b26 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Fri, 2 Jun 2023 10:37:13 -0700 Subject: [PATCH 069/199] dreambooth if docs - stage II, more info (#3628) * dreambooth if docs - stage II, more info * Update docs/source/en/training/dreambooth.mdx Co-authored-by: Patrick von Platen * Update docs/source/en/training/dreambooth.mdx Co-authored-by: Patrick von Platen * Update docs/source/en/training/dreambooth.mdx Co-authored-by: Sayak Paul * download instructions for downsized images * update source README to match docs --------- Co-authored-by: Patrick von Platen Co-authored-by: Sayak Paul --- docs/source/en/training/dreambooth.mdx | 164 ++++++++++++++++++++++--- examples/dreambooth/README.md | 162 +++++++++++++++++++++--- 2 files changed, 295 insertions(+), 31 deletions(-) diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx index 039cf1f5ca7b..9bba9df5bffc 100644 --- a/docs/source/en/training/dreambooth.mdx +++ b/docs/source/en/training/dreambooth.mdx @@ -502,9 +502,65 @@ You may also run inference from any of the [saved training checkpoints](#inferen ## IF -You can use the lora and full dreambooth scripts to also train the text to image [IF model](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0). A few alternative cli flags are needed due to the model size, the expected input resolution, and the text encoder conventions. 
+You can use the lora and full dreambooth scripts to train the text to image [IF model](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0) and the stage II upscaler +[IF model](https://huggingface.co/DeepFloyd/IF-II-L-v1.0). -### LoRA Dreambooth +Note that IF has a predicted variance, and our finetuning scripts only train the models predicted error, so for finetuned IF models we switch to a fixed +variance schedule. The full finetuning scripts will update the scheduler config for the full saved model. However, when loading saved LoRA weights, you +must also update the pipeline's scheduler config. + +```py +from diffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0") + +pipe.load_lora_weights("") + +# Update scheduler config to fixed variance schedule +pipe.scheduler = pipe.scheduler.__class__.from_config(pipe.scheduler.config, variance_type="fixed_small") +``` + +Additionally, a few alternative cli flags are needed for IF. + +`--resolution=64`: IF is a pixel space diffusion model. In order to operate on un-compressed pixels, the input images are of a much smaller resolution. + +`--pre_compute_text_embeddings`: IF uses [T5](https://huggingface.co/docs/transformers/model_doc/t5) for its text encoder. In order to save GPU memory, we pre compute all text embeddings and then de-allocate +T5. + +`--tokenizer_max_length=77`: T5 has a longer default text length, but the default IF encoding procedure uses a smaller number. + +`--text_encoder_use_attention_mask`: T5 passes the attention mask to the text encoder. + +### Tips and Tricks +We find LoRA to be sufficient for finetuning the stage I model as the low resolution of the model makes representing finegrained detail hard regardless. + +For common and/or not-visually complex object concepts, you can get away with not-finetuning the upscaler. Just be sure to adjust the prompt passed to the +upscaler to remove the new token from the instance prompt. I.e. if your stage I prompt is "a sks dog", use "a dog" for your stage II prompt. + +For finegrained detail like faces that aren't present in the original training set, we find that full finetuning of the stage II upscaler is better than +LoRA finetuning stage II. + +For finegrained detail like faces, we find that lower learning rates work best. + +For stage II, we find that lower learning rates are also needed. + +### Stage II additional validation images + +The stage II validation requires images to upscale, we can download a downsized version of the training set: + +```py +from huggingface_hub import snapshot_download + +local_dir = "./dog_downsized" +snapshot_download( + "diffusers/dog-example-downsized", + local_dir=local_dir, + repo_type="dataset", + ignore_patterns=".gitattributes", +) +``` + +### IF stage I LoRA Dreambooth This training configuration requires ~28 GB VRAM. 
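Tying the tips above together, here is a rough two-stage inference sketch for after stage I LoRA training finishes — the LoRA path is a placeholder, stage II is left un-finetuned, and the instance token is dropped from the stage II prompt as suggested above. The training command itself follows below.

```py
import torch
from diffusers import DiffusionPipeline

# Stage I with the DreamBooth LoRA and a fixed variance schedule
stage_1 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0", torch_dtype=torch.float16)
stage_1.load_lora_weights("<lora weights path>")  # placeholder
stage_1.scheduler = stage_1.scheduler.__class__.from_config(stage_1.scheduler.config, variance_type="fixed_small")
stage_1.enable_model_cpu_offload()

# Stage II is not finetuned here, so drop the new token from its prompt
stage_2 = DiffusionPipeline.from_pretrained("DeepFloyd/IF-II-L-v1.0", text_encoder=None, torch_dtype=torch.float16)
stage_2.enable_model_cpu_offload()

prompt_embeds, negative_embeds = stage_1.encode_prompt("a sks dog")
image = stage_1(prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pt").images

prompt_embeds, negative_embeds = stage_1.encode_prompt("a dog")
image = stage_2(
    image=image, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_embeds, output_type="pil"
).images[0]
```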
```sh @@ -518,7 +574,7 @@ accelerate launch train_dreambooth_lora.py \ --instance_data_dir=$INSTANCE_DIR \ --output_dir=$OUTPUT_DIR \ --instance_prompt="a sks dog" \ - --resolution=64 \ # The input resolution of the IF unet is 64x64 + --resolution=64 \ --train_batch_size=4 \ --gradient_accumulation_steps=1 \ --learning_rate=5e-6 \ @@ -527,16 +583,57 @@ accelerate launch train_dreambooth_lora.py \ --validation_prompt="a sks dog" \ --validation_epochs=25 \ --checkpointing_steps=100 \ - --pre_compute_text_embeddings \ # Pre compute text embeddings to that T5 doesn't have to be kept in memory - --tokenizer_max_length=77 \ # IF expects an override of the max token length - --text_encoder_use_attention_mask # IF expects attention mask for text embeddings + --pre_compute_text_embeddings \ + --tokenizer_max_length=77 \ + --text_encoder_use_attention_mask ``` -### Full Dreambooth -Due to the size of the optimizer states, we recommend training the full XL IF model with 8bit adam. -Using 8bit adam and the rest of the following config, the model can be trained in ~48 GB VRAM. +### IF stage II LoRA Dreambooth -For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. +`--validation_images`: These images are upscaled during validation steps. + +`--class_labels_conditioning=timesteps`: Pass additional conditioning to the UNet needed for stage II. + +`--learning_rate=1e-6`: Lower learning rate than stage I. + +`--resolution=256`: The upscaler expects higher resolution inputs + +```sh +export MODEL_NAME="DeepFloyd/IF-II-L-v1.0" +export INSTANCE_DIR="dog" +export OUTPUT_DIR="dreambooth_dog_upscale" +export VALIDATION_IMAGES="dog_downsized/image_1.png dog_downsized/image_2.png dog_downsized/image_3.png dog_downsized/image_4.png" + +python train_dreambooth_lora.py \ + --report_to wandb \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --instance_prompt="a sks dog" \ + --resolution=256 \ + --train_batch_size=4 \ + --gradient_accumulation_steps=1 \ + --learning_rate=1e-6 \ + --max_train_steps=2000 \ + --validation_prompt="a sks dog" \ + --validation_epochs=100 \ + --checkpointing_steps=500 \ + --pre_compute_text_embeddings \ + --tokenizer_max_length=77 \ + --text_encoder_use_attention_mask \ + --validation_images $VALIDATION_IMAGES \ + --class_labels_conditioning=timesteps +``` + +### IF Stage I Full Dreambooth +`--skip_save_text_encoder`: When training the full model, this will skip saving the entire T5 with the finetuned model. You can still load the pipeline +with a T5 loaded from the original model. + +`use_8bit_adam`: Due to the size of the optimizer states, we recommend training the full XL IF model with 8bit adam. + +`--learning_rate=1e-7`: For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. + +Using 8bit adam and a batch size of 4, the model can be trained in ~48 GB VRAM. 
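Because `--skip_save_text_encoder` leaves T5 out of the saved pipeline, the encoder has to be re-attached when loading the finetuned model. A minimal sketch, assuming the training output was saved to a local `dreambooth_if` folder (placeholder name); the full training command follows below.

```py
import torch
from transformers import T5EncoderModel
from diffusers import DiffusionPipeline

# Re-use the frozen T5 encoder from the original IF checkpoint,
# since the finetuned pipeline was saved without it.
text_encoder = T5EncoderModel.from_pretrained(
    "DeepFloyd/IF-I-XL-v1.0", subfolder="text_encoder", torch_dtype=torch.float16
)
pipe = DiffusionPipeline.from_pretrained("dreambooth_if", text_encoder=text_encoder, torch_dtype=torch.float16)
```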
```sh export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0" @@ -549,17 +646,52 @@ accelerate launch train_dreambooth.py \ --instance_data_dir=$INSTANCE_DIR \ --output_dir=$OUTPUT_DIR \ --instance_prompt="a photo of sks dog" \ - --resolution=64 \ # The input resolution of the IF unet is 64x64 + --resolution=64 \ --train_batch_size=4 \ --gradient_accumulation_steps=1 \ --learning_rate=1e-7 \ --max_train_steps=150 \ --validation_prompt "a photo of sks dog" \ --validation_steps 25 \ - --text_encoder_use_attention_mask \ # IF expects attention mask for text embeddings - --tokenizer_max_length 77 \ # IF expects an override of the max token length - --pre_compute_text_embeddings \ # Pre compute text embeddings to that T5 doesn't have to be kept in memory + --text_encoder_use_attention_mask \ + --tokenizer_max_length 77 \ + --pre_compute_text_embeddings \ --use_8bit_adam \ # --set_grads_to_none \ - --skip_save_text_encoder # do not save the full T5 text encoder with the model -``` \ No newline at end of file + --skip_save_text_encoder \ + --push_to_hub +``` + +### IF Stage II Full Dreambooth + +`--learning_rate=1e-8`: Even lower learning rate. + +`--resolution=256`: The upscaler expects higher resolution inputs + +```sh +export MODEL_NAME="DeepFloyd/IF-II-L-v1.0" +export INSTANCE_DIR="dog" +export OUTPUT_DIR="dreambooth_dog_upscale" +export VALIDATION_IMAGES="dog_downsized/image_1.png dog_downsized/image_2.png dog_downsized/image_3.png dog_downsized/image_4.png" + +accelerate launch train_dreambooth.py \ + --report_to wandb \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --instance_prompt="a sks dog" \ + --resolution=256 \ + --train_batch_size=2 \ + --gradient_accumulation_steps=2 \ + --learning_rate=1e-8 \ + --max_train_steps=2000 \ + --validation_prompt="a sks dog" \ + --validation_steps=150 \ + --checkpointing_steps=500 \ + --pre_compute_text_embeddings \ + --tokenizer_max_length=77 \ + --text_encoder_use_attention_mask \ + --validation_images $VALIDATION_IMAGES \ + --class_labels_conditioning timesteps \ + --push_to_hub +``` diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md index 83073210ac04..339152915adc 100644 --- a/examples/dreambooth/README.md +++ b/examples/dreambooth/README.md @@ -536,9 +536,65 @@ You can refer to [this blog post](https://huggingface.co/blog/dreambooth) that d ## IF -You can use the lora and full dreambooth scripts to also train the text to image [IF model](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0). A few alternative cli flags are needed due to the model size, the expected input resolution, and the text encoder conventions. +You can use the lora and full dreambooth scripts to train the text to image [IF model](https://huggingface.co/DeepFloyd/IF-I-XL-v1.0) and the stage II upscaler +[IF model](https://huggingface.co/DeepFloyd/IF-II-L-v1.0). -### LoRA Dreambooth +Note that IF has a predicted variance, and our finetuning scripts only train the models predicted error, so for finetuned IF models we switch to a fixed +variance schedule. The full finetuning scripts will update the scheduler config for the full saved model. However, when loading saved LoRA weights, you +must also update the pipeline's scheduler config. 
+ +```py +from diffusers import DiffusionPipeline + +pipe = DiffusionPipeline.from_pretrained("DeepFloyd/IF-I-XL-v1.0") + +pipe.load_lora_weights("") + +# Update scheduler config to fixed variance schedule +pipe.scheduler = pipe.scheduler.__class__.from_config(pipe.scheduler.config, variance_type="fixed_small") +``` + +Additionally, a few alternative cli flags are needed for IF. + +`--resolution=64`: IF is a pixel space diffusion model. In order to operate on un-compressed pixels, the input images are of a much smaller resolution. + +`--pre_compute_text_embeddings`: IF uses [T5](https://huggingface.co/docs/transformers/model_doc/t5) for its text encoder. In order to save GPU memory, we pre compute all text embeddings and then de-allocate +T5. + +`--tokenizer_max_length=77`: T5 has a longer default text length, but the default IF encoding procedure uses a smaller number. + +`--text_encoder_use_attention_mask`: T5 passes the attention mask to the text encoder. + +### Tips and Tricks +We find LoRA to be sufficient for finetuning the stage I model as the low resolution of the model makes representing finegrained detail hard regardless. + +For common and/or not-visually complex object concepts, you can get away with not-finetuning the upscaler. Just be sure to adjust the prompt passed to the +upscaler to remove the new token from the instance prompt. I.e. if your stage I prompt is "a sks dog", use "a dog" for your stage II prompt. + +For finegrained detail like faces that aren't present in the original training set, we find that full finetuning of the stage II upscaler is better than +LoRA finetuning stage II. + +For finegrained detail like faces, we find that lower learning rates work best. + +For stage II, we find that lower learning rates are also needed. + +### Stage II additional validation images + +The stage II validation requires images to upscale, we can download a downsized version of the training set: + +```py +from huggingface_hub import snapshot_download + +local_dir = "./dog_downsized" +snapshot_download( + "diffusers/dog-example-downsized", + local_dir=local_dir, + repo_type="dataset", + ignore_patterns=".gitattributes", +) +``` + +### IF stage I LoRA Dreambooth This training configuration requires ~28 GB VRAM. ```sh @@ -552,7 +608,7 @@ accelerate launch train_dreambooth_lora.py \ --instance_data_dir=$INSTANCE_DIR \ --output_dir=$OUTPUT_DIR \ --instance_prompt="a sks dog" \ - --resolution=64 \ # The input resolution of the IF unet is 64x64 + --resolution=64 \ --train_batch_size=4 \ --gradient_accumulation_steps=1 \ --learning_rate=5e-6 \ @@ -561,16 +617,57 @@ accelerate launch train_dreambooth_lora.py \ --validation_prompt="a sks dog" \ --validation_epochs=25 \ --checkpointing_steps=100 \ - --pre_compute_text_embeddings \ # Pre compute text embeddings to that T5 doesn't have to be kept in memory - --tokenizer_max_length=77 \ # IF expects an override of the max token length - --text_encoder_use_attention_mask # IF expects attention mask for text embeddings + --pre_compute_text_embeddings \ + --tokenizer_max_length=77 \ + --text_encoder_use_attention_mask ``` -### Full Dreambooth -Due to the size of the optimizer states, we recommend training the full XL IF model with 8bit adam. -Using 8bit adam and the rest of the following config, the model can be trained in ~48 GB VRAM. +### IF stage II LoRA Dreambooth + +`--validation_images`: These images are upscaled during validation steps. 
+ +`--class_labels_conditioning=timesteps`: Pass additional conditioning to the UNet needed for stage II. -For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. +`--learning_rate=1e-6`: Lower learning rate than stage I. + +`--resolution=256`: The upscaler expects higher resolution inputs + +```sh +export MODEL_NAME="DeepFloyd/IF-II-L-v1.0" +export INSTANCE_DIR="dog" +export OUTPUT_DIR="dreambooth_dog_upscale" +export VALIDATION_IMAGES="dog_downsized/image_1.png dog_downsized/image_2.png dog_downsized/image_3.png dog_downsized/image_4.png" + +python train_dreambooth_lora.py \ + --report_to wandb \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --instance_prompt="a sks dog" \ + --resolution=256 \ + --train_batch_size=4 \ + --gradient_accumulation_steps=1 \ + --learning_rate=1e-6 \ + --max_train_steps=2000 \ + --validation_prompt="a sks dog" \ + --validation_epochs=100 \ + --checkpointing_steps=500 \ + --pre_compute_text_embeddings \ + --tokenizer_max_length=77 \ + --text_encoder_use_attention_mask \ + --validation_images $VALIDATION_IMAGES \ + --class_labels_conditioning=timesteps +``` + +### IF Stage I Full Dreambooth +`--skip_save_text_encoder`: When training the full model, this will skip saving the entire T5 with the finetuned model. You can still load the pipeline +with a T5 loaded from the original model. + +`use_8bit_adam`: Due to the size of the optimizer states, we recommend training the full XL IF model with 8bit adam. + +`--learning_rate=1e-7`: For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. + +Using 8bit adam and a batch size of 4, the model can be trained in ~48 GB VRAM. ```sh export MODEL_NAME="DeepFloyd/IF-I-XL-v1.0" @@ -583,17 +680,52 @@ accelerate launch train_dreambooth.py \ --instance_data_dir=$INSTANCE_DIR \ --output_dir=$OUTPUT_DIR \ --instance_prompt="a photo of sks dog" \ - --resolution=64 \ # The input resolution of the IF unet is 64x64 + --resolution=64 \ --train_batch_size=4 \ --gradient_accumulation_steps=1 \ --learning_rate=1e-7 \ --max_train_steps=150 \ --validation_prompt "a photo of sks dog" \ --validation_steps 25 \ - --text_encoder_use_attention_mask \ # IF expects attention mask for text embeddings - --tokenizer_max_length 77 \ # IF expects an override of the max token length - --pre_compute_text_embeddings \ # Pre compute text embeddings to that T5 doesn't have to be kept in memory + --text_encoder_use_attention_mask \ + --tokenizer_max_length 77 \ + --pre_compute_text_embeddings \ --use_8bit_adam \ # --set_grads_to_none \ - --skip_save_text_encoder # do not save the full T5 text encoder with the model + --skip_save_text_encoder \ + --push_to_hub +``` + +### IF Stage II Full Dreambooth + +`--learning_rate=1e-8`: Even lower learning rate. 
+ +`--resolution=256`: The upscaler expects higher resolution inputs + +```sh +export MODEL_NAME="DeepFloyd/IF-II-L-v1.0" +export INSTANCE_DIR="dog" +export OUTPUT_DIR="dreambooth_dog_upscale" +export VALIDATION_IMAGES="dog_downsized/image_1.png dog_downsized/image_2.png dog_downsized/image_3.png dog_downsized/image_4.png" + +accelerate launch train_dreambooth.py \ + --report_to wandb \ + --pretrained_model_name_or_path=$MODEL_NAME \ + --instance_data_dir=$INSTANCE_DIR \ + --output_dir=$OUTPUT_DIR \ + --instance_prompt="a sks dog" \ + --resolution=256 \ + --train_batch_size=2 \ + --gradient_accumulation_steps=2 \ + --learning_rate=1e-8 \ + --max_train_steps=2000 \ + --validation_prompt="a sks dog" \ + --validation_steps=150 \ + --checkpointing_steps=500 \ + --pre_compute_text_embeddings \ + --tokenizer_max_length=77 \ + --text_encoder_use_attention_mask \ + --validation_images $VALIDATION_IMAGES \ + --class_labels_conditioning timesteps \ + --push_to_hub ``` From 7a39691362e5448b4417f37e135158fcda5ae7fb Mon Sep 17 00:00:00 2001 From: Will Berman Date: Fri, 2 Jun 2023 13:33:19 -0700 Subject: [PATCH 070/199] linting fix (#3653) --- examples/community/mixture_tiling.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/community/mixture_tiling.py b/examples/community/mixture_tiling.py index 83389887455e..3e701cf607f5 100644 --- a/examples/community/mixture_tiling.py +++ b/examples/community/mixture_tiling.py @@ -218,11 +218,9 @@ def __call__( raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") if isinstance(seed_tiles_mode, str): seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] - if any( - mode not in (modes := [mode.value for mode in self.SeedTilesMode]) - for row in seed_tiles_mode - for mode in row - ): + + modes = [mode.value for mode in self.SeedTilesMode] + if any(mode not in modes for row in seed_tiles_mode for mode in row): raise ValueError(f"Seed tiles mode must be one of {modes}") if seed_reroll_regions is None: seed_reroll_regions = [] From b95cbdf6fc7115c40d8cde803423882a4345236d Mon Sep 17 00:00:00 2001 From: 0x1355 <0x1355@gmail.com> Date: Mon, 5 Jun 2023 06:46:26 +0200 Subject: [PATCH 071/199] Set step_rules correctly for piecewise_constant scheduler (#3605) So that schedule_func() calls get_piecewise_constant_schedule() with correctly named kwarg. --- src/diffusers/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/optimization.py b/src/diffusers/optimization.py index 78d68b7978a9..46e6125a0f55 100644 --- a/src/diffusers/optimization.py +++ b/src/diffusers/optimization.py @@ -318,7 +318,7 @@ def get_scheduler( return schedule_func(optimizer, last_epoch=last_epoch) if name == SchedulerType.PIECEWISE_CONSTANT: - return schedule_func(optimizer, rules=step_rules, last_epoch=last_epoch) + return schedule_func(optimizer, step_rules=step_rules, last_epoch=last_epoch) # All other schedulers require `num_warmup_steps` if num_warmup_steps is None: From de45af4a4627feeef2eb9bf65851bbe1228a101a Mon Sep 17 00:00:00 2001 From: 0x1355 <0x1355@gmail.com> Date: Mon, 5 Jun 2023 06:48:29 +0200 Subject: [PATCH 072/199] Allow setting num_cycles for cosine_with_restarts lr scheduler (#3606) Expose num_cycles kwarg of get_schedule() through args.lr_num_cycles. 
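For reference, a minimal, self-contained sketch of what the newly exposed option controls when requesting the `cosine_with_restarts` schedule (the optimizer and hyperparameter values are illustrative only):

```py
import torch
from diffusers.optimization import get_scheduler

# Toy optimizer so the snippet runs on its own
optimizer = torch.optim.AdamW([torch.nn.Parameter(torch.zeros(1))], lr=5e-6)

# `cosine_with_restarts` resets the cosine decay `num_cycles` times over training
lr_scheduler = get_scheduler(
    "cosine_with_restarts",
    optimizer=optimizer,
    num_warmup_steps=500,
    num_training_steps=10_000,
    num_cycles=3,
)
```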
--- examples/textual_inversion/textual_inversion.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index b66d117e90be..0bf76c166835 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -285,6 +285,12 @@ def parse_args(): parser.add_argument( "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) parser.add_argument( "--dataloader_num_workers", type=int, @@ -739,6 +745,7 @@ def main(): optimizer=optimizer, num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps, num_training_steps=args.max_train_steps * args.gradient_accumulation_steps, + num_cycles=args.lr_num_cycles * args.gradient_accumulation_steps, ) # Prepare everything with our `accelerator`. From 523a50a8eb4bf80d2cd622e7d4499c9b69c5f780 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Sun, 4 Jun 2023 22:35:42 -0700 Subject: [PATCH 073/199] [docs] Load A1111 LoRA (#3629) * load a1111 lora * fix * apply feedback * fix --- .../en/using-diffusers/other-formats.mdx | 67 ++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/other-formats.mdx b/docs/source/en/using-diffusers/other-formats.mdx index c8dc7cca86fc..1b2ce4bfc610 100644 --- a/docs/source/en/using-diffusers/other-formats.mdx +++ b/docs/source/en/using-diffusers/other-formats.mdx @@ -123,4 +123,69 @@ pipeline.to("cuda") placeholder_token = "" prompt = f"two {placeholder_token} getting married, photorealistic, high quality" image = pipeline(prompt, num_inference_steps=50).images[0] -``` \ No newline at end of file +``` + +## A1111 LoRA files + +[Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) (A1111) is a popular web UI for Stable Diffusion that supports model sharing platforms like [Civitai](https://civitai.com/). Models trained with the Low-Rank Adaptation (LoRA) technique are especially popular because they're fast to train and have a much smaller file size than a fully finetuned model. 🤗 Diffusers supports loading A1111 LoRA checkpoints with [`~LoraLoaderMixin.load_lora_weights`]: + +```py +from diffusers import DiffusionPipeline, UniPCMultistepScheduler +import torch + +pipeline = DiffusionPipeline.from_pretrained( + "andite/anything-v4.0", torch_dtype=torch.float16, safety_checker=None +).to("cuda") +pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config) +``` + +Download a LoRA checkpoint from Civitai; this example uses the [Howls Moving Castle,Interior/Scenery LoRA (Ghibli Stlye)](https://civitai.com/models/14605?modelVersionId=19998) checkpoint, but feel free to try out any LoRA checkpoint! 
+ +```bash +!wget https://civitai.com/api/download/models/19998 -O howls_moving_castle.safetensors +``` + +Load the LoRA checkpoint into the pipeline with the [`~LoraLoaderMixin.load_lora_weights`] method: + +```py +pipeline.load_lora_weights(".", weight_name="howls_moving_castle.safetensors") +``` + +Now you can use the pipeline to generate images: + +```py +prompt = "masterpiece, illustration, ultra-detailed, cityscape, san francisco, golden gate bridge, california, bay area, in the snow, beautiful detailed starry sky" +negative_prompt = "lowres, cropped, worst quality, low quality, normal quality, artifacts, signature, watermark, username, blurry, more than one bridge, bad architecture" + +images = pipeline( + prompt=prompt, + negative_prompt=negative_prompt, + width=512, + height=512, + num_inference_steps=25, + num_images_per_prompt=4, + generator=torch.manual_seed(0), +).images +``` + +Finally, create a helper function to display the images: + +```py +from PIL import Image + + +def image_grid(imgs, rows=2, cols=2): + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + +image_grid(images) +``` + +

From 0fc2fb71c1378c7eec3b6bc4c4f063730c7179bc Mon Sep 17 00:00:00 2001 From: Will Berman Date: Mon, 5 Jun 2023 02:32:16 -0700 Subject: [PATCH 074/199] dreambooth upscaling fix added latents (#3659) --- docs/source/en/training/dreambooth.mdx | 20 ++++++++++++++------ examples/dreambooth/README.md | 20 ++++++++++++++------ examples/dreambooth/train_dreambooth.py | 11 ++--------- examples/dreambooth/train_dreambooth_lora.py | 11 ++--------- 4 files changed, 32 insertions(+), 30 deletions(-) diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx index 9bba9df5bffc..c26762d4a75d 100644 --- a/docs/source/en/training/dreambooth.mdx +++ b/docs/source/en/training/dreambooth.mdx @@ -540,10 +540,13 @@ upscaler to remove the new token from the instance prompt. I.e. if your stage I For finegrained detail like faces that aren't present in the original training set, we find that full finetuning of the stage II upscaler is better than LoRA finetuning stage II. -For finegrained detail like faces, we find that lower learning rates work best. +For finegrained detail like faces, we find that lower learning rates along with larger batch sizes work best. For stage II, we find that lower learning rates are also needed. +We found experimentally that the DDPM scheduler with the default larger number of denoising steps to sometimes work better than the DPM Solver scheduler +used in the training scripts. + ### Stage II additional validation images The stage II validation requires images to upscale, we can download a downsized version of the training set: @@ -631,7 +634,8 @@ with a T5 loaded from the original model. `use_8bit_adam`: Due to the size of the optimizer states, we recommend training the full XL IF model with 8bit adam. -`--learning_rate=1e-7`: For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. +`--learning_rate=1e-7`: For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. Note that it is +likely the learning rate can be increased with larger batch sizes. Using 8bit adam and a batch size of 4, the model can be trained in ~48 GB VRAM. @@ -656,7 +660,7 @@ accelerate launch train_dreambooth.py \ --text_encoder_use_attention_mask \ --tokenizer_max_length 77 \ --pre_compute_text_embeddings \ - --use_8bit_adam \ # + --use_8bit_adam \ --set_grads_to_none \ --skip_save_text_encoder \ --push_to_hub @@ -664,10 +668,14 @@ accelerate launch train_dreambooth.py \ ### IF Stage II Full Dreambooth -`--learning_rate=1e-8`: Even lower learning rate. +`--learning_rate=5e-6`: With a smaller effective batch size of 4, we found that we required learning rates as low as +1e-8. `--resolution=256`: The upscaler expects higher resolution inputs +`--train_batch_size=2` and `--gradient_accumulation_steps=6`: We found that full training of stage II particularly with +faces required large effective batch sizes. 
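Following the note above that the DDPM scheduler can work better at inference time than the DPM Solver default used by the training scripts, a small sketch of swapping it in on a finetuned stage II pipeline (the output directory matches the command below):

```py
from diffusers import DDPMScheduler, DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("dreambooth_dog_upscale")
# Swap in DDPM with its larger default number of denoising steps
pipe.scheduler = DDPMScheduler.from_config(pipe.scheduler.config)
```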
+ ```sh export MODEL_NAME="DeepFloyd/IF-II-L-v1.0" export INSTANCE_DIR="dog" @@ -682,8 +690,8 @@ accelerate launch train_dreambooth.py \ --instance_prompt="a sks dog" \ --resolution=256 \ --train_batch_size=2 \ - --gradient_accumulation_steps=2 \ - --learning_rate=1e-8 \ + --gradient_accumulation_steps=6 \ + --learning_rate=5e-6 \ --max_train_steps=2000 \ --validation_prompt="a sks dog" \ --validation_steps=150 \ diff --git a/examples/dreambooth/README.md b/examples/dreambooth/README.md index 339152915adc..5813c42cd5d3 100644 --- a/examples/dreambooth/README.md +++ b/examples/dreambooth/README.md @@ -574,10 +574,13 @@ upscaler to remove the new token from the instance prompt. I.e. if your stage I For finegrained detail like faces that aren't present in the original training set, we find that full finetuning of the stage II upscaler is better than LoRA finetuning stage II. -For finegrained detail like faces, we find that lower learning rates work best. +For finegrained detail like faces, we find that lower learning rates along with larger batch sizes work best. For stage II, we find that lower learning rates are also needed. +We found experimentally that the DDPM scheduler with the default larger number of denoising steps to sometimes work better than the DPM Solver scheduler +used in the training scripts. + ### Stage II additional validation images The stage II validation requires images to upscale, we can download a downsized version of the training set: @@ -665,7 +668,8 @@ with a T5 loaded from the original model. `use_8bit_adam`: Due to the size of the optimizer states, we recommend training the full XL IF model with 8bit adam. -`--learning_rate=1e-7`: For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. +`--learning_rate=1e-7`: For full dreambooth, IF requires very low learning rates. With higher learning rates model quality will degrade. Note that it is +likely the learning rate can be increased with larger batch sizes. Using 8bit adam and a batch size of 4, the model can be trained in ~48 GB VRAM. @@ -690,7 +694,7 @@ accelerate launch train_dreambooth.py \ --text_encoder_use_attention_mask \ --tokenizer_max_length 77 \ --pre_compute_text_embeddings \ - --use_8bit_adam \ # + --use_8bit_adam \ --set_grads_to_none \ --skip_save_text_encoder \ --push_to_hub @@ -698,10 +702,14 @@ accelerate launch train_dreambooth.py \ ### IF Stage II Full Dreambooth -`--learning_rate=1e-8`: Even lower learning rate. +`--learning_rate=5e-6`: With a smaller effective batch size of 4, we found that we required learning rates as low as +1e-8. `--resolution=256`: The upscaler expects higher resolution inputs +`--train_batch_size=2` and `--gradient_accumulation_steps=6`: We found that full training of stage II particularly with +faces required large effective batch sizes. 
+ ```sh export MODEL_NAME="DeepFloyd/IF-II-L-v1.0" export INSTANCE_DIR="dog" @@ -716,8 +724,8 @@ accelerate launch train_dreambooth.py \ --instance_prompt="a sks dog" \ --resolution=256 \ --train_batch_size=2 \ - --gradient_accumulation_steps=2 \ - --learning_rate=1e-8 \ + --gradient_accumulation_steps=6 \ + --learning_rate=5e-6 \ --max_train_steps=2000 \ --validation_prompt="a sks dog" \ --validation_steps=150 \ diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index e4ab6b2ae014..ad03829fd1bc 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -52,7 +52,6 @@ from diffusers.optimization import get_scheduler from diffusers.utils import check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.torch_utils import randn_tensor if is_wandb_available(): @@ -1212,14 +1211,8 @@ def compute_text_embeddings(prompt): text_encoder_use_attention_mask=args.text_encoder_use_attention_mask, ) - if unet.config.in_channels > channels: - needed_additional_channels = unet.config.in_channels - channels - additional_latents = randn_tensor( - (bsz, needed_additional_channels, height, width), - device=noisy_model_input.device, - dtype=noisy_model_input.dtype, - ) - noisy_model_input = torch.cat([additional_latents, noisy_model_input], dim=1) + if unet.config.in_channels == channels * 2: + noisy_model_input = torch.cat([noisy_model_input, noisy_model_input], dim=1) if args.class_labels_conditioning == "timesteps": class_labels = timesteps diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 319348bd40bb..49aef1cc4a99 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -60,7 +60,6 @@ from diffusers.optimization import get_scheduler from diffusers.utils import TEXT_ENCODER_ATTN_MODULE, check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available -from diffusers.utils.torch_utils import randn_tensor # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
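The script change above (mirrored for the LoRA script below) fills the stage II UNet's doubled input channels by repeating the noisy input rather than sampling extra random latents; a toy shape check of that branch, with illustrative sizes:

```py
import torch

noisy_model_input = torch.randn(2, 3, 64, 64)  # (batch, channels, height, width)
unet_in_channels = 6  # stage II UNets expect twice the image channels

if unet_in_channels == noisy_model_input.shape[1] * 2:
    noisy_model_input = torch.cat([noisy_model_input, noisy_model_input], dim=1)

assert noisy_model_input.shape[1] == unet_in_channels
```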
@@ -1157,14 +1156,8 @@ def compute_text_embeddings(prompt): text_encoder_use_attention_mask=args.text_encoder_use_attention_mask, ) - if unet.config.in_channels > channels: - needed_additional_channels = unet.config.in_channels - channels - additional_latents = randn_tensor( - (bsz, needed_additional_channels, height, width), - device=noisy_model_input.device, - dtype=noisy_model_input.dtype, - ) - noisy_model_input = torch.cat([additional_latents, noisy_model_input], dim=1) + if unet.config.in_channels == channels * 2: + noisy_model_input = torch.cat([noisy_model_input, noisy_model_input], dim=1) if args.class_labels_conditioning == "timesteps": class_labels = timesteps From 262d539a8a8f505dc72958f7ea50915a4b56dfac Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 5 Jun 2023 12:03:11 +0200 Subject: [PATCH 075/199] Correct multi gpu dreambooth (#3673) Correct multi gpu --- examples/dreambooth/train_dreambooth.py | 2 +- examples/dreambooth/train_dreambooth_lora.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index ad03829fd1bc..97b7f334bc9f 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -1211,7 +1211,7 @@ def compute_text_embeddings(prompt): text_encoder_use_attention_mask=args.text_encoder_use_attention_mask, ) - if unet.config.in_channels == channels * 2: + if accelerator.unwrap_model(unet).config.in_channels == channels * 2: noisy_model_input = torch.cat([noisy_model_input, noisy_model_input], dim=1) if args.class_labels_conditioning == "timesteps": diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 49aef1cc4a99..ca25152fcb1c 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -1156,7 +1156,7 @@ def compute_text_embeddings(prompt): text_encoder_use_attention_mask=args.text_encoder_use_attention_mask, ) - if unet.config.in_channels == channels * 2: + if accelerator.unwrap_model(unet).config.in_channels == channels * 2: noisy_model_input = torch.cat([noisy_model_input, noisy_model_input], dim=1) if args.class_labels_conditioning == "timesteps": From 1994dbcb5e62bd8d0c60e5d5d6bf4b580653c74c Mon Sep 17 00:00:00 2001 From: Vladislav Lyubimov <43727166+LyubimovVladislav@users.noreply.github.com> Date: Mon, 5 Jun 2023 13:55:37 +0300 Subject: [PATCH 076/199] Fix from_ckpt not working properly on windows (#3666) --- src/diffusers/loaders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index e657406912f2..3c8081ccbbbd 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -1447,8 +1447,8 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): ckpt_path = Path(pretrained_model_link_or_path) if not ckpt_path.is_file(): # get repo_id and (potentially nested) file path of ckpt in repo - repo_id = str(Path().joinpath(*ckpt_path.parts[:2])) - file_path = str(Path().joinpath(*ckpt_path.parts[2:])) + repo_id = "/".join(ckpt_path.parts[:2]) + file_path = "/".join(ckpt_path.parts[2:]) if file_path.startswith("blob/"): file_path = file_path[len("blob/") :] From d0416ab0904ea2114b42503289d697245b5a742d Mon Sep 17 00:00:00 2001 From: pdoane Date: Mon, 5 Jun 2023 04:16:27 -0700 Subject: [PATCH 077/199] Update Compel documentation for textual inversions (#3663) * Update Compel documentation for textual inversions * Fix typo 
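As context for the documentation change below, a more self-contained sketch of the same pattern — the model and embedding choices are placeholders, and compel>=1.1.6 is assumed:

```py
import torch
from compel import Compel, DiffusersTextualInversionManager
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.load_textual_inversion("sd-concepts-library/cat-toy")  # placeholder embedding

# Let Compel resolve textual inversion tokens through the pipeline's tokenizer
textual_inversion_manager = DiffusersTextualInversionManager(pipe)
compel = Compel(
    tokenizer=pipe.tokenizer,
    text_encoder=pipe.text_encoder,
    textual_inversion_manager=textual_inversion_manager,
)

prompt_embeds = compel("a photo of a <cat-toy> on a table")
image = pipe(prompt_embeds=prompt_embeds, num_inference_steps=30).images[0]
```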
--- docs/source/en/using-diffusers/weighted_prompts.mdx | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/source/en/using-diffusers/weighted_prompts.mdx b/docs/source/en/using-diffusers/weighted_prompts.mdx index c1316dc9f47d..58e670fbafe9 100644 --- a/docs/source/en/using-diffusers/weighted_prompts.mdx +++ b/docs/source/en/using-diffusers/weighted_prompts.mdx @@ -94,5 +94,15 @@ a try! If your favorite pipeline does not have a `prompt_embeds` input, please make sure to open an issue, the diffusers team tries to be as responsive as possible. +Compel 1.1.6 adds a utility class to simplify using textual inversions. Instantiate a `DiffusersTextualInversionManager` and pass it to Compel init: + +``` +textual_inversion_manager = DiffusersTextualInversionManager(pipe) +compel = Compel( + tokenizer=pipe.tokenizer, + text_encoder=pipe.text_encoder, + textual_inversion_manager=textual_inversion_manager) +``` + Also, please check out the documentation of the [compel](https://github.com/damian0815/compel) library for more information. From 995bbcb9aa708d76e95a0014a0a4b991c1f7c084 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 5 Jun 2023 17:42:31 +0530 Subject: [PATCH 078/199] [UniDiffuser test] fix one test so that it runs correctly on V100 (#3675) * fix: assertion. * assertion fix. From 1a6a647e06592ba1157f620ec28efaf3c8b4509e Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Mon, 5 Jun 2023 09:47:26 -0700 Subject: [PATCH 079/199] [docs] More API fixes (#3640) * part 2 of api fixes * move randn_tensor * add to toctree * apply feedback * more feedback --- docs/source/en/_toctree.yml | 42 +++++---- docs/source/en/api/diffusion_pipeline.mdx | 28 ++---- docs/source/en/api/outputs.mdx | 42 ++++++--- .../attend_and_excite.mdx | 0 .../{stable_diffusion => }/diffedit.mdx | 0 .../{stable_diffusion => }/model_editing.mdx | 0 .../{stable_diffusion => }/panorama.mdx | 0 .../{stable_diffusion => }/pix2pix.mdx | 0 .../{stable_diffusion => }/pix2pix_zero.mdx | 0 .../self_attention_guidance.mdx | 0 .../stable_diffusion_2.mdx | 0 .../stable_diffusion_safe.mdx | 0 docs/source/en/api/utilities.mdx | 23 +++++ .../en/using-diffusers/reproducibility.mdx | 5 +- src/diffusers/configuration_utils.py | 66 +++++++------- src/diffusers/pipelines/pipeline_utils.py | 91 ++++++++----------- src/diffusers/utils/pil_utils.py | 3 + src/diffusers/utils/testing_utils.py | 6 +- src/diffusers/utils/torch_utils.py | 6 +- 19 files changed, 161 insertions(+), 151 deletions(-) rename docs/source/en/api/pipelines/{stable_diffusion => }/attend_and_excite.mdx (100%) rename docs/source/en/api/pipelines/{stable_diffusion => }/diffedit.mdx (100%) rename docs/source/en/api/pipelines/{stable_diffusion => }/model_editing.mdx (100%) rename docs/source/en/api/pipelines/{stable_diffusion => }/panorama.mdx (100%) rename docs/source/en/api/pipelines/{stable_diffusion => }/pix2pix.mdx (100%) rename docs/source/en/api/pipelines/{stable_diffusion => }/pix2pix_zero.mdx (100%) rename docs/source/en/api/pipelines/{stable_diffusion => }/self_attention_guidance.mdx (100%) rename docs/source/en/api/pipelines/{ => stable_diffusion}/stable_diffusion_2.mdx (100%) rename docs/source/en/api/pipelines/{ => stable_diffusion}/stable_diffusion_safe.mdx (100%) create mode 100644 docs/source/en/api/utilities.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 5bd271c18873..5084299bb0dd 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -144,12 +144,16 @@ 
title: Outputs - local: api/loaders title: Loaders + - local: api/utilities + title: Utilities title: Main Classes - sections: - local: api/pipelines/overview title: Overview - local: api/pipelines/alt_diffusion title: AltDiffusion + - local: api/pipelines/attend_and_excite + title: Attend and Excite - local: api/pipelines/audio_diffusion title: Audio Diffusion - local: api/pipelines/audioldm @@ -164,24 +168,32 @@ title: DDIM - local: api/pipelines/ddpm title: DDPM + - local: api/pipelines/diffedit + title: DiffEdit - local: api/pipelines/dit title: DiT - local: api/pipelines/if title: IF + - local: api/pipelines/pix2pix + title: InstructPix2Pix - local: api/pipelines/kandinsky title: Kandinsky - local: api/pipelines/latent_diffusion title: Latent Diffusion + - local: api/pipelines/panorama + title: MultiDiffusion Panorama - local: api/pipelines/paint_by_example title: PaintByExample + - local: api/pipelines/pix2pix_zero + title: Pix2Pix Zero - local: api/pipelines/pndm title: PNDM - local: api/pipelines/repaint title: RePaint - - local: api/pipelines/stable_diffusion_safe - title: Safe Stable Diffusion - local: api/pipelines/score_sde_ve title: Score SDE VE + - local: api/pipelines/self_attention_guidance + title: Self-Attention Guidance - local: api/pipelines/semantic_stable_diffusion title: Semantic Guidance - local: api/pipelines/spectrogram_diffusion @@ -199,31 +211,21 @@ title: Depth-to-Image - local: api/pipelines/stable_diffusion/image_variation title: Image-Variation - - local: api/pipelines/stable_diffusion/upscale - title: Super-Resolution + - local: api/pipelines/stable_diffusion/stable_diffusion_safe + title: Safe Stable Diffusion + - local: api/pipelines/stable_diffusion/stable_diffusion_2 + title: Stable Diffusion 2 - local: api/pipelines/stable_diffusion/latent_upscale title: Stable-Diffusion-Latent-Upscaler - - local: api/pipelines/stable_diffusion/pix2pix - title: InstructPix2Pix - - local: api/pipelines/stable_diffusion/attend_and_excite - title: Attend and Excite - - local: api/pipelines/stable_diffusion/pix2pix_zero - title: Pix2Pix Zero - - local: api/pipelines/stable_diffusion/self_attention_guidance - title: Self-Attention Guidance - - local: api/pipelines/stable_diffusion/panorama - title: MultiDiffusion Panorama - - local: api/pipelines/stable_diffusion/model_editing - title: Text-to-Image Model Editing - - local: api/pipelines/stable_diffusion/diffedit - title: DiffEdit + - local: api/pipelines/stable_diffusion/upscale + title: Super-Resolution title: Stable Diffusion - - local: api/pipelines/stable_diffusion_2 - title: Stable Diffusion 2 - local: api/pipelines/stable_unclip title: Stable unCLIP - local: api/pipelines/stochastic_karras_ve title: Stochastic Karras VE + - local: api/pipelines/model_editing + title: Text-to-Image Model Editing - local: api/pipelines/text_to_video title: Text-to-Video - local: api/pipelines/text_to_video_zero diff --git a/docs/source/en/api/diffusion_pipeline.mdx b/docs/source/en/api/diffusion_pipeline.mdx index 66e5b7b23bbb..a47025a3e94a 100644 --- a/docs/source/en/api/diffusion_pipeline.mdx +++ b/docs/source/en/api/diffusion_pipeline.mdx @@ -12,41 +12,25 @@ specific language governing permissions and limitations under the License. # Pipelines -The [`DiffusionPipeline`] is the easiest way to load any pretrained diffusion pipeline from the [Hub](https://huggingface.co/models?library=diffusers) and to use it in inference. 
+The [`DiffusionPipeline`] is the easiest way to load any pretrained diffusion pipeline from the [Hub](https://huggingface.co/models?library=diffusers) and use it for inference. - One should not use the Diffusion Pipeline class for training or fine-tuning a diffusion model. Individual - components of diffusion pipelines are usually trained individually, so we suggest to directly work - with [`UNetModel`] and [`UNetConditionModel`]. +You shouldn't use the [`DiffusionPipeline`] class for training or finetuning a diffusion model. Individual +components (for example, [`UNetModel`] and [`UNetConditionModel`]) of diffusion pipelines are usually trained individually, so we suggest directly working with instead. -Any diffusion pipeline that is loaded with [`~DiffusionPipeline.from_pretrained`] will automatically -detect the pipeline type, *e.g.* [`StableDiffusionPipeline`] and consequently load each component of the -pipeline and pass them into the `__init__` function of the pipeline, *e.g.* [`~StableDiffusionPipeline.__init__`]. +The pipeline type (for example [`StableDiffusionPipeline`]) of any diffusion pipeline loaded with [`~DiffusionPipeline.from_pretrained`] is automatically +detected and pipeline components are loaded and passed to the `__init__` function of the pipeline. Any pipeline object can be saved locally with [`~DiffusionPipeline.save_pretrained`]. ## DiffusionPipeline + [[autodoc]] DiffusionPipeline - all - __call__ - device - to - components - -## ImagePipelineOutput -By default diffusion pipelines return an object of class - -[[autodoc]] pipelines.ImagePipelineOutput - -## AudioPipelineOutput -By default diffusion pipelines return an object of class - -[[autodoc]] pipelines.AudioPipelineOutput - -## ImageTextPipelineOutput -By default diffusion pipelines return an object of class - -[[autodoc]] ImageTextPipelineOutput diff --git a/docs/source/en/api/outputs.mdx b/docs/source/en/api/outputs.mdx index 9466f354541d..1e9fbedba35b 100644 --- a/docs/source/en/api/outputs.mdx +++ b/docs/source/en/api/outputs.mdx @@ -12,11 +12,11 @@ specific language governing permissions and limitations under the License. # BaseOutputs -All models have outputs that are instances of subclasses of [`~utils.BaseOutput`]. Those are -data structures containing all the information returned by the model, but that can also be used as tuples or +All models have outputs that are subclasses of [`~utils.BaseOutput`]. Those are +data structures containing all the information returned by the model, but they can also be used as tuples or dictionaries. -Let's see how this looks in an example: +For example: ```python from diffusers import DDIMPipeline @@ -25,31 +25,45 @@ pipeline = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32") outputs = pipeline() ``` -The `outputs` object is a [`~pipelines.ImagePipelineOutput`], as we can see in the -documentation of that class below, it means it has an image attribute. +The `outputs` object is a [`~pipelines.ImagePipelineOutput`] which means it has an image attribute. -You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you will get `None`: +You can access each attribute as you normally would or with a keyword lookup, and if that attribute is not returned by the model, you will get `None`: ```python outputs.images -``` - -or via keyword lookup - -```python outputs["images"] ``` -When considering our `outputs` object as tuple, it only considers the attributes that don't have `None` values. 
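For example (the checkpoint name here is only an assumption), loading a Stable Diffusion repository through the generic class resolves to the concrete pipeline and exposes its components:

```python
from diffusers import DiffusionPipeline

# the generic entry point detects the concrete pipeline class for this repo
pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
print(type(pipe).__name__)      # StableDiffusionPipeline
print(sorted(pipe.components))  # scheduler, text_encoder, unet, vae, ...

# the whole pipeline can be written back to disk and reloaded later
pipe.save_pretrained("./my-local-copy")
```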
-Here for instance, we could retrieve images via indexing: +When considering the `outputs` object as a tuple, it only considers the attributes that don't have `None` values. +For instance, retrieving an image by indexing into it returns the tuple `(outputs.images)`: ```python outputs[:1] ``` -which will return the tuple `(outputs.images)` for instance. + + +To check a specific pipeline or model output, refer to its corresponding API documentation. + + ## BaseOutput [[autodoc]] utils.BaseOutput - to_tuple + +## ImagePipelineOutput + +[[autodoc]] pipelines.ImagePipelineOutput + +## FlaxImagePipelineOutput + +[[autodoc]] pipelines.pipeline_flax_utils.FlaxImagePipelineOutput + +## AudioPipelineOutput + +[[autodoc]] pipelines.AudioPipelineOutput + +## ImageTextPipelineOutput + +[[autodoc]] ImageTextPipelineOutput \ No newline at end of file diff --git a/docs/source/en/api/pipelines/stable_diffusion/attend_and_excite.mdx b/docs/source/en/api/pipelines/attend_and_excite.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion/attend_and_excite.mdx rename to docs/source/en/api/pipelines/attend_and_excite.mdx diff --git a/docs/source/en/api/pipelines/stable_diffusion/diffedit.mdx b/docs/source/en/api/pipelines/diffedit.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion/diffedit.mdx rename to docs/source/en/api/pipelines/diffedit.mdx diff --git a/docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx b/docs/source/en/api/pipelines/model_editing.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion/model_editing.mdx rename to docs/source/en/api/pipelines/model_editing.mdx diff --git a/docs/source/en/api/pipelines/stable_diffusion/panorama.mdx b/docs/source/en/api/pipelines/panorama.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion/panorama.mdx rename to docs/source/en/api/pipelines/panorama.mdx diff --git a/docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx b/docs/source/en/api/pipelines/pix2pix.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion/pix2pix.mdx rename to docs/source/en/api/pipelines/pix2pix.mdx diff --git a/docs/source/en/api/pipelines/stable_diffusion/pix2pix_zero.mdx b/docs/source/en/api/pipelines/pix2pix_zero.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion/pix2pix_zero.mdx rename to docs/source/en/api/pipelines/pix2pix_zero.mdx diff --git a/docs/source/en/api/pipelines/stable_diffusion/self_attention_guidance.mdx b/docs/source/en/api/pipelines/self_attention_guidance.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion/self_attention_guidance.mdx rename to docs/source/en/api/pipelines/self_attention_guidance.mdx diff --git a/docs/source/en/api/pipelines/stable_diffusion_2.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion_2.mdx rename to docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx diff --git a/docs/source/en/api/pipelines/stable_diffusion_safe.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_safe.mdx similarity index 100% rename from docs/source/en/api/pipelines/stable_diffusion_safe.mdx rename to docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_safe.mdx diff --git a/docs/source/en/api/utilities.mdx b/docs/source/en/api/utilities.mdx new file mode 100644 index 000000000000..16143a2a66a6 
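A short sketch of the `BaseOutput` behaviour described in the `outputs.mdx` changes above, using the same `google/ddpm-cifar10-32` checkpoint (the step count is an arbitrary placeholder):

```python
from diffusers import DDIMPipeline

pipeline = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32")
outputs = pipeline(num_inference_steps=2)

# attribute access and keyword lookup point at the same data
print(outputs.images[0].size)
print(outputs["images"][0].size)

# the tuple view only keeps attributes that are not None,
# so slicing returns (outputs.images,) here
print(outputs[:1])
```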
--- /dev/null +++ b/docs/source/en/api/utilities.mdx @@ -0,0 +1,23 @@ +# Utilities + +Utility and helper functions for working with 🤗 Diffusers. + +## randn_tensor + +[[autodoc]] diffusers.utils.randn_tensor + +## numpy_to_pil + +[[autodoc]] utils.pil_utils.numpy_to_pil + +## pt_to_pil + +[[autodoc]] utils.pil_utils.pt_to_pil + +## load_image + +[[autodoc]] utils.testing_utils.load_image + +## export_to_video + +[[autodoc]] utils.testing_utils.export_to_video \ No newline at end of file diff --git a/docs/source/en/using-diffusers/reproducibility.mdx b/docs/source/en/using-diffusers/reproducibility.mdx index 5bef10bfe190..b666dac72cbf 100644 --- a/docs/source/en/using-diffusers/reproducibility.mdx +++ b/docs/source/en/using-diffusers/reproducibility.mdx @@ -111,7 +111,7 @@ print(np.abs(image).sum()) The result is not the same even though you're using an identical seed because the GPU uses a different random number generator than the CPU. -To circumvent this problem, 🧨 Diffusers has a [`randn_tensor`](#diffusers.utils.randn_tensor) function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The `randn_tensor` function is used everywhere inside the pipeline, allowing the user to **always** pass a CPU `Generator` even if the pipeline is run on a GPU. +To circumvent this problem, 🧨 Diffusers has a [`~diffusers.utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The `randn_tensor` function is used everywhere inside the pipeline, allowing the user to **always** pass a CPU `Generator` even if the pipeline is run on a GPU. You'll see the results are much closer now! @@ -147,9 +147,6 @@ susceptible to precision error propagation. Don't expect similar results across different GPU hardware or PyTorch versions. In this case, you'll need to run exactly the same hardware and PyTorch version for full reproducibility. -### randn_tensor -[[autodoc]] diffusers.utils.randn_tensor - ## Deterministic algorithms You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. However, you should be aware that deterministic algorithms may be slower than nondeterministic ones and you may observe a decrease in performance. But if reproducibility is important to you, then this is the way to go! diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index af639de306ee..bb5adf3e9444 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -160,7 +160,7 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool @classmethod def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_unused_kwargs=False, **kwargs): r""" - Instantiate a Python class from a config dictionary + Instantiate a Python class from a config dictionary. Parameters: config (`Dict[str, Any]`): @@ -170,9 +170,13 @@ def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_un Whether kwargs that are not consumed by the Python class should be returned or not. kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to update the configuration object (after it being loaded) and initiate the Python class. - `**kwargs` will be directly passed to the underlying scheduler/model's `__init__` method and eventually - overwrite same named arguments of `config`. + Can be used to update the configuration object (after it is loaded) and initiate the Python class. 
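A compact sketch of the CPU-`Generator` pattern from the reproducibility guide above (checkpoint and step count are placeholders):

```python
import numpy as np
import torch
from diffusers import DDIMPipeline

pipe = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32").to("cuda")

# the generator lives on the CPU, so the initial noise is drawn there
# and only then moved to the GPU the pipeline runs on
generator = torch.manual_seed(0)
image = pipe(num_inference_steps=2, output_type="np", generator=generator).images
print(np.abs(image).sum())
```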
+ `**kwargs` are directly passed to the underlying scheduler/model's `__init__` method and eventually + overwrite same named arguments in `config`. + + Returns: + [`ModelMixin`] or [`SchedulerMixin`]: + A model or scheduler object instantiated from a config dictionary. Examples: @@ -258,59 +262,57 @@ def load_config( **kwargs, ) -> Tuple[Dict[str, Any], Dict[str, Any]]: r""" - Instantiate a Python class from a config dictionary + Load a model or scheduler configuration. Parameters: pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): Can be either: - - A string, the *model id* of a model repo on huggingface.co. Valid model ids should have an - organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing model weights saved using [`~ConfigMixin.save_config`], e.g., - `./my_model_directory/`. + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing model weights saved with + [`~ConfigMixin.save_config`]. cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + Whether or not to resume downloading the model weights and configuration files. If set to False, any + incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. output_loading_info(`bool`, *optional*, defaults to `False`): Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). + Whether to only load local model weights and configuration files or not. If set to True, the model + won’t be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `transformers-cli login` (stored in `~/.huggingface`). + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. 
subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. + The subfolder location of a model file within a larger model repository on the Hub or locally. return_unused_kwargs (`bool`, *optional*, defaults to `False): - Whether unused keyword arguments of the config shall be returned. + Whether unused keyword arguments of the config are returned. return_commit_hash (`bool`, *optional*, defaults to `False): - Whether the commit_hash of the loaded configuration shall be returned. - - + Whether the `commit_hash` of the loaded configuration are returned. - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - + Returns: + `dict`: + A dictionary of all the parameters stored in a JSON configuration file. - Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to - use this method in a firewalled environment. + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. You can also activate the special + ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to use this method in a + firewalled environment. """ diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 4bdae21907da..ed95163087a8 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1111,95 +1111,78 @@ def load_module(name, value): @classmethod def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: r""" - Download and cache a PyTorch diffusion pipeline from pre-trained pipeline weights. + Download and cache a PyTorch diffusion pipeline from pretrained pipeline weights. Parameters: pretrained_model_name (`str` or `os.PathLike`, *optional*): - Should be a string, the *repo id* of a pretrained pipeline hosted inside a model repo on - https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like - `CompVis/ldm-text2im-large-256`. + A string, the repository id (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline + hosted on the Hub. custom_pipeline (`str`, *optional*): - - - - This is an experimental feature and is likely to change in the future. - - - Can be either: - - A string, the *repo id* of a custom pipeline hosted inside a model repo on - https://huggingface.co/. Valid repo ids have to be located under a user or organization name, - like `hf-internal-testing/diffusers-dummy-pipeline`. - - - - It is required that the model repo has a file, called `pipeline.py` that defines the custom - pipeline. - - + - A string, the repository id (for example `CompVis/ldm-text2im-large-256`) of a pretrained + pipeline hosted on the Hub. The repository must contain a file called `pipeline.py` that defines + the custom pipeline. - A string, the *file name* of a community pipeline hosted on GitHub under - https://github.com/huggingface/diffusers/tree/main/examples/community. Valid file names have to - match exactly the file name without `.py` located under the above link, *e.g.* - `clip_guided_stable_diffusion`. + [Community](https://github.com/huggingface/diffusers/tree/main/examples/community). 
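To make the `load_config` and `from_config` methods documented above concrete, a small sketch (the checkpoint is an assumption) that reads a scheduler configuration and instantiates a compatible scheduler from it:

```python
from diffusers import DDIMScheduler, DDPMScheduler

# load_config returns the plain dictionary stored in scheduler_config.json
config = DDPMScheduler.load_config("google/ddpm-cifar10-32", subfolder="scheduler")
print(config["num_train_timesteps"])

# from_config builds a (possibly different) compatible class from that dictionary
scheduler = DDIMScheduler.from_config(config)
print(scheduler.config.num_train_timesteps)
```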
Valid file + names must match the file name and not the pipeline script (`clip_guided_stable_diffusion` + instead of `clip_guided_stable_diffusion.py`). Community pipelines are always loaded from the + current `main` branch of GitHub. - - - Community pipelines are always loaded from the current `main` branch of GitHub. - - - - - A path to a *directory* containing a custom pipeline, e.g., `./my_pipeline_directory/`. + - A path to a *directory* (`./my_pipeline_directory/`) containing a custom pipeline. The directory + must contain a file called `pipeline.py` that defines the custom pipeline. - + - It is required that the directory has a file, called `pipeline.py` that defines the custom - pipeline. + 🧪 This is an experimental feature and may change in the future. - + - For more information on how to load and create custom pipelines, please have a look at [Loading and - Adding Custom - Pipelines](https://huggingface.co/docs/diffusers/using-diffusers/custom_pipeline_overview) + For more information on how to load and create custom pipelines, take a look at [How to contribute a + community pipeline](https://huggingface.co/docs/diffusers/main/en/using-diffusers/contribute_pipeline). force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + Whether or not to resume downloading the model weights and configuration files. If set to False, any + incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. output_loading_info(`bool`, *optional*, defaults to `False`): Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). + Whether to only load local model weights and configuration files or not. If set to True, the model + won’t be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. custom_revision (`str`, *optional*, defaults to `"main"` when loading from the Hub and to local version of `diffusers` when loading from GitHub): The specific model version to use. 
It can be a branch name, a tag name, or a commit id similar to `revision` when loading a custom pipeline from the Hub. It can be a diffusers version when loading a custom pipeline from GitHub. mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. specify the folder name here. + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. variant (`str`, *optional*): - If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. `variant` is - ignored when using `from_flax`. + Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. + + Returns: + `os.PathLike`: + A path to the downloaded pipeline. - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models) + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. diff --git a/src/diffusers/utils/pil_utils.py b/src/diffusers/utils/pil_utils.py index ad76a32230fb..15b97c73dcb7 100644 --- a/src/diffusers/utils/pil_utils.py +++ b/src/diffusers/utils/pil_utils.py @@ -23,6 +23,9 @@ def pt_to_pil(images): + """ + Convert a torch image to a PIL image. + """ images = (images / 2 + 0.5).clamp(0, 1) images = images.cpu().permute(0, 2, 3, 1).float().numpy() images = numpy_to_pil(images) diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py index abddd48851bf..dcb80169de74 100644 --- a/src/diffusers/utils/testing_utils.py +++ b/src/diffusers/utils/testing_utils.py @@ -261,12 +261,14 @@ def load_pt(url: str): def load_image(image: Union[str, PIL.Image.Image]) -> PIL.Image.Image: """ - Args: Loads `image` to a PIL Image. + + Args: image (`str` or `PIL.Image.Image`): The image to convert to the PIL Image format. Returns: - `PIL.Image.Image`: A PIL Image. + `PIL.Image.Image`: + A PIL Image. """ if isinstance(image, str): if image.startswith("http://") or image.startswith("https://"): diff --git a/src/diffusers/utils/torch_utils.py b/src/diffusers/utils/torch_utils.py index 2b626a3b425a..5f64bce25e78 100644 --- a/src/diffusers/utils/torch_utils.py +++ b/src/diffusers/utils/torch_utils.py @@ -40,9 +40,9 @@ def randn_tensor( dtype: Optional["torch.dtype"] = None, layout: Optional["torch.layout"] = None, ): - """This is a helper function that allows to create random tensors on the desired `device` with the desired `dtype`. When - passing a list of generators one can seed each batched size individually. If CPU generators are passed the tensor - will always be created on CPU. + """A helper function to create random tensors on the desired `device` with the desired `dtype`. When + passing a list of generators, you can seed each batch size individually. If CPU generators are passed, the tensor + is always created on the CPU. 
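A minimal sketch of the per-batch seeding behaviour described in the `randn_tensor` docstring above (the shape is arbitrary):

```python
import torch
from diffusers.utils import randn_tensor

shape = (2, 4, 64, 64)  # a batch of two latents

# one CPU generator per batch element: each sample is seeded on its own,
# and the noise is drawn on the CPU regardless of the target device
generators = [torch.Generator("cpu").manual_seed(i) for i in range(shape[0])]
noise = randn_tensor(shape, generator=generators, device=torch.device("cpu"), dtype=torch.float32)
print(noise.shape, noise.dtype)
```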
""" # device on which tensor is created defaults to device rand_device = device From 5990014700060912d7248970b3969a9d91dfc026 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Mon, 5 Jun 2023 07:11:00 -1000 Subject: [PATCH 080/199] [WIP]Vae preprocessor refactor (PR1) (#3557) VaeImageProcessor.preprocess refactor * refactored VaeImageProcessor - allow passing optional height and width argument to resize() - add convert_to_rgb * refactored prepare_latents method for img2img pipelines so that if we pass latents directly as image input, it will not encode it again * added a test in test_pipelines_common.py to test latents as image inputs * refactored img2img pipelines that accept latents as image: - controlnet img2img, stable diffusion img2img , instruct_pix2pix --------- Co-authored-by: yiyixuxu Co-authored-by: Patrick von Platen Co-authored-by: Pedro Cuenca Co-authored-by: Sayak Paul --- src/diffusers/image_processor.py | 69 +++++++-- .../pipeline_alt_diffusion_img2img.py | 48 ++++-- .../controlnet/pipeline_controlnet.py | 98 ++++-------- .../controlnet/pipeline_controlnet_img2img.py | 144 +++++++----------- .../controlnet/pipeline_controlnet_inpaint.py | 55 +++---- .../pipelines/repaint/pipeline_repaint.py | 6 + .../pipeline_cycle_diffusion.py | 50 ++++-- .../pipeline_onnx_stable_diffusion_img2img.py | 8 + .../pipeline_stable_diffusion_depth2img.py | 54 +++++-- .../pipeline_stable_diffusion_diffedit.py | 31 ++-- .../pipeline_stable_diffusion_img2img.py | 50 ++++-- ...eline_stable_diffusion_instruct_pix2pix.py | 48 ++++-- ...ipeline_stable_diffusion_latent_upscale.py | 15 +- .../pipeline_stable_diffusion_pix2pix_zero.py | 57 ++++--- .../unidiffuser/pipeline_unidiffuser.py | 6 + .../altdiffusion/test_alt_diffusion.py | 1 + tests/pipelines/controlnet/test_controlnet.py | 5 +- .../controlnet/test_controlnet_img2img.py | 4 +- .../controlnet/test_controlnet_inpaint.py | 4 +- .../stable_diffusion/test_cycle_diffusion.py | 10 +- .../stable_diffusion/test_stable_diffusion.py | 1 + .../test_stable_diffusion_image_variation.py | 1 + .../test_stable_diffusion_img2img.py | 11 +- .../test_stable_diffusion_inpaint.py | 1 + ...st_stable_diffusion_instruction_pix2pix.py | 35 ++++- .../test_stable_diffusion_model_editing.py | 1 + .../test_stable_diffusion_panorama.py | 1 + .../test_stable_diffusion_pix2pix_zero.py | 70 ++++++++- .../test_stable_diffusion_sag.py | 1 + .../test_stable_diffusion.py | 1 + ...test_stable_diffusion_attend_and_excite.py | 1 + .../test_stable_diffusion_depth.py | 12 +- .../test_stable_diffusion_diffedit.py | 1 + .../test_stable_diffusion_inpaint.py | 1 + .../test_stable_diffusion_latent_upscale.py | 1 + .../stable_unclip/test_stable_unclip.py | 1 + .../test_stable_unclip_img2img.py | 1 + tests/pipelines/test_pipelines_common.py | 64 +++++++- 38 files changed, 624 insertions(+), 344 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 68782d1f5f79..17c083914753 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -30,7 +30,8 @@ class VaeImageProcessor(ConfigMixin): Args: do_resize (`bool`, *optional*, defaults to `True`): - Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. + Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept + `height` and `width` arguments from `preprocess` method vae_scale_factor (`int`, *optional*, defaults to `8`): VAE scale factor. 
If `do_resize` is True, the image will be automatically resized to multiples of this factor. @@ -38,6 +39,8 @@ class VaeImageProcessor(ConfigMixin): Resampling filter to use when resizing the image. do_normalize (`bool`, *optional*, defaults to `True`): Whether to normalize the image to [-1,1] + do_convert_rgb (`bool`, *optional*, defaults to be `False`): + Whether to convert the images to RGB format. """ config_name = CONFIG_NAME @@ -49,11 +52,12 @@ def __init__( vae_scale_factor: int = 8, resample: str = "lanczos", do_normalize: bool = True, + do_convert_rgb: bool = False, ): super().__init__() @staticmethod - def numpy_to_pil(images): + def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image: """ Convert a numpy image or a batch of images to a PIL image. """ @@ -69,7 +73,19 @@ def numpy_to_pil(images): return pil_images @staticmethod - def numpy_to_pt(images): + def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray: + """ + Convert a PIL image or a list of PIL images to numpy arrays. + """ + if not isinstance(images, list): + images = [images] + images = [np.array(image).astype(np.float32) / 255.0 for image in images] + images = np.stack(images, axis=0) + + return images + + @staticmethod + def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: """ Convert a numpy image to a pytorch tensor """ @@ -80,7 +96,7 @@ def numpy_to_pt(images): return images @staticmethod - def pt_to_numpy(images): + def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: """ Convert a pytorch tensor to a numpy image """ @@ -101,18 +117,39 @@ def denormalize(images): """ return (images / 2 + 0.5).clamp(0, 1) - def resize(self, images: PIL.Image.Image) -> PIL.Image.Image: + @staticmethod + def convert_to_rgb(image: PIL.Image.Image) -> PIL.Image.Image: + """ + Converts an image to RGB format. + """ + image = image.convert("RGB") + return image + + def resize( + self, + image: PIL.Image.Image, + height: Optional[int] = None, + width: Optional[int] = None, + ) -> PIL.Image.Image: """ Resize a PIL image. 
Both height and width will be downscaled to the next integer multiple of `vae_scale_factor` """ - w, h = images.size - w, h = (x - x % self.config.vae_scale_factor for x in (w, h)) # resize to integer multiple of vae_scale_factor - images = images.resize((w, h), resample=PIL_INTERPOLATION[self.config.resample]) - return images + if height is None: + height = image.height + if width is None: + width = image.width + + width, height = ( + x - x % self.config.vae_scale_factor for x in (width, height) + ) # resize to integer multiple of vae_scale_factor + image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) + return image def preprocess( self, image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + height: Optional[int] = None, + width: Optional[int] = None, ) -> torch.Tensor: """ Preprocess the image input, accepted formats are PIL images, numpy arrays or pytorch tensors" @@ -126,10 +163,11 @@ def preprocess( ) if isinstance(image[0], PIL.Image.Image): + if self.config.do_convert_rgb: + image = [self.convert_to_rgb(i) for i in image] if self.config.do_resize: - image = [self.resize(i) for i in image] - image = [np.array(i).astype(np.float32) / 255.0 for i in image] - image = np.stack(image, axis=0) # to np + image = [self.resize(i, height, width) for i in image] + image = self.pil_to_numpy(image) # to np image = self.numpy_to_pt(image) # to pt elif isinstance(image[0], np.ndarray): @@ -146,7 +184,12 @@ def preprocess( elif isinstance(image[0], torch.Tensor): image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) - _, _, height, width = image.shape + _, channel, height, width = image.shape + + # don't need any preprocess if the image is latents + if channel == 4: + return image + if self.config.do_resize and ( height % self.config.vae_scale_factor != 0 or width % self.config.vae_scale_factor != 0 ): diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index b10d85f722eb..f0d4d91ce966 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -69,6 +69,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): @@ -538,21 +543,26 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt image = image.to(device=device, dtype=dtype) batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
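A rough usage sketch of the refactored `VaeImageProcessor` above; the sizes and flags are illustrative:

```python
import PIL.Image
import torch
from diffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor(vae_scale_factor=8, do_convert_rgb=True)

# a PIL image is converted to RGB, resized to the requested (or nearest valid)
# size, and returned as a torch tensor normalized to [-1, 1]
pil_image = PIL.Image.new("RGB", (514, 260))
tensor = processor.preprocess(pil_image, height=256, width=512)
print(tensor.shape)  # torch.Size([1, 3, 256, 512])

# a 4-channel tensor is treated as latents and passed through unchanged
latents = torch.randn(1, 4, 64, 64)
print(processor.preprocess(latents).shape)  # torch.Size([1, 4, 64, 64])
```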
- ) - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) + if image.shape[1] == 4: + init_latents = image + else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective" + f" batch size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) - init_latents = self.vae.config.scaling_factor * init_latents + init_latents = self.vae.config.scaling_factor * init_latents if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size @@ -586,7 +596,14 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -609,9 +626,10 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. + process. Can also accpet image latents as `image`, if passing latents directly, it will not be encoded + again. strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. 
The number of diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 632cd546ed0a..4ac43377c82a 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -29,7 +29,6 @@ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - PIL_INTERPOLATION, is_accelerate_available, is_accelerate_version, is_compiled_module, @@ -172,7 +171,10 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing @@ -477,17 +479,12 @@ def check_inputs( self, prompt, image, - height, - width, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, controlnet_conditioning_scale=1.0, ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) ): @@ -592,21 +589,26 @@ def check_inputs( def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) - if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): raise TypeError( - "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors" + "image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors" ) if image_is_pil: image_batch_size = 1 - elif image_is_tensor: - image_batch_size = image.shape[0] - elif image_is_pil_list: - image_batch_size = len(image) - elif image_is_tensor_list: + else: image_batch_size = len(image) if prompt is not None and isinstance(prompt, str): @@ -633,29 +635,7 @@ def prepare_image( do_classifier_free_guidance=False, guess_mode=False, ): - if not isinstance(image, torch.Tensor): - if isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - images = [] - - for image_ in image: - image_ = image_.convert("RGB") - image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) - image_ = np.array(image_) - image_ = image_[None, :] - images.append(image_) - - image = images - - image = np.concatenate(image, axis=0) - 
image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = torch.from_numpy(image) - elif isinstance(image[0], torch.Tensor): - image = torch.cat(image, dim=0) - + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) image_batch_size = image.shape[0] if image_batch_size == 1: @@ -691,31 +671,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - def _default_height_width(self, height, width, image): - # NOTE: It is possible that a list of images have different - # dimensions for each image, so just checking the first image - # is not _exactly_ correct, but it is simple. - while isinstance(image, list): - image = image[0] - - if height is None: - if isinstance(image, PIL.Image.Image): - height = image.height - elif isinstance(image, torch.Tensor): - height = image.shape[2] - - height = (height // 8) * 8 # round down to nearest multiple of 8 - - if width is None: - if isinstance(image, PIL.Image.Image): - width = image.width - elif isinstance(image, torch.Tensor): - width = image.shape[3] - - width = (width // 8) * 8 # round down to nearest multiple of 8 - - return height, width - # override DiffusionPipeline def save_pretrained( self, @@ -733,7 +688,14 @@ def save_pretrained( def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, @@ -760,8 +722,8 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, - `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If @@ -837,15 +799,11 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - # 0. Default height and width to unet - height, width = self._default_height_width(height, width, image) # 1. Check inputs. 
Raise error if not correct self.check_inputs( prompt, image, - height, - width, callback_steps, negative_prompt, prompt_embeds, @@ -903,6 +861,7 @@ def __call__( do_classifier_free_guidance=do_classifier_free_guidance, guess_mode=guess_mode, ) + height, width = image.shape[-2:] elif isinstance(controlnet, MultiControlNetModel): images = [] @@ -922,6 +881,7 @@ def __call__( images.append(image_) image = images + height, width = image[0].shape[-2:] else: assert False diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 72b90f334725..6667cf43ce46 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -29,7 +29,6 @@ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - PIL_INTERPOLATION, deprecate, is_accelerate_available, is_accelerate_version, @@ -198,7 +197,10 @@ def __init__( feature_extractor=feature_extractor, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing @@ -503,17 +505,12 @@ def check_inputs( self, prompt, image, - height, - width, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None, controlnet_conditioning_scale=1.0, ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) ): @@ -615,24 +612,30 @@ def check_inputs( else: assert False + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) - if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): raise TypeError( - "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors" + "image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors" ) if image_is_pil: image_batch_size = 1 - elif image_is_tensor: - image_batch_size = image.shape[0] - elif image_is_pil_list: - image_batch_size = len(image) - elif image_is_tensor_list: + else: 
image_batch_size = len(image) if prompt is not None and isinstance(prompt, str): @@ -660,29 +663,7 @@ def prepare_control_image( do_classifier_free_guidance=False, guess_mode=False, ): - if not isinstance(image, torch.Tensor): - if isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - images = [] - - for image_ in image: - image_ = image_.convert("RGB") - image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) - image_ = np.array(image_) - image_ = image_[None, :] - images.append(image_) - - image = images - - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = torch.from_numpy(image) - elif isinstance(image[0], torch.Tensor): - image = torch.cat(image, dim=0) - + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) image_batch_size = image.shape[0] if image_batch_size == 1: @@ -720,21 +701,26 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt image = image.to(device=device, dtype=dtype) batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) + if image.shape[1] == 4: + init_latents = image + else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) - init_latents = self.vae.config.scaling_factor * init_latents + elif isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) + + init_latents = self.vae.config.scaling_factor * init_latents if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size @@ -763,31 +749,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents - def _default_height_width(self, height, width, image): - # NOTE: It is possible that a list of images have different - # dimensions for each image, so just checking the first image - # is not _exactly_ correct, but it is simple. 
- while isinstance(image, list): - image = image[0] - - if height is None: - if isinstance(image, PIL.Image.Image): - height = image.height - elif isinstance(image, torch.Tensor): - height = image.shape[2] - - height = (height // 8) * 8 # round down to nearest multiple of 8 - - if width is None: - if isinstance(image, PIL.Image.Image): - width = image.width - elif isinstance(image, torch.Tensor): - width = image.shape[3] - - width = (width // 8) * 8 # round down to nearest multiple of 8 - - return height, width - # override DiffusionPipeline def save_pretrained( self, @@ -805,9 +766,21 @@ def save_pretrained( def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, control_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], ] = None, height: Optional[int] = None, width: Optional[int] = None, @@ -836,8 +809,12 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, - `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The initial image will be used as the starting point for the image generation process. Can also accpet + image latents as `image`, if passing latents directly, it will not be encoded again. + control_image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If @@ -914,15 +891,10 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - # 0. Default height and width to unet - height, width = self._default_height_width(height, width, image) - # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, control_image, - height, - width, callback_steps, negative_prompt, prompt_embeds, @@ -966,10 +938,10 @@ def __call__( prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, ) - # 4. Prepare image, and controlnet_conditioning_image - image = prepare_image(image) + # 4. Prepare image + image = self.image_processor.preprocess(image).to(dtype=torch.float32) - # 5. Prepare image + # 5. 
Prepare controlnet_conditioning_image if isinstance(controlnet, ControlNetModel): control_image = self.prepare_control_image( image=control_image, diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 8ebcac2589a3..c20f2d518f96 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -30,7 +30,6 @@ from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( - PIL_INTERPOLATION, is_accelerate_available, is_accelerate_version, is_compiled_module, @@ -316,6 +315,9 @@ def __init__( ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.control_image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True, do_normalize=False + ) self.register_to_config(requires_safety_checker=requires_safety_checker) # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing @@ -742,24 +744,30 @@ def check_inputs( else: assert False + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) - if not image_is_pil and not image_is_tensor and not image_is_pil_list and not image_is_tensor_list: + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): raise TypeError( - "image must be passed and be one of PIL image, torch tensor, list of PIL images, or list of torch tensors" + "image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors" ) if image_is_pil: image_batch_size = 1 - elif image_is_tensor: - image_batch_size = image.shape[0] - elif image_is_pil_list: - image_batch_size = len(image) - elif image_is_tensor_list: + else: image_batch_size = len(image) if prompt is not None and isinstance(prompt, str): @@ -787,29 +795,7 @@ def prepare_control_image( do_classifier_free_guidance=False, guess_mode=False, ): - if not isinstance(image, torch.Tensor): - if isinstance(image, PIL.Image.Image): - image = [image] - - if isinstance(image[0], PIL.Image.Image): - images = [] - - for image_ in image: - image_ = image_.convert("RGB") - image_ = image_.resize((width, height), resample=PIL_INTERPOLATION["lanczos"]) - image_ = np.array(image_) - image_ = image_[None, :] - images.append(image_) - - image = images - - image = np.concatenate(image, axis=0) - image = np.array(image).astype(np.float32) / 255.0 - image = image.transpose(0, 3, 1, 2) - image = torch.from_numpy(image) - elif isinstance(image[0], torch.Tensor): - image = torch.cat(image, dim=0) - + image = self.control_image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) 
image_batch_size = image.shape[0] if image_batch_size == 1: @@ -983,7 +969,12 @@ def __call__( image: Union[torch.Tensor, PIL.Image.Image] = None, mask_image: Union[torch.Tensor, PIL.Image.Image] = None, control_image: Union[ - torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image] + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], ] = None, height: Optional[int] = None, width: Optional[int] = None, diff --git a/src/diffusers/pipelines/repaint/pipeline_repaint.py b/src/diffusers/pipelines/repaint/pipeline_repaint.py index f4914c46db51..d2aa1d4f1f77 100644 --- a/src/diffusers/pipelines/repaint/pipeline_repaint.py +++ b/src/diffusers/pipelines/repaint/pipeline_repaint.py @@ -13,6 +13,7 @@ # limitations under the License. +import warnings from typing import List, Optional, Tuple, Union import numpy as np @@ -30,6 +31,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def _preprocess_image(image: Union[List, PIL.Image.Image, torch.Tensor]): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 8babc6ab0d11..6b6df0945943 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -40,6 +40,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): @@ -549,21 +554,26 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt image = image.to(device=device, dtype=dtype) batch_size = image.shape[0] - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) + if image.shape[1] == 4: + init_latents = image + else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) - init_latents = self.vae.config.scaling_factor * init_latents + init_latents = self.vae.config.scaling_factor * init_latents if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size @@ -599,7 +609,14 @@ def __call__( self, prompt: Union[str, List[str]], source_prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -619,9 +636,10 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image generation. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. + process. Can also accpet image latents as `image`, if passing latents directly, it will not be encoded + again. strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of @@ -699,7 +717,7 @@ def __call__( ) # 4. Preprocess image - image = preprocess(image) + image = self.image_processor.preprocess(image) # 5. Prepare timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py index 67d3f44e6d4b..293ed7d981b8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion_img2img.py @@ -13,6 +13,7 @@ # limitations under the License. import inspect +import warnings from typing import Callable, List, Optional, Union import numpy as np @@ -33,6 +34,13 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess with 8->64 def preprocess(image): + warnings.warn( + ( + "The preprocess method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor.preprocess instead" + ), + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index a5b2a9987fa1..2fd4503a94ce 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -37,6 +37,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): @@ -423,21 +428,26 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt image = image.to(device=device, dtype=dtype) batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) + if image.shape[1] == 4: + init_latents = image + else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) - init_latents = self.vae.config.scaling_factor * init_latents + init_latents = self.vae.config.scaling_factor * init_latents if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size @@ -474,6 +484,8 @@ def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_gui if isinstance(image[0], PIL.Image.Image): width, height = image[0].size + elif isinstance(image[0], np.ndarray): + width, height = image[0].shape[:-1] else: height, width = image[0].shape[-2:] @@ -512,7 +524,14 @@ def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_gui def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, depth_map: Optional[torch.FloatTensor] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, @@ -535,9 +554,12 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. 
- image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. + process. Can accept image latents as `image` only if `depth_map` is not `None`. + depth_map (`torch.FloatTensor`, *optional*): + depth prediction that will be used as additional conditioning for the image generation process. If not + defined, it will automatically predicts the depth via `self.depth_estimator`. strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of @@ -664,7 +686,7 @@ def __call__( ) # 5. Preprocess image - image = preprocess(image) + image = self.image_processor.preprocess(image) # 6. Set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index e4fc08b79cfd..3c1ac58bcee4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -159,6 +159,11 @@ def kl_divergence(hidden_states): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): @@ -799,19 +804,25 @@ def prepare_image_latents(self, image, batch_size, dtype, device, generator=None image = image.to(device=device, dtype=dtype) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) + if image.shape[1] == 4: + latents = image - if isinstance(generator, list): - latents = [self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)] - latents = torch.cat(latents, dim=0) else: - latents = self.vae.encode(image).latent_dist.sample(generator) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if isinstance(generator, list): + latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0) + else: + latents = self.vae.encode(image).latent_dist.sample(generator) - latents = self.vae.config.scaling_factor * latents + latents = self.vae.config.scaling_factor * latents if batch_size != latents.shape[0]: if batch_size % latents.shape[0] == 0: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 258c8000ba63..106b6528a982 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -73,6 +73,11 @@ def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): @@ -441,6 +446,7 @@ def _encode_prompt( return prompt_embeds + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker def run_safety_checker(self, image, device, dtype): if self.safety_checker is None: has_nsfw_concept = None @@ -455,6 +461,7 @@ def run_safety_checker(self, image, device, dtype): ) return image, has_nsfw_concept + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents def decode_latents(self, latents): warnings.warn( "The decode_latents method is deprecated and will be removed in a future version. Please" @@ -544,21 +551,26 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt image = image.to(device=device, dtype=dtype) batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - if isinstance(generator, list): - init_latents = [ - self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) - ] - init_latents = torch.cat(init_latents, dim=0) + if image.shape[1] == 4: + init_latents = image + else: - init_latents = self.vae.encode(image).latent_dist.sample(generator) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) - init_latents = self.vae.config.scaling_factor * init_latents + init_latents = self.vae.config.scaling_factor * init_latents if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: # expand init_latents for batch_size @@ -592,7 +604,14 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, strength: float = 0.8, num_inference_steps: Optional[int] = 50, guidance_scale: Optional[float] = 7.5, @@ -615,9 +634,10 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor` or `PIL.Image.Image`): + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch, that will be used as the starting point for the - process. + process. Can also accpet image latents as `image`, if passing latents directly, it will not be encoded + again. strength (`float`, *optional*, defaults to 0.8): Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the `strength`. The number of diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 65ef5617fc68..25102ae7cf4a 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -43,6 +43,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): @@ -145,7 +150,14 @@ def __init__( def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, num_inference_steps: int = 100, guidance_scale: float = 7.5, image_guidance_scale: float = 1.5, @@ -168,8 +180,9 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`PIL.Image.Image`): - `Image`, or tensor representing an image batch which will be repainted according to `prompt`. 
+ image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch which will be repainted according to `prompt`. Can also + accpet image latents as `image`, if passing latents directly, it will not be encoded again. num_inference_steps (`int`, *optional*, defaults to 100): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. @@ -290,8 +303,7 @@ def __call__( ) # 3. Preprocess image - image = preprocess(image) - height, width = image.shape[-2:] + image = self.image_processor.preprocess(image) # 4. set timesteps self.scheduler.set_timesteps(num_inference_steps, device=device) @@ -308,6 +320,10 @@ def __call__( generator, ) + height, width = image_latents.shape[-2:] + height = height * self.vae_scale_factor + width = width * self.vae_scale_factor + # 6. Prepare latent variables num_channels_latents = self.vae.config.latent_channels latents = self.prepare_latents( @@ -746,17 +762,21 @@ def prepare_image_latents( image = image.to(device=device, dtype=dtype) batch_size = batch_size * num_images_per_prompt - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - if isinstance(generator, list): - image_latents = [self.vae.encode(image[i : i + 1]).latent_dist.mode() for i in range(batch_size)] - image_latents = torch.cat(image_latents, dim=0) + if image.shape[1] == 4: + image_latents = image else: - image_latents = self.vae.encode(image).latent_dist.mode() + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if isinstance(generator, list): + image_latents = [self.vae.encode(image[i : i + 1]).latent_dist.mode() for i in range(batch_size)] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.vae.encode(image).latent_dist.mode() if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0: # expand image_latents for batch_size diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index 664d58dc812f..e0fecf6d353f 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -94,7 +94,7 @@ def __init__( scheduler=scheduler, ) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic") def enable_sequential_cpu_offload(self, gpu_id=0): r""" @@ -291,7 +291,14 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]], - image: Union[torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]], + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, negative_prompt: Optional[Union[str, List[str]]] = None, @@ -308,7 +315,7 @@ def __call__( Args: prompt (`str` or `List[str]`): The prompt or prompts to guide the image upscaling. - image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`): + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch which will be upscaled. If it's a tensor, it can be either a latent output from a stable diffusion model, or an image tensor in the range `[-1, 1]`. It will be considered a `latent` if `image.shape[1]` is `4`; otherwise, it will be considered to be an @@ -413,7 +420,7 @@ def __call__( ) # 4. Preprocess image - image = preprocess(image) + image = self.image_processor.preprocess(image) image = image.to(dtype=text_embeddings.dtype, device=device) if image.shape[1] == 3: # encode image if not in latent-space yet diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 3b7c6dc6b513..3332cc89d96c 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -177,6 +177,11 @@ class Pix2PixInversionPipelineOutput(BaseOutput, TextualInversionLoaderMixin): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): @@ -629,7 +634,6 @@ def prepare_extra_step_kwargs(self, generator, eta): def check_inputs( self, prompt, - image, source_embeds, target_embeds, callback_steps, @@ -727,19 +731,25 @@ def prepare_image_latents(self, image, batch_size, dtype, device, generator=None image = image.to(device=device, dtype=dtype) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) + if image.shape[1] == 4: + latents = image - if isinstance(generator, list): - latents = [self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)] - latents = torch.cat(latents, dim=0) else: - latents = self.vae.encode(image).latent_dist.sample(generator) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) - latents = self.vae.config.scaling_factor * latents + if isinstance(generator, list): + latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0) + else: + latents = self.vae.encode(image).latent_dist.sample(generator) + + latents = self.vae.config.scaling_factor * latents if batch_size != latents.shape[0]: if batch_size % latents.shape[0] == 0: @@ -804,7 +814,6 @@ def kl_divergence(self, hidden_states): def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - image: Optional[Union[torch.FloatTensor, PIL.Image.Image]] = None, source_embeds: torch.Tensor = None, target_embeds: torch.Tensor = None, height: Optional[int] = None, @@ -905,7 +914,6 @@ def __call__( # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, - image, source_embeds, target_embeds, callback_steps, @@ -1085,7 +1093,14 @@ def __call__( def invert( self, prompt: Optional[str] = None, - image: Union[torch.FloatTensor, PIL.Image.Image] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, num_inference_steps: int = 50, guidance_scale: float = 1, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, @@ -1109,8 +1124,9 @@ def invert( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`PIL.Image.Image`, *optional*): - `Image`, or tensor representing an image batch which will be used for conditioning. + image (`torch.FloatTensor` `np.ndarray`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch which will be used for conditioning. Can also accpet + image latents as `image`, if passing latents directly, it will not be encoded again. num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference. 
@@ -1179,7 +1195,7 @@ def invert( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Preprocess image - image = preprocess(image) + image = self.image_processor.preprocess(image) # 4. Prepare latent variables latents = self.prepare_image_latents(image, batch_size, self.vae.dtype, device, generator) @@ -1267,16 +1283,13 @@ def invert( inverted_latents = latents.detach().clone() # 8. Post-processing - image = self.decode_latents(latents.detach()) + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image = self.image_processor.postprocess(image, output_type=output_type) # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: self.final_offload_hook.offload() - # 9. Convert to PIL. - if output_type == "pil": - image = self.image_processor.numpy_to_pil(image) - if not return_dict: return (inverted_latents, image) diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index 36e5411b4215..ecc457b4cb94 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -1,4 +1,5 @@ import inspect +import warnings from dataclasses import dataclass from typing import Callable, List, Optional, Union @@ -34,6 +35,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.preprocess def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py index 6842d29dc6c0..1344d33a2552 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -40,6 +40,7 @@ class AltDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMix params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index b2312a4e94d0..9915998be24e 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -41,7 +41,9 @@ ) from ..pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_BATCH_PARAMS, + TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS, ) from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin @@ -99,7 +101,8 @@ class ControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin pipeline_class = StableDiffusionControlNetPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index 9d3b10aa8283..de8f578a3cce 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ 
b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -38,6 +38,7 @@ from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS, ) @@ -51,7 +52,8 @@ class ControlNetImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTest pipeline_class = StableDiffusionControlNetImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS.union({"control_image"}) + image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index f8cc881e8650..0f8808bcb728 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -40,6 +40,7 @@ from ..pipeline_params import ( TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, + TEXT_TO_IMAGE_IMAGE_PARAMS, ) from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin @@ -51,7 +52,8 @@ class ControlNetInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTest pipeline_class = StableDiffusionControlNetInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - image_params = frozenset([]) + image_params = frozenset({"control_image"}) # skip `image` and `mask` for now, only test for control_image + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py index a1ae3d2d0e7c..9a54c21c0a21 100644 --- a/tests/pipelines/stable_diffusion/test_cycle_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_cycle_diffusion.py @@ -25,7 +25,11 @@ from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps -from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin @@ -42,7 +46,8 @@ class CycleDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterM } required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"source_prompt"}) - image_params = frozenset([]) # TO_DO: add image_params once refactored VaeImageProcessor.preprocess + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) @@ -101,6 +106,7 @@ def get_dummy_components(self): def get_dummy_inputs(self, device, seed=0): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = image / 2 + 0.5 if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: diff --git 
a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index dafd00321527..93abe7ae58bc 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -93,6 +93,7 @@ class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTester params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index c35d84de9802..e16478f06112 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -47,6 +47,7 @@ class StableDiffusionImageVariationPipelineFastTests( batch_params = IMAGE_VARIATION_BATCH_PARAMS image_params = frozenset([]) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_latents_params = frozenset([]) def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index 33305d5980be..eefbc83ce9d7 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -32,7 +32,6 @@ StableDiffusionImg2ImgPipeline, UNet2DConditionModel, ) -from diffusers.image_processor import VaeImageProcessor from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device from diffusers.utils.testing_utils import ( enable_full_determinism, @@ -91,6 +90,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipelin required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) @@ -142,6 +142,7 @@ def get_dummy_components(self): def get_dummy_inputs(self, device, seed=0): image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = image / 2 + 0.5 if str(device).startswith("mps"): generator = torch.manual_seed(seed) else: @@ -160,12 +161,10 @@ def test_stable_diffusion_img2img_default_case(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - inputs["image"] = inputs["image"] / 2 + 0.5 image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] @@ -178,12 +177,10 @@ def test_stable_diffusion_img2img_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) sd_pipe = 
sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - inputs["image"] = inputs["image"] / 2 + 0.5 negative_prompt = "french fries" output = sd_pipe(**inputs, negative_prompt=negative_prompt) image = output.images @@ -198,14 +195,12 @@ def test_stable_diffusion_img2img_multiple_init_images(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) inputs["prompt"] = [inputs["prompt"]] * 2 inputs["image"] = inputs["image"].repeat(2, 1, 1, 1) - inputs["image"] = inputs["image"] / 2 + 0.5 image = sd_pipe(**inputs).images image_slice = image[-1, -3:, -3:, -1] @@ -221,12 +216,10 @@ def test_stable_diffusion_img2img_k_lms(self): beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear" ) sd_pipe = StableDiffusionImg2ImgPipeline(**components) - sd_pipe.image_processor = VaeImageProcessor(vae_scale_factor=sd_pipe.vae_scale_factor, do_normalize=True) sd_pipe = sd_pipe.to(device) sd_pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(device) - inputs["image"] = inputs["image"] / 2 + 0.5 image = sd_pipe(**inputs).images image_slice = image[0, -3:, -3:, -1] diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index 0cf4d711be4c..f761f245883f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -88,6 +88,7 @@ class StableDiffusionInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipelin batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS image_params = frozenset([]) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_latents_params = frozenset([]) def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index fbff6c554967..691427b1c6eb 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -31,10 +31,15 @@ StableDiffusionInstructPix2PixPipeline, UNet2DConditionModel, ) +from diffusers.image_processor import VaeImageProcessor from diffusers.utils import floats_tensor, load_image, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu -from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, + TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin @@ -47,9 +52,8 @@ class StableDiffusionInstructPix2PixPipelineFastTests( pipeline_class = StableDiffusionInstructPix2PixPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"} batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS - image_params = frozenset( - [] - ) # TO-DO: update image_params once 
pipeline is refactored with VaeImageProcessor.preprocess + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) @@ -163,6 +167,7 @@ def test_stable_diffusion_pix2pix_multiple_init_images(self): image = np.array(inputs["image"]).astype(np.float32) / 255.0 image = torch.from_numpy(image).unsqueeze(0).to(device) + image = image / 2 + 0.5 image = image.permute(0, 3, 1, 2) inputs["image"] = image.repeat(2, 1, 1, 1) @@ -199,6 +204,28 @@ def test_stable_diffusion_pix2pix_euler(self): def test_inference_batch_single_identical(self): super().test_inference_batch_single_identical(expected_max_diff=3e-3) + # Overwrite the default test_latents_inputs because pix2pix encode the image differently + def test_latents_input(self): + components = self.get_dummy_components() + pipe = StableDiffusionInstructPix2PixPipeline(**components) + pipe.image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + out = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pt"))[0] + + vae = components["vae"] + inputs = self.get_dummy_inputs_by_type(torch_device, input_image_type="pt") + + for image_param in self.image_latents_params: + if image_param in inputs.keys(): + inputs[image_param] = vae.encode(inputs[image_param]).latent_dist.mode() + + out_latents_inputs = pipe(**inputs)[0] + + max_diff = np.abs(out - out_latents_inputs).max() + self.assertLess(max_diff, 1e-4, "passing latents as image input generate different result from passing image") + @slow @require_torch_gpu diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index cba20417bca0..f47a70c4ece8 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -44,6 +44,7 @@ class StableDiffusionModelEditingPipelineFastTests(PipelineLatentTesterMixin, Pi params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 021065416838..c8d2bfa8c59d 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -45,6 +45,7 @@ class StableDiffusionPanoramaPipelineFastTests(PipelineLatentTesterMixin, Pipeli params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index 98f5910ab313..6f41d2c43c8e 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -32,11 +32,16 @@ StableDiffusionPix2PixZeroPipeline, UNet2DConditionModel, ) +from diffusers.image_processor import VaeImageProcessor from diffusers.utils import floats_tensor, load_numpy, slow, 
torch_device from diffusers.utils.testing_utils import enable_full_determinism, load_image, load_pt, require_torch_gpu, skip_mps -from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, + TEXT_TO_IMAGE_IMAGE_PARAMS, +) +from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference enable_full_determinism() @@ -45,11 +50,10 @@ @skip_mps class StableDiffusionPix2PixZeroPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): pipeline_class = StableDiffusionPix2PixZeroPipeline - params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"image"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = frozenset( - [] - ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS @classmethod def setUpClass(cls): @@ -130,6 +134,7 @@ def get_dummy_inputs(self, device, seed=0): def get_dummy_inversion_inputs(self, device, seed=0): dummy_image = floats_tensor((2, 3, 32, 32), rng=random.Random(seed)).to(torch_device) + dummy_image = dummy_image / 2 + 0.5 generator = torch.manual_seed(seed) inputs = { @@ -145,6 +150,24 @@ def get_dummy_inversion_inputs(self, device, seed=0): } return inputs + def get_dummy_inversion_inputs_by_type(self, device, seed=0, input_image_type="pt", output_type="np"): + inputs = self.get_dummy_inversion_inputs(device, seed) + + if input_image_type == "pt": + image = inputs["image"] + elif input_image_type == "np": + image = VaeImageProcessor.pt_to_numpy(inputs["image"]) + elif input_image_type == "pil": + image = VaeImageProcessor.pt_to_numpy(inputs["image"]) + image = VaeImageProcessor.numpy_to_pil(image) + else: + raise ValueError(f"unsupported input_image_type {input_image_type}") + + inputs["image"] = image + inputs["output_type"] = output_type + + return inputs + def test_save_load_optional_components(self): if not hasattr(self.pipeline_class, "_optional_components"): return @@ -281,6 +304,41 @@ def test_stable_diffusion_pix2pix_zero_ddpm(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + def test_stable_diffusion_pix2pix_zero_inversion_pt_np_pil_outputs_equivalent(self): + device = torch_device + components = self.get_dummy_components() + sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + output_pt = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, output_type="pt")).images + output_np = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, output_type="np")).images + output_pil = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, output_type="pil")).images + + max_diff = np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() + self.assertLess(max_diff, 1e-4, "`output_type=='pt'` generate different results from `output_type=='np'`") + + max_diff = np.abs(np.array(output_pil[0]) - (output_np[0] * 255).round()).max() + self.assertLess(max_diff, 2.0, "`output_type=='pil'` generate different results from `output_type=='np'`") + + def 
test_stable_diffusion_pix2pix_zero_inversion_pt_np_pil_inputs_equivalent(self): + device = torch_device + components = self.get_dummy_components() + sd_pipe = StableDiffusionPix2PixZeroPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + out_input_pt = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, input_image_type="pt")).images + out_input_np = sd_pipe.invert(**self.get_dummy_inversion_inputs_by_type(device, input_image_type="np")).images + out_input_pil = sd_pipe.invert( + **self.get_dummy_inversion_inputs_by_type(device, input_image_type="pil") + ).images + + max_diff = np.abs(out_input_pt - out_input_np).max() + self.assertLess(max_diff, 1e-4, "`input_type=='pt'` generate different result from `input_type=='np'`") + + assert_mean_pixel_difference(out_input_pil, out_input_np, expected_max_diff=1) + # Non-determinism caused by the scheduler optimizing the latent inputs during inference @unittest.skip("non-deterministic pipeline") def test_inference_batch_single_identical(self): diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py index 2b0f0bfc11a6..91719ce7676f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_sag.py @@ -41,6 +41,7 @@ class StableDiffusionSAGPipelineFastTests(PipelineLatentTesterMixin, PipelineTes params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS test_cpu_offload = False def get_dummy_components(self): diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 3f9867783b33..87a960c7d1a4 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -47,6 +47,7 @@ class StableDiffusion2PipelineFastTests(PipelineLatentTesterMixin, PipelineTeste params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index 6cec2cce752d..304ddacd2c36 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -45,6 +45,7 @@ class StableDiffusionAttendAndExcitePipelineFastTests( params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"token_indices"}) image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS # Attend and excite requires being able to run a backward pass at # inference time. 
There's no deterministic backward operator for pad diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index 08ac29868971..f393967c7de4 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -51,7 +51,12 @@ ) from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps -from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS +from ..pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, + TEXT_TO_IMAGE_IMAGE_PARAMS, +) from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin @@ -65,9 +70,8 @@ class StableDiffusionDepth2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS - image_params = frozenset( - [] - ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py index 8df5b6da846c..1de80d60d8e8 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py @@ -49,6 +49,7 @@ class StableDiffusionDiffEditPipelineFastTests(PipelineLatentTesterMixin, Pipeli image_params = frozenset( [] ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_latents_params = frozenset([]) def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index 10d8561f0126..37c254f367f3 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -40,6 +40,7 @@ class StableDiffusion2InpaintPipelineFastTests(PipelineLatentTesterMixin, Pipeli image_params = frozenset( [] ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_latents_params = frozenset([]) def get_dummy_components(self): torch.manual_seed(0) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index 561536a44ea0..b94aaca4258a 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -52,6 +52,7 @@ class StableDiffusionLatentUpscalePipelineFastTests(PipelineLatentTesterMixin, P image_params = frozenset( [] ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_latents_params = frozenset([]) test_cpu_offload = True diff --git a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py index 
8b4a065cd4bf..4bbbad757edf 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -27,6 +27,7 @@ class StableUnCLIPPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMix params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS # TODO(will) Expected attn_bias.stride(1) == 0 to be true, but got false test_xformers_attention = False diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index 35cae61242c4..741343066133 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -46,6 +46,7 @@ class StableUnCLIPImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTe image_params = frozenset( [] ) # TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess + image_latents_params = frozenset([]) def get_dummy_components(self): embedder_hidden_size = 32 diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 8ce0a0f283d7..fac04bdbe30f 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -8,6 +8,7 @@ from typing import Callable, Union import numpy as np +import PIL import torch import diffusers @@ -39,9 +40,28 @@ def image_params(self) -> frozenset: "`image_params` are tested for if all accepted input image types (i.e. `pt`,`pil`,`np`) are producing same results" ) + @property + def image_latents_params(self) -> frozenset: + raise NotImplementedError( + "You need to set the attribute `image_latents_params` in the child test class. 
" + "`image_latents_params` are tested for if passing latents directly are producing same results" + ) + def get_dummy_inputs_by_type(self, device, seed=0, input_image_type="pt", output_type="np"): inputs = self.get_dummy_inputs(device, seed) + def convert_to_pt(image): + if isinstance(image, torch.Tensor): + input_image = image + elif isinstance(image, np.ndarray): + input_image = VaeImageProcessor.numpy_to_pt(image) + elif isinstance(image, PIL.Image.Image): + input_image = VaeImageProcessor.pil_to_numpy(image) + input_image = VaeImageProcessor.numpy_to_pt(input_image) + else: + raise ValueError(f"unsupported input_image_type {type(image)}") + return input_image + def convert_pt_to_type(image, input_image_type): if input_image_type == "pt": input_image = image @@ -56,21 +76,32 @@ def convert_pt_to_type(image, input_image_type): for image_param in self.image_params: if image_param in inputs.keys(): - inputs[image_param] = convert_pt_to_type(inputs[image_param], input_image_type) + inputs[image_param] = convert_pt_to_type( + convert_to_pt(inputs[image_param]).to(device), input_image_type + ) inputs["output_type"] = output_type return inputs def test_pt_np_pil_outputs_equivalent(self, expected_max_diff=1e-4): + self._test_pt_np_pil_outputs_equivalent(expected_max_diff=expected_max_diff) + + def _test_pt_np_pil_outputs_equivalent(self, expected_max_diff=1e-4, input_image_type="pt"): components = self.get_dummy_components() pipe = self.pipeline_class(**components) pipe = pipe.to(torch_device) pipe.set_progress_bar_config(disable=None) - output_pt = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="pt"))[0] - output_np = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="np"))[0] - output_pil = pipe(**self.get_dummy_inputs_by_type(torch_device, output_type="pil"))[0] + output_pt = pipe( + **self.get_dummy_inputs_by_type(torch_device, input_image_type=input_image_type, output_type="pt") + )[0] + output_np = pipe( + **self.get_dummy_inputs_by_type(torch_device, input_image_type=input_image_type, output_type="np") + )[0] + output_pil = pipe( + **self.get_dummy_inputs_by_type(torch_device, input_image_type=input_image_type, output_type="pil") + )[0] max_diff = np.abs(output_pt.cpu().numpy().transpose(0, 2, 3, 1) - output_np).max() self.assertLess( @@ -98,6 +129,31 @@ def test_pt_np_pil_inputs_equivalent(self): max_diff = np.abs(out_input_pil - out_input_np).max() self.assertLess(max_diff, 1e-2, "`input_type=='pt'` generate different result from `input_type=='np'`") + def test_latents_input(self): + if len(self.image_latents_params) == 0: + return + + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.image_processor = VaeImageProcessor(do_resize=False, do_normalize=False) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + out = pipe(**self.get_dummy_inputs_by_type(torch_device, input_image_type="pt"))[0] + + vae = components["vae"] + inputs = self.get_dummy_inputs_by_type(torch_device, input_image_type="pt") + generator = inputs["generator"] + for image_param in self.image_latents_params: + if image_param in inputs.keys(): + inputs[image_param] = ( + vae.encode(inputs[image_param]).latent_dist.sample(generator) * vae.config.scaling_factor + ) + out_latents_inputs = pipe(**inputs)[0] + + max_diff = np.abs(out - out_latents_inputs).max() + self.assertLess(max_diff, 1e-4, "passing latents as image input generate different result from passing image") + @require_torch class PipelineTesterMixin: From 
462956be7b057ba1d156e9405289c39db56106bb Mon Sep 17 00:00:00 2001 From: Will Berman Date: Mon, 5 Jun 2023 10:24:31 -0700 Subject: [PATCH 081/199] small tweaks for parsing thibaudz controlnet checkpoints (#3657) --- ...onvert_original_controlnet_to_diffusers.py | 18 ++++ .../stable_diffusion/convert_from_ckpt.py | 99 +++++++++++++------ 2 files changed, 87 insertions(+), 30 deletions(-) diff --git a/scripts/convert_original_controlnet_to_diffusers.py b/scripts/convert_original_controlnet_to_diffusers.py index a9e05abd4cf1..9466bd27234c 100644 --- a/scripts/convert_original_controlnet_to_diffusers.py +++ b/scripts/convert_original_controlnet_to_diffusers.py @@ -75,6 +75,22 @@ ) parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)") + + # small workaround to get argparser to parse a boolean input as either true _or_ false + def parse_bool(string): + if string == "True": + return True + elif string == "False": + return False + else: + raise ValueError(f"could not parse string as bool {string}") + + parser.add_argument( + "--use_linear_projection", help="Override for use linear projection", required=False, type=parse_bool + ) + + parser.add_argument("--cross_attention_dim", help="Override for cross attention_dim", required=False, type=int) + args = parser.parse_args() controlnet = download_controlnet_from_original_ckpt( @@ -86,6 +102,8 @@ upcast_attention=args.upcast_attention, from_safetensors=args.from_safetensors, device=args.device, + use_linear_projection=args.use_linear_projection, + cross_attention_dim=args.cross_attention_dim, ) controlnet.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 7ba1bbd996db..e59b91e486f5 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -339,41 +339,46 @@ def create_ldm_bert_config(original_config): return config -def convert_ldm_unet_checkpoint(checkpoint, config, path=None, extract_ema=False, controlnet=False): +def convert_ldm_unet_checkpoint( + checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False +): """ Takes a state dict and a config, and returns a converted checkpoint. """ - # extract state_dict for UNet - unet_state_dict = {} - keys = list(checkpoint.keys()) - - if controlnet: - unet_key = "control_model." + if skip_extract_state_dict: + unet_state_dict = checkpoint else: - unet_key = "model.diffusion_model." - - # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA - if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: - print(f"Checkpoint {path} has both EMA and non-EMA weights.") - print( - "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" - " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." - ) - for key in keys: - if key.startswith("model.diffusion_model"): - flat_ema_key = "model_ema." 
+ "".join(key.split(".")[1:]) - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) - else: - if sum(k.startswith("model_ema") for k in keys) > 100: + # extract state_dict for UNet + unet_state_dict = {} + keys = list(checkpoint.keys()) + + if controlnet: + unet_key = "control_model." + else: + unet_key = "model.diffusion_model." + + # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA + if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema: + print(f"Checkpoint {path} has both EMA and non-EMA weights.") print( - "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" - " weights (usually better for inference), please make sure to add the `--extract_ema` flag." + "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA" + " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag." ) + for key in keys: + if key.startswith("model.diffusion_model"): + flat_ema_key = "model_ema." + "".join(key.split(".")[1:]) + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key) + else: + if sum(k.startswith("model_ema") for k in keys) > 100: + print( + "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA" + " weights (usually better for inference), please make sure to add the `--extract_ema` flag." + ) - for key in keys: - if key.startswith(unet_key): - unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) + for key in keys: + if key.startswith(unet_key): + unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(key) new_checkpoint = {} @@ -956,17 +961,42 @@ def stable_unclip_image_noising_components( def convert_controlnet_checkpoint( - checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema + checkpoint, + original_config, + checkpoint_path, + image_size, + upcast_attention, + extract_ema, + use_linear_projection=None, + cross_attention_dim=None, ): ctrlnet_config = create_unet_diffusers_config(original_config, image_size=image_size, controlnet=True) ctrlnet_config["upcast_attention"] = upcast_attention ctrlnet_config.pop("sample_size") + if use_linear_projection is not None: + ctrlnet_config["use_linear_projection"] = use_linear_projection + + if cross_attention_dim is not None: + ctrlnet_config["cross_attention_dim"] = cross_attention_dim + controlnet_model = ControlNetModel(**ctrlnet_config) + # Some controlnet ckpt files are distributed independently from the rest of the + # model components i.e. 
https://huggingface.co/thibaud/controlnet-sd21/ + if "time_embed.0.weight" in checkpoint: + skip_extract_state_dict = True + else: + skip_extract_state_dict = False + converted_ctrl_checkpoint = convert_ldm_unet_checkpoint( - checkpoint, ctrlnet_config, path=checkpoint_path, extract_ema=extract_ema, controlnet=True + checkpoint, + ctrlnet_config, + path=checkpoint_path, + extract_ema=extract_ema, + controlnet=True, + skip_extract_state_dict=skip_extract_state_dict, ) controlnet_model.load_state_dict(converted_ctrl_checkpoint) @@ -1344,6 +1374,8 @@ def download_controlnet_from_original_ckpt( upcast_attention: Optional[bool] = None, device: str = None, from_safetensors: bool = False, + use_linear_projection: Optional[bool] = None, + cross_attention_dim: Optional[bool] = None, ) -> DiffusionPipeline: if not is_omegaconf_available(): raise ValueError(BACKENDS_MAPPING["omegaconf"][1]) @@ -1381,7 +1413,14 @@ def download_controlnet_from_original_ckpt( raise ValueError("`control_stage_config` not present in original config") controlnet_model = convert_controlnet_checkpoint( - checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema + checkpoint, + original_config, + checkpoint_path, + image_size, + upcast_attention, + extract_ema, + use_linear_projection=use_linear_projection, + cross_attention_dim=cross_attention_dim, ) return controlnet_model From 41ae6708280ba84a4671d20fbb4d52f176083faa Mon Sep 17 00:00:00 2001 From: Will Berman Date: Mon, 5 Jun 2023 12:30:48 -0700 Subject: [PATCH 082/199] move activation dispatches into helper function (#3656) * move activation dispatches into helper function * tests --- src/diffusers/models/activations.py | 12 +++++ src/diffusers/models/attention.py | 15 +++--- src/diffusers/models/embeddings.py | 19 ++------ src/diffusers/models/resnet.py | 15 +----- src/diffusers/models/unet_1d_blocks.py | 36 +++++--------- src/diffusers/models/unet_2d_condition.py | 23 ++------- .../versatile_diffusion/modeling_text_unet.py | 22 ++------- tests/models/test_activations.py | 48 +++++++++++++++++++ 8 files changed, 89 insertions(+), 101 deletions(-) create mode 100644 src/diffusers/models/activations.py create mode 100644 tests/models/test_activations.py diff --git a/src/diffusers/models/activations.py b/src/diffusers/models/activations.py new file mode 100644 index 000000000000..64759b706e2f --- /dev/null +++ b/src/diffusers/models/activations.py @@ -0,0 +1,12 @@ +from torch import nn + + +def get_activation(act_fn): + if act_fn in ["swish", "silu"]: + return nn.SiLU() + elif act_fn == "mish": + return nn.Mish() + elif act_fn == "gelu": + return nn.GELU() + else: + raise ValueError(f"Unsupported activation function: {act_fn}") diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index a7a9a472d9e9..8805257ebe9a 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -18,6 +18,7 @@ from torch import nn from ..utils import maybe_allow_in_graph +from .activations import get_activation from .attention_processor import Attention from .embeddings import CombinedTimestepLabelEmbeddings @@ -345,15 +346,11 @@ def __init__( super().__init__() self.num_groups = num_groups self.eps = eps - self.act = None - if act_fn == "swish": - self.act = lambda x: F.silu(x) - elif act_fn == "mish": - self.act = nn.Mish() - elif act_fn == "silu": - self.act = nn.SiLU() - elif act_fn == "gelu": - self.act = nn.GELU() + + if act_fn is None: + self.act = None + else: + self.act = get_activation(act_fn) 
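# --- Illustrative sketch, not part of the patch: how the new `get_activation`
# helper (added in src/diffusers/models/activations.py above) replaces the
# per-module if/elif dispatch. Requires a diffusers build that contains this
# commit; the tensor shape below is an arbitrary placeholder.
import torch
from diffusers.models.activations import get_activation

act = get_activation("silu")      # returns nn.SiLU(); "swish" resolves to the same module
hidden = act(torch.randn(1, 8))   # use it like any other nn.Module
# get_activation("unknown") raises ValueError("Unsupported activation function: unknown")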
self.linear = nn.Linear(embedding_dim, out_dim * 2) diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py index 991264a9aa8f..4dd16f0dd5ff 100644 --- a/src/diffusers/models/embeddings.py +++ b/src/diffusers/models/embeddings.py @@ -18,6 +18,8 @@ import torch from torch import nn +from .activations import get_activation + def get_timestep_embedding( timesteps: torch.Tensor, @@ -171,14 +173,7 @@ def __init__( else: self.cond_proj = None - if act_fn == "silu": - self.act = nn.SiLU() - elif act_fn == "mish": - self.act = nn.Mish() - elif act_fn == "gelu": - self.act = nn.GELU() - else: - raise ValueError(f"{act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'") + self.act = get_activation(act_fn) if out_dim is not None: time_embed_dim_out = out_dim @@ -188,14 +183,8 @@ def __init__( if post_act_fn is None: self.post_act = None - elif post_act_fn == "silu": - self.post_act = nn.SiLU() - elif post_act_fn == "mish": - self.post_act = nn.Mish() - elif post_act_fn == "gelu": - self.post_act = nn.GELU() else: - raise ValueError(f"{post_act_fn} does not exist. Make sure to define one of 'silu', 'mish', or 'gelu'") + self.post_act = get_activation(post_act_fn) def forward(self, sample, condition=None): if condition is not None: diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 3380a4909372..52f01552c528 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -20,6 +20,7 @@ import torch.nn as nn import torch.nn.functional as F +from .activations import get_activation from .attention import AdaGroupNorm from .attention_processor import SpatialNorm @@ -558,14 +559,7 @@ def __init__( conv_2d_out_channels = conv_2d_out_channels or out_channels self.conv2 = torch.nn.Conv2d(out_channels, conv_2d_out_channels, kernel_size=3, stride=1, padding=1) - if non_linearity == "swish": - self.nonlinearity = lambda x: F.silu(x) - elif non_linearity == "mish": - self.nonlinearity = nn.Mish() - elif non_linearity == "silu": - self.nonlinearity = nn.SiLU() - elif non_linearity == "gelu": - self.nonlinearity = nn.GELU() + self.nonlinearity = get_activation(non_linearity) self.upsample = self.downsample = None if self.up: @@ -646,11 +640,6 @@ def forward(self, input_tensor, temb): return output_tensor -class Mish(torch.nn.Module): - def forward(self, hidden_states): - return hidden_states * torch.tanh(torch.nn.functional.softplus(hidden_states)) - - # unet_rl.py def rearrange_dims(tensor): if len(tensor.shape) == 2: diff --git a/src/diffusers/models/unet_1d_blocks.py b/src/diffusers/models/unet_1d_blocks.py index 934a4a4a7dcb..3c04bffeeacc 100644 --- a/src/diffusers/models/unet_1d_blocks.py +++ b/src/diffusers/models/unet_1d_blocks.py @@ -17,6 +17,7 @@ import torch.nn.functional as F from torch import nn +from .activations import get_activation from .resnet import Downsample1D, ResidualTemporalBlock1D, Upsample1D, rearrange_dims @@ -55,14 +56,10 @@ def __init__( self.resnets = nn.ModuleList(resnets) - if non_linearity == "swish": - self.nonlinearity = lambda x: F.silu(x) - elif non_linearity == "mish": - self.nonlinearity = nn.Mish() - elif non_linearity == "silu": - self.nonlinearity = nn.SiLU() - else: + if non_linearity is None: self.nonlinearity = None + else: + self.nonlinearity = get_activation(non_linearity) self.downsample = None if add_downsample: @@ -119,14 +116,10 @@ def __init__( self.resnets = nn.ModuleList(resnets) - if non_linearity == "swish": - self.nonlinearity = lambda x: F.silu(x) - elif 
non_linearity == "mish": - self.nonlinearity = nn.Mish() - elif non_linearity == "silu": - self.nonlinearity = nn.SiLU() - else: + if non_linearity is None: self.nonlinearity = None + else: + self.nonlinearity = get_activation(non_linearity) self.upsample = None if add_upsample: @@ -194,14 +187,10 @@ def __init__( self.resnets = nn.ModuleList(resnets) - if non_linearity == "swish": - self.nonlinearity = lambda x: F.silu(x) - elif non_linearity == "mish": - self.nonlinearity = nn.Mish() - elif non_linearity == "silu": - self.nonlinearity = nn.SiLU() - else: + if non_linearity is None: self.nonlinearity = None + else: + self.nonlinearity = get_activation(non_linearity) self.upsample = None if add_upsample: @@ -232,10 +221,7 @@ def __init__(self, num_groups_out, out_channels, embed_dim, act_fn): super().__init__() self.final_conv1d_1 = nn.Conv1d(embed_dim, embed_dim, 5, padding=2) self.final_conv1d_gn = nn.GroupNorm(num_groups_out, embed_dim) - if act_fn == "silu": - self.final_conv1d_act = nn.SiLU() - if act_fn == "mish": - self.final_conv1d_act = nn.Mish() + self.final_conv1d_act = get_activation(act_fn) self.final_conv1d_2 = nn.Conv1d(embed_dim, out_channels, 1) def forward(self, hidden_states, temb=None): diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 106346070d94..dda21fd80479 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -16,12 +16,12 @@ import torch import torch.nn as nn -import torch.nn.functional as F import torch.utils.checkpoint from ..configuration_utils import ConfigMixin, register_to_config from ..loaders import UNet2DConditionLoadersMixin from ..utils import BaseOutput, logging +from .activations import get_activation from .attention_processor import AttentionProcessor, AttnProcessor from .embeddings import ( GaussianFourierProjection, @@ -338,16 +338,8 @@ def __init__( if time_embedding_act_fn is None: self.time_embed_act = None - elif time_embedding_act_fn == "swish": - self.time_embed_act = lambda x: F.silu(x) - elif time_embedding_act_fn == "mish": - self.time_embed_act = nn.Mish() - elif time_embedding_act_fn == "silu": - self.time_embed_act = nn.SiLU() - elif time_embedding_act_fn == "gelu": - self.time_embed_act = nn.GELU() else: - raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}") + self.time_embed_act = get_activation(time_embedding_act_fn) self.down_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) @@ -501,16 +493,7 @@ def __init__( num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps ) - if act_fn == "swish": - self.conv_act = lambda x: F.silu(x) - elif act_fn == "mish": - self.conv_act = nn.Mish() - elif act_fn == "silu": - self.conv_act = nn.SiLU() - elif act_fn == "gelu": - self.conv_act = nn.GELU() - else: - raise ValueError(f"Unsupported activation function: {act_fn}") + self.conv_act = get_activation(act_fn) else: self.conv_norm_out = None diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index a0dbdaa75230..f11729451299 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -7,6 +7,7 @@ from ...configuration_utils import ConfigMixin, register_to_config from ...models import ModelMixin +from ...models.activations import get_activation from ...models.attention import Attention from 
...models.attention_processor import ( AttentionProcessor, @@ -441,16 +442,8 @@ def __init__( if time_embedding_act_fn is None: self.time_embed_act = None - elif time_embedding_act_fn == "swish": - self.time_embed_act = lambda x: F.silu(x) - elif time_embedding_act_fn == "mish": - self.time_embed_act = nn.Mish() - elif time_embedding_act_fn == "silu": - self.time_embed_act = nn.SiLU() - elif time_embedding_act_fn == "gelu": - self.time_embed_act = nn.GELU() else: - raise ValueError(f"Unsupported activation function: {time_embedding_act_fn}") + self.time_embed_act = get_activation(time_embedding_act_fn) self.down_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) @@ -604,16 +597,7 @@ def __init__( num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps ) - if act_fn == "swish": - self.conv_act = lambda x: F.silu(x) - elif act_fn == "mish": - self.conv_act = nn.Mish() - elif act_fn == "silu": - self.conv_act = nn.SiLU() - elif act_fn == "gelu": - self.conv_act = nn.GELU() - else: - raise ValueError(f"Unsupported activation function: {act_fn}") + self.conv_act = get_activation(act_fn) else: self.conv_norm_out = None diff --git a/tests/models/test_activations.py b/tests/models/test_activations.py new file mode 100644 index 000000000000..4e8e51453e98 --- /dev/null +++ b/tests/models/test_activations.py @@ -0,0 +1,48 @@ +import unittest + +import torch +from torch import nn + +from diffusers.models.activations import get_activation + + +class ActivationsTests(unittest.TestCase): + def test_swish(self): + act = get_activation("swish") + + self.assertIsInstance(act, nn.SiLU) + + self.assertEqual(act(torch.tensor(-100, dtype=torch.float32)).item(), 0) + self.assertNotEqual(act(torch.tensor(-1, dtype=torch.float32)).item(), 0) + self.assertEqual(act(torch.tensor(0, dtype=torch.float32)).item(), 0) + self.assertEqual(act(torch.tensor(20, dtype=torch.float32)).item(), 20) + + def test_silu(self): + act = get_activation("silu") + + self.assertIsInstance(act, nn.SiLU) + + self.assertEqual(act(torch.tensor(-100, dtype=torch.float32)).item(), 0) + self.assertNotEqual(act(torch.tensor(-1, dtype=torch.float32)).item(), 0) + self.assertEqual(act(torch.tensor(0, dtype=torch.float32)).item(), 0) + self.assertEqual(act(torch.tensor(20, dtype=torch.float32)).item(), 20) + + def test_mish(self): + act = get_activation("mish") + + self.assertIsInstance(act, nn.Mish) + + self.assertEqual(act(torch.tensor(-200, dtype=torch.float32)).item(), 0) + self.assertNotEqual(act(torch.tensor(-1, dtype=torch.float32)).item(), 0) + self.assertEqual(act(torch.tensor(0, dtype=torch.float32)).item(), 0) + self.assertEqual(act(torch.tensor(20, dtype=torch.float32)).item(), 20) + + def test_gelu(self): + act = get_activation("gelu") + + self.assertIsInstance(act, nn.GELU) + + self.assertEqual(act(torch.tensor(-100, dtype=torch.float32)).item(), 0) + self.assertNotEqual(act(torch.tensor(-1, dtype=torch.float32)).item(), 0) + self.assertEqual(act(torch.tensor(0, dtype=torch.float32)).item(), 0) + self.assertEqual(act(torch.tensor(20, dtype=torch.float32)).item(), 20) From a8b0f42c38ad3bb2b7203aee3af66d58b3d189f7 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 6 Jun 2023 01:07:47 -0700 Subject: [PATCH 083/199] [docs] Fix link to loader method (#3680) fix link to load_lora_weights --- docs/source/en/using-diffusers/other-formats.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/using-diffusers/other-formats.mdx 
b/docs/source/en/using-diffusers/other-formats.mdx index 1b2ce4bfc610..8e606f13469d 100644 --- a/docs/source/en/using-diffusers/other-formats.mdx +++ b/docs/source/en/using-diffusers/other-formats.mdx @@ -127,7 +127,7 @@ image = pipeline(prompt, num_inference_steps=50).images[0] ## A1111 LoRA files -[Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) (A1111) is a popular web UI for Stable Diffusion that supports model sharing platforms like [Civitai](https://civitai.com/). Models trained with the Low-Rank Adaptation (LoRA) technique are especially popular because they're fast to train and have a much smaller file size than a fully finetuned model. 🤗 Diffusers supports loading A1111 LoRA checkpoints with [`~LoraLoaderMixin.load_lora_weights`]: +[Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) (A1111) is a popular web UI for Stable Diffusion that supports model sharing platforms like [Civitai](https://civitai.com/). Models trained with the Low-Rank Adaptation (LoRA) technique are especially popular because they're fast to train and have a much smaller file size than a fully finetuned model. 🤗 Diffusers supports loading A1111 LoRA checkpoints with [`~loaders.LoraLoaderMixin.load_lora_weights`]: ```py from diffusers import DiffusionPipeline, UniPCMultistepScheduler @@ -145,7 +145,7 @@ Download a LoRA checkpoint from Civitai; this example uses the [Howls Moving Cas !wget https://civitai.com/api/download/models/19998 -O howls_moving_castle.safetensors ``` -Load the LoRA checkpoint into the pipeline with the [`~LoraLoaderMixin.load_lora_weights`] method: +Load the LoRA checkpoint into the pipeline with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method: ```py pipeline.load_lora_weights(".", weight_name="howls_moving_castle.safetensors") From b45204ea5aa0160d343c79bfb19ec9ceda637a5b Mon Sep 17 00:00:00 2001 From: Takuma Mori Date: Tue, 6 Jun 2023 17:36:13 +0900 Subject: [PATCH 084/199] Add function to remove monkey-patch for text encoder LoRA (#3649) * merge undoable-monkeypatch * remove TEXT_ENCODER_TARGET_MODULES, refactoring * move create_lora_weight_file --- src/diffusers/loaders.py | 81 +++++++++++++++++++------------- src/diffusers/utils/__init__.py | 1 - src/diffusers/utils/constants.py | 1 - tests/models/test_lora_layers.py | 56 ++++++++++++++++++---- 4 files changed, 97 insertions(+), 42 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 3c8081ccbbbd..ab0f1418e615 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -34,7 +34,7 @@ from .utils import ( DIFFUSERS_CACHE, HF_HUB_OFFLINE, - TEXT_ENCODER_TARGET_MODULES, + TEXT_ENCODER_ATTN_MODULE, _get_model_file, deprecate, is_safetensors_available, @@ -955,6 +955,19 @@ def text_encoder_lora_attn_procs(self): return self._text_encoder_lora_attn_procs return + def _remove_text_encoder_monkey_patch(self): + # Loop over the CLIPAttention module of text_encoder + for name, attn_module in self.text_encoder.named_modules(): + if name.endswith(TEXT_ENCODER_ATTN_MODULE): + # Loop over the LoRA layers + for _, text_encoder_attr in self._lora_attn_processor_attr_to_text_encoder_attr.items(): + # Retrieve the q/k/v/out projection of CLIPAttention + module = attn_module.get_submodule(text_encoder_attr) + if hasattr(module, "old_forward"): + # restore original `forward` to remove monkey-patch + module.forward = module.old_forward + delattr(module, "old_forward") + def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): r""" 
Monkey-patches the forward passes of attention modules of the text encoder. @@ -963,37 +976,41 @@ def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): attn_processors: Dict[str, `LoRAAttnProcessor`]: A dictionary mapping the module names and their corresponding [`~LoRAAttnProcessor`]. """ - # Loop over the original attention modules. - for name, _ in self.text_encoder.named_modules(): - if any(x in name for x in TEXT_ENCODER_TARGET_MODULES): - # Retrieve the module and its corresponding LoRA processor. - module = self.text_encoder.get_submodule(name) - # Construct a new function that performs the LoRA merging. We will monkey patch - # this forward pass. - attn_processor_name = ".".join(name.split(".")[:-1]) - lora_layer = getattr(attn_processors[attn_processor_name], self._get_lora_layer_attribute(name)) - old_forward = module.forward - - # create a new scope that locks in the old_forward, lora_layer value for each new_forward function - # for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060 - def make_new_forward(old_forward, lora_layer): - def new_forward(x): - return old_forward(x) + lora_layer(x) - - return new_forward - - # Monkey-patch. - module.forward = make_new_forward(old_forward, lora_layer) - - def _get_lora_layer_attribute(self, name: str) -> str: - if "q_proj" in name: - return "to_q_lora" - elif "v_proj" in name: - return "to_v_lora" - elif "k_proj" in name: - return "to_k_lora" - else: - return "to_out_lora" + + # First, remove any monkey-patch that might have been applied before + self._remove_text_encoder_monkey_patch() + + # Loop over the CLIPAttention module of text_encoder + for name, attn_module in self.text_encoder.named_modules(): + if name.endswith(TEXT_ENCODER_ATTN_MODULE): + # Loop over the LoRA layers + for attn_proc_attr, text_encoder_attr in self._lora_attn_processor_attr_to_text_encoder_attr.items(): + # Retrieve the q/k/v/out projection of CLIPAttention and its corresponding LoRA layer. + module = attn_module.get_submodule(text_encoder_attr) + lora_layer = attn_processors[name].get_submodule(attn_proc_attr) + + # save old_forward to module that can be used to remove monkey-patch + old_forward = module.old_forward = module.forward + + # create a new scope that locks in the old_forward, lora_layer value for each new_forward function + # for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060 + def make_new_forward(old_forward, lora_layer): + def new_forward(x): + return old_forward(x) + lora_layer(x) + + return new_forward + + # Monkey-patch. 
+ module.forward = make_new_forward(old_forward, lora_layer) + + @property + def _lora_attn_processor_attr_to_text_encoder_attr(self): + return { + "to_q_lora": "q_proj", + "to_k_lora": "k_proj", + "to_v_lora": "v_proj", + "to_out_lora": "out_proj", + } def _load_text_encoder_attn_procs( self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 772c36b1177b..36cbe82f79e7 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ -31,7 +31,6 @@ ONNX_WEIGHTS_NAME, SAFETENSORS_WEIGHTS_NAME, TEXT_ENCODER_ATTN_MODULE, - TEXT_ENCODER_TARGET_MODULES, WEIGHTS_NAME, ) from .deprecation_utils import deprecate diff --git a/src/diffusers/utils/constants.py b/src/diffusers/utils/constants.py index 93d5c8cc42cd..3c641a259a81 100644 --- a/src/diffusers/utils/constants.py +++ b/src/diffusers/utils/constants.py @@ -30,5 +30,4 @@ DIFFUSERS_DYNAMIC_MODULE_NAME = "diffusers_modules" HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules")) DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"] -TEXT_ENCODER_TARGET_MODULES = ["q_proj", "v_proj", "k_proj", "out_proj"] TEXT_ENCODER_ATTN_MODULE = ".self_attn" diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index d04d87e08b7a..52826fc0c736 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -163,6 +163,15 @@ def get_dummy_inputs(self): return noise, input_ids, pipeline_inputs + def create_lora_weight_file(self, tmpdirname): + _, lora_components = self.get_dummy_components() + LoraLoaderMixin.save_lora_weights( + save_directory=tmpdirname, + unet_lora_layers=lora_components["unet_lora_layers"], + text_encoder_lora_layers=lora_components["text_encoder_lora_layers"], + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + def test_lora_save_load(self): pipeline_components, lora_components = self.get_dummy_components() sd_pipe = StableDiffusionPipeline(**pipeline_components) @@ -299,14 +308,45 @@ def test_text_encoder_lora_monkey_patch(self): outputs_without_lora, outputs_with_lora ), "lora_up_weight are not zero, so the lora outputs should be different to without lora outputs" - def create_lora_weight_file(self, tmpdirname): - _, lora_components = self.get_dummy_components() - LoraLoaderMixin.save_lora_weights( - save_directory=tmpdirname, - unet_lora_layers=lora_components["unet_lora_layers"], - text_encoder_lora_layers=lora_components["text_encoder_lora_layers"], - ) - self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + def test_text_encoder_lora_remove_monkey_patch(self): + pipeline_components, _ = self.get_dummy_components() + pipe = StableDiffusionPipeline(**pipeline_components) + + dummy_tokens = self.get_dummy_tokens() + + # inference without lora + outputs_without_lora = pipe.text_encoder(**dummy_tokens)[0] + assert outputs_without_lora.shape == (1, 77, 32) + + # create lora_attn_procs with randn up.weights + text_attn_procs = create_text_encoder_lora_attn_procs(pipe.text_encoder) + set_lora_up_weights(text_attn_procs, randn_weight=True) + + # monkey patch + pipe._modify_text_encoder(text_attn_procs) + + # verify that it's okay to release the text_attn_procs which holds the LoRAAttnProcessor. 
+ del text_attn_procs + gc.collect() + + # inference with lora + outputs_with_lora = pipe.text_encoder(**dummy_tokens)[0] + assert outputs_with_lora.shape == (1, 77, 32) + + assert not torch.allclose( + outputs_without_lora, outputs_with_lora + ), "lora outputs should be different to without lora outputs" + + # remove monkey patch + pipe._remove_text_encoder_monkey_patch() + + # inference with removed lora + outputs_without_lora_removed = pipe.text_encoder(**dummy_tokens)[0] + assert outputs_without_lora_removed.shape == (1, 77, 32) + + assert torch.allclose( + outputs_without_lora, outputs_without_lora_removed + ), "remove lora monkey patch should restore the original outputs" def test_lora_unet_attn_processors(self): with tempfile.TemporaryDirectory() as tmpdirname: From 8669e8313dfedd5da1fd84e8abab92eda82623c0 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 6 Jun 2023 14:56:05 +0530 Subject: [PATCH 085/199] [LoRA] feat: add lora attention processor for pt 2.0. (#3594) * feat: add lora attention processor for pt 2.0. * explicit context manager for SDPA. * switch to flash attention * make shapes compatible to work optimally with SDPA. * fix: circular import problem. * explicitly specify the flash attention kernel in sdpa * fall back to efficient attention context manager. * remove explicit dispatch. * fix: removed processor. * fix: remove optional from type annotation. * feat: make changes regarding LoRAAttnProcessor2_0. * remove confusing warning. * formatting. * relax tolerance for PT 2.0 * fix: loading message. * remove unnecessary logging. * add: entry to the docs. * add: network_alpha argument. * relax tolerance. --- docs/source/en/api/attnprocessor.mdx | 3 + examples/dreambooth/train_dreambooth_lora.py | 6 +- src/diffusers/loaders.py | 8 +- src/diffusers/models/attention_processor.py | 117 ++++++++++++++++-- tests/models/test_lora_layers.py | 19 ++- tests/models/test_models_unet_3d_condition.py | 4 +- 6 files changed, 137 insertions(+), 20 deletions(-) diff --git a/docs/source/en/api/attnprocessor.mdx b/docs/source/en/api/attnprocessor.mdx index ead639feffe0..7a4812e0961e 100644 --- a/docs/source/en/api/attnprocessor.mdx +++ b/docs/source/en/api/attnprocessor.mdx @@ -11,6 +11,9 @@ An attention processor is a class for applying different types of attention mech ## LoRAAttnProcessor [[autodoc]] models.attention_processor.LoRAAttnProcessor +## LoRAAttnProcessor2_0 +[[autodoc]] models.attention_processor.LoRAAttnProcessor2_0 + ## CustomDiffusionAttnProcessor [[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index ca25152fcb1c..3accc4265787 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -55,6 +55,7 @@ AttnAddedKVProcessor2_0, LoRAAttnAddedKVProcessor, LoRAAttnProcessor, + LoRAAttnProcessor2_0, SlicedAttnAddedKVProcessor, ) from diffusers.optimization import get_scheduler @@ -844,8 +845,9 @@ def main(args): if isinstance(attn_processor, (AttnAddedKVProcessor, SlicedAttnAddedKVProcessor, AttnAddedKVProcessor2_0)): lora_attn_processor_class = LoRAAttnAddedKVProcessor else: - lora_attn_processor_class = LoRAAttnProcessor - + lora_attn_processor_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) unet_lora_attn_procs[name] = lora_attn_processor_class( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim ) diff --git 
a/src/diffusers/loaders.py b/src/diffusers/loaders.py index ab0f1418e615..684a2ba710b9 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -18,6 +18,7 @@ from typing import Callable, Dict, List, Optional, Union import torch +import torch.nn.functional as F from huggingface_hub import hf_hub_download from .models.attention_processor import ( @@ -27,6 +28,7 @@ CustomDiffusionXFormersAttnProcessor, LoRAAttnAddedKVProcessor, LoRAAttnProcessor, + LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, SlicedAttnAddedKVProcessor, XFormersAttnProcessor, @@ -287,7 +289,9 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict if isinstance(attn_processor, (XFormersAttnProcessor, LoRAXFormersAttnProcessor)): attn_processor_class = LoRAXFormersAttnProcessor else: - attn_processor_class = LoRAAttnProcessor + attn_processor_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) attn_processors[key] = attn_processor_class( hidden_size=hidden_size, @@ -927,11 +931,11 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di # Load the layers corresponding to text encoder and make necessary adjustments. text_encoder_keys = [k for k in keys if k.startswith(self.text_encoder_name)] - logger.info(f"Loading {self.text_encoder_name}.") text_encoder_lora_state_dict = { k.replace(f"{self.text_encoder_name}.", ""): v for k, v in state_dict.items() if k in text_encoder_keys } if len(text_encoder_lora_state_dict) > 0: + logger.info(f"Loading {self.text_encoder_name}.") attn_procs_text_encoder = self._load_text_encoder_attn_procs( text_encoder_lora_state_dict, network_alpha=network_alpha ) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 61a1faea07f4..e0404a83cc9a 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import warnings from typing import Callable, Optional, Union import torch @@ -166,7 +165,8 @@ def set_use_memory_efficient_attention_xformers( self, use_memory_efficient_attention_xformers: bool, attention_op: Optional[Callable] = None ): is_lora = hasattr(self, "processor") and isinstance( - self.processor, (LoRAAttnProcessor, LoRAXFormersAttnProcessor, LoRAAttnAddedKVProcessor) + self.processor, + (LoRAAttnProcessor, LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, LoRAAttnAddedKVProcessor), ) is_custom_diffusion = hasattr(self, "processor") and isinstance( self.processor, (CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor) @@ -200,14 +200,6 @@ def set_use_memory_efficient_attention_xformers( "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is" " only available for GPU " ) - elif hasattr(F, "scaled_dot_product_attention") and self.scale_qk: - warnings.warn( - "You have specified using flash attention using xFormers but you have PyTorch 2.0 already installed. " - "We will default to PyTorch's native efficient flash attention implementation (`F.scaled_dot_product_attention`) " - "introduced in PyTorch 2.0. In case you are using LoRA or Custom Diffusion, we will fall " - "back to their respective attention processors i.e., we will NOT use the PyTorch 2.0 " - "native efficient flash attention." 
- ) else: try: # Make sure we can run the memory efficient attention @@ -220,6 +212,8 @@ def set_use_memory_efficient_attention_xformers( raise e if is_lora: + # TODO (sayakpaul): should we throw a warning if someone wants to use the xformers + # variant when using PT 2.0 now that we have LoRAAttnProcessor2_0? processor = LoRAXFormersAttnProcessor( hidden_size=self.processor.hidden_size, cross_attention_dim=self.processor.cross_attention_dim, @@ -252,7 +246,10 @@ def set_use_memory_efficient_attention_xformers( processor = XFormersAttnProcessor(attention_op=attention_op) else: if is_lora: - processor = LoRAAttnProcessor( + attn_processor_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) + processor = attn_processor_class( hidden_size=self.processor.hidden_size, cross_attention_dim=self.processor.cross_attention_dim, rank=self.processor.rank, @@ -548,6 +545,8 @@ class LoRAAttnProcessor(nn.Module): The number of channels in the `encoder_hidden_states`. rank (`int`, defaults to 4): The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. """ def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None): @@ -843,6 +842,7 @@ class LoRAAttnAddedKVProcessor(nn.Module): The number of channels in the `encoder_hidden_states`. rank (`int`, defaults to 4): The dimension of the LoRA update matrices. + """ def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None): @@ -1162,6 +1162,9 @@ class LoRAXFormersAttnProcessor(nn.Module): [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best operator. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + """ def __init__( @@ -1236,6 +1239,97 @@ def __call__( return hidden_states +class LoRAAttnProcessor2_0(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism using PyTorch 2.0's memory-efficient scaled dot-product + attention. + + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. 
+ """ + + def __init__(self, hidden_size, cross_attention_dim=None, rank=4, network_alpha=None): + super().__init__() + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + + def __call__(self, attn: Attention, hidden_states, encoder_hidden_states=None, attention_mask=None, scale=1.0): + residual = hidden_states + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + inner_dim = hidden_states.shape[-1] + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + scale * self.to_q_lora(hidden_states) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + scale * self.to_k_lora(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + scale * self.to_v_lora(encoder_hidden_states) + + head_dim = inner_dim // attn.heads + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + scale * self.to_out_lora(hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + class CustomDiffusionXFormersAttnProcessor(nn.Module): r""" Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method. 
@@ -1520,6 +1614,7 @@ def __call__(self, attn: "Attention", hidden_states, encoder_hidden_states=None, XFormersAttnAddedKVProcessor, LoRAAttnProcessor, LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, LoRAAttnAddedKVProcessor, CustomDiffusionAttnProcessor, CustomDiffusionXFormersAttnProcessor, diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 52826fc0c736..2b10955d23f2 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -19,6 +19,7 @@ import torch import torch.nn as nn +import torch.nn.functional as F from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer from diffusers import AutoencoderKL, DDIMScheduler, StableDiffusionPipeline, UNet2DConditionModel @@ -28,6 +29,7 @@ AttnProcessor, AttnProcessor2_0, LoRAAttnProcessor, + LoRAAttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor, ) @@ -46,16 +48,24 @@ def create_unet_lora_layers(unet: nn.Module): elif name.startswith("down_blocks"): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) + lora_attn_processor_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) + lora_attn_procs[name] = lora_attn_processor_class( + hidden_size=hidden_size, cross_attention_dim=cross_attention_dim + ) unet_lora_layers = AttnProcsLayers(lora_attn_procs) return lora_attn_procs, unet_lora_layers def create_text_encoder_lora_attn_procs(text_encoder: nn.Module): text_lora_attn_procs = {} + lora_attn_processor_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) for name, module in text_encoder.named_modules(): if name.endswith(TEXT_ENCODER_ATTN_MODULE): - text_lora_attn_procs[name] = LoRAAttnProcessor( + text_lora_attn_procs[name] = lora_attn_processor_class( hidden_size=module.out_proj.out_features, cross_attention_dim=None ) return text_lora_attn_procs @@ -368,7 +378,10 @@ def test_lora_unet_attn_processors(self): # check if lora attention processors are used for _, module in sd_pipe.unet.named_modules(): if isinstance(module, Attention): - self.assertIsInstance(module.processor, LoRAAttnProcessor) + attn_proc_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) + self.assertIsInstance(module.processor, attn_proc_class) @unittest.skipIf(torch_device != "cuda", "This test is supposed to run on GPU") def test_lora_unet_attn_processors_with_xformers(self): diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py index 928f6bcbe960..762c4975da51 100644 --- a/tests/models/test_models_unet_3d_condition.py +++ b/tests/models/test_models_unet_3d_condition.py @@ -261,7 +261,7 @@ def test_lora_save_load(self): with torch.no_grad(): new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - assert (sample - new_sample).abs().max() < 1e-4 + assert (sample - new_sample).abs().max() < 5e-4 # LoRA and no LoRA should NOT be the same assert (sample - old_sample).abs().max() > 1e-4 @@ -295,7 +295,7 @@ def test_lora_save_load_safetensors(self): with torch.no_grad(): new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - assert (sample - new_sample).abs().max() < 1e-4 + assert (sample - new_sample).abs().max() < 3e-4 # LoRA and no LoRA should NOT be the same assert 
(sample - old_sample).abs().max() > 1e-4 From 017ee1609b7b0559db0dd1f47bbdd39bb4137046 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Tue, 6 Jun 2023 10:08:36 -1000 Subject: [PATCH 086/199] refactor Image processor for x4 upscaler (#3692) * refactor x4 upscaler * style * copies --------- Co-authored-by: yiyixuxu --- ...ipeline_stable_diffusion_latent_upscale.py | 5 ++ .../pipeline_stable_diffusion_upscale.py | 58 ++++++++++++------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py index e0fecf6d353f..d67a7f894886 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_latent_upscale.py @@ -33,6 +33,11 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_upscale.preprocess def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 6bb463a6a65f..4c4f3998cb91 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -21,6 +21,7 @@ import torch from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from ...image_processor import VaeImageProcessor from ...loaders import TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import AttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor @@ -34,6 +35,11 @@ def preprocess(image): + warnings.warn( + "The preprocess method is deprecated and will be removed in a future version. 
Please" + " use VaeImageProcessor.preprocess instead", + FutureWarning, + ) if isinstance(image, torch.Tensor): return image elif isinstance(image, PIL.Image.Image): @@ -125,6 +131,8 @@ def __init__( watermarker=watermarker, feature_extractor=feature_extractor, ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, resample="bicubic") self.register_to_config(max_noise_level=max_noise_level) def enable_sequential_cpu_offload(self, gpu_id=0): @@ -432,14 +440,15 @@ def check_inputs( if ( not isinstance(image, torch.Tensor) and not isinstance(image, PIL.Image.Image) + and not isinstance(image, np.ndarray) and not isinstance(image, list) ): raise ValueError( - f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or `list` but is {type(image)}" + f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}" ) - # verify batch size of prompt and image are same if image is a list or tensor - if isinstance(image, list) or isinstance(image, torch.Tensor): + # verify batch size of prompt and image are same if image is a list or tensor or numpy array + if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray): if isinstance(prompt, str): batch_size = 1 else: @@ -483,7 +492,14 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image, List[PIL.Image.Image]] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, num_inference_steps: int = 75, guidance_scale: float = 9.0, noise_level: int = 20, @@ -506,7 +522,7 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`PIL.Image.Image` or List[`PIL.Image.Image`] or `torch.FloatTensor`): + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): `Image`, or tensor representing an image batch which will be upscaled. * num_inference_steps (`int`, *optional*, defaults to 50): The number of denoising steps. More denoising steps usually lead to a higher quality image at the @@ -627,7 +643,7 @@ def __call__( ) # 4. Preprocess image - image = preprocess(image) + image = self.image_processor.preprocess(image) image = image.to(dtype=prompt_embeds.dtype, device=device) # 5. set timesteps @@ -723,25 +739,25 @@ def __call__( else: latents = latents.float() - # 11. Convert to PIL - if output_type == "pil": - image = self.decode_latents(latents) - + # post-processing + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] image, has_nsfw_concept, _ = self.run_safety_checker(image, device, prompt_embeds.dtype) - - image = self.numpy_to_pil(image) - - # 11. 
Apply watermark - if self.watermarker is not None: - image = self.watermarker.apply_watermark(image) - elif output_type == "pt": - latents = 1 / self.vae.config.scaling_factor * latents - image = self.vae.decode(latents).sample - has_nsfw_concept = None else: - image = self.decode_latents(latents) + image = latents has_nsfw_concept = None + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # 11. Apply watermark + if output_type == "pil" and self.watermarker is not None: + image = self.watermarker.apply_watermark(image) + # Offload last model to CPU if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: self.final_offload_hook.offload() From de16f64667473ecb9503fc46a688c29ce2585377 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 7 Jun 2023 01:50:53 +0530 Subject: [PATCH 087/199] feat: when using PT 2.0 use LoRAAttnProcessor2_0 for text enc LoRA. (#3691) --- src/diffusers/loaders.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 684a2ba710b9..6ecc701f83e8 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -1168,7 +1168,10 @@ def _load_text_encoder_attn_procs( cross_attention_dim = value_dict["to_k_lora.down.weight"].shape[1] hidden_size = value_dict["to_k_lora.up.weight"].shape[0] - attn_processors[key] = LoRAAttnProcessor( + attn_processor_class = ( + LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor + ) + attn_processors[key] = attn_processor_class( hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, rank=rank, From 10f4ecd17725f6c6b21e47b2987dedb622d96706 Mon Sep 17 00:00:00 2001 From: stano Date: Wed, 7 Jun 2023 00:18:14 +0300 Subject: [PATCH 088/199] Fix the Kandinsky docstring examples (#3695) - use the correct Prior hub model id - use the new names in KandinskyPriorPipelineOutput --- src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py | 8 ++++---- .../pipelines/kandinsky/pipeline_kandinsky_prior.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 0da9d205f8e0..6de9cf4451de 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -41,13 +41,13 @@ >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline >>> import torch - >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-prior") + >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-2-1-prior") >>> pipe_prior.to("cuda") >>> prompt = "red cat, 4k photo" >>> out = pipe_prior(prompt) - >>> image_emb = out.images - >>> zero_image_emb = out.zero_embeds + >>> image_emb = out.image_embeds + >>> negative_image_emb = out.negative_image_embeds >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1") >>> pipe.to("cuda") @@ -55,7 +55,7 @@ >>> image = pipe( ... prompt, ... image_embeds=image_emb, - ... negative_image_embeds=zero_image_emb, + ... negative_image_embeds=negative_image_emb, ... height=768, ... width=768, ... 
num_inference_steps=100, diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index 0c262c57abc0..a0208d5858b1 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -45,8 +45,8 @@ >>> prompt = "red cat, 4k photo" >>> out = pipe_prior(prompt) - >>> image_emb = out.images - >>> zero_image_emb = out.zero_embeds + >>> image_emb = out.image_embeds + >>> negative_image_emb = out.negative_image_embeds >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1") >>> pipe.to("cuda") @@ -54,7 +54,7 @@ >>> image = pipe( ... prompt, ... image_embeds=image_emb, - ... negative_image_embeds=zero_image_emb, + ... negative_image_embeds=negative_image_emb, ... height=768, ... width=768, ... num_inference_steps=100, From 11b3002b48353b33880e385c576888ca5405918a Mon Sep 17 00:00:00 2001 From: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Date: Wed, 7 Jun 2023 05:20:02 +0800 Subject: [PATCH 089/199] Support views batch for panorama (#3632) * support views batch for panorama * add entry for the new argument * format entry for the new argument * add view_batch_size test * fix batch test and a boundary condition * add more docstrings * fix a typos * fix typos * add: entry to the doc about view_batch_size. * Revert "add: entry to the doc about view_batch_size." This reverts commit a36aeaa9edf9b662d09bbfd6e18cbc556ed38187. * add a tip on . --------- Co-authored-by: Sayak Paul --- docs/source/en/api/pipelines/panorama.mdx | 8 ++++ .../pipeline_stable_diffusion_panorama.py | 43 ++++++++++++++----- .../test_stable_diffusion_panorama.py | 20 ++++++++- 3 files changed, 59 insertions(+), 12 deletions(-) diff --git a/docs/source/en/api/pipelines/panorama.mdx b/docs/source/en/api/pipelines/panorama.mdx index e0c7747a0193..044901f24bf3 100644 --- a/docs/source/en/api/pipelines/panorama.mdx +++ b/docs/source/en/api/pipelines/panorama.mdx @@ -52,6 +52,14 @@ image = pipe(prompt).images[0] image.save("dolomites.png") ``` + + +When calling this pipeline, it is possible to specify a `view_batch_size` greater than 1. +For some GPUs with high performance, a higher `view_batch_size` can speed up the generation +at the cost of increased VRAM usage. + + + ## StableDiffusionPanoramaPipeline [[autodoc]] StableDiffusionPanoramaPipeline - __call__ diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 66706c806a81..35d57d048907 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -451,10 +451,11 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype def get_views(self, panorama_height, panorama_width, window_size=64, stride=8): # Here, we define the mappings F_i (see Eq.
7 in the MultiDiffusion paper https://arxiv.org/abs/2302.08113) + # if panorama's height/width < window_size, num_blocks of height/width should be 1 panorama_height /= 8 panorama_width /= 8 - num_blocks_height = (panorama_height - window_size) // stride + 1 - num_blocks_width = (panorama_width - window_size) // stride + 1 + num_blocks_height = (panorama_height - window_size) // stride + 1 if panorama_height > window_size else 1 + num_blocks_width = (panorama_width - window_size) // stride + 1 if panorama_width > window_size else 1 total_num_blocks = int(num_blocks_height * num_blocks_width) views = [] for i in range(total_num_blocks): @@ -474,6 +475,7 @@ def __call__( width: Optional[int] = 2048, num_inference_steps: int = 50, guidance_scale: float = 7.5, + view_batch_size: int = 1, negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: Optional[int] = 1, eta: float = 0.0, @@ -508,6 +510,9 @@ def __call__( Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, usually at the expense of lower image quality. + view_batch_size (`int`, *optional*, defaults to 1): + The batch size to denoise split views. For some GPUs with high performance, a higher view batch size + can speed up the generation at the cost of increased VRAM usage. negative_prompt (`str` or `List[str]`, *optional*): The prompt or prompts not to guide the image generation. If not defined, one has to pass `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is @@ -609,8 +614,11 @@ def __call__( ) # 6. Define panorama grid and initialize views for synthesis. + # prepare batch grid views = self.get_views(height, width) - views_scheduler_status = [copy.deepcopy(self.scheduler.__dict__)] * len(views) + views_batch = [views[i : i + view_batch_size] for i in range(0, len(views), view_batch_size)] + views_scheduler_status = [copy.deepcopy(self.scheduler.__dict__)] * len(views_batch) + count = torch.zeros_like(latents) value = torch.zeros_like(latents) @@ -631,42 +639,55 @@ def __call__( # denoised (latent) crops are then averaged to produce the final latent # for the current timestep via MultiDiffusion. Please see Sec.
4.1 in the # MultiDiffusion paper for more details: https://arxiv.org/abs/2302.08113 - for j, (h_start, h_end, w_start, w_end) in enumerate(views): + # Batch views denoise + for j, batch_view in enumerate(views_batch): + vb_size = len(batch_view) # get the latents corresponding to the current view coordinates - latents_for_view = latents[:, :, h_start:h_end, w_start:w_end] + latents_for_view = torch.cat( + [latents[:, :, h_start:h_end, w_start:w_end] for h_start, h_end, w_start, w_end in batch_view] + ) # rematch block's scheduler status self.scheduler.__dict__.update(views_scheduler_status[j]) # expand the latents if we are doing classifier free guidance latent_model_input = ( - torch.cat([latents_for_view] * 2) if do_classifier_free_guidance else latents_for_view + latents_for_view.repeat_interleave(2, dim=0) + if do_classifier_free_guidance + else latents_for_view ) latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + # repeat prompt_embeds for batch + prompt_embeds_input = torch.cat([prompt_embeds] * vb_size) + # predict the noise residual noise_pred = self.unet( latent_model_input, t, - encoder_hidden_states=prompt_embeds, + encoder_hidden_states=prompt_embeds_input, cross_attention_kwargs=cross_attention_kwargs, ).sample # perform guidance if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_uncond, noise_pred_text = noise_pred[::2], noise_pred[1::2] noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - latents_view_denoised = self.scheduler.step( + latents_denoised_batch = self.scheduler.step( noise_pred, t, latents_for_view, **extra_step_kwargs ).prev_sample # save views scheduler status after sample views_scheduler_status[j] = copy.deepcopy(self.scheduler.__dict__) - value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised - count[:, :, h_start:h_end, w_start:w_end] += 1 + # extract value from batch + for latents_view_denoised, (h_start, h_end, w_start, w_end) in zip( + latents_denoised_batch.chunk(vb_size), batch_view + ): + value[:, :, h_start:h_end, w_start:w_end] += latents_view_denoised + count[:, :, h_start:h_end, w_start:w_end] += 1 # take the MultiDiffusion step. Eq. 5 in MultiDiffusion paper: https://arxiv.org/abs/2302.08113 latents = torch.where(count > 0, value / count, value) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index c8d2bfa8c59d..32541c980a15 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -131,7 +131,7 @@ def test_inference_batch_consistent(self): # override to speed the overall test timing up. 
def test_inference_batch_single_identical(self): - super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=3e-3) + super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=3.25e-3) def test_stable_diffusion_panorama_negative_prompt(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator @@ -152,6 +152,24 @@ def test_stable_diffusion_panorama_negative_prompt(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_panorama_views_batch(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionPanoramaPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = sd_pipe(**inputs, view_batch_size=2) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array([0.6187, 0.5375, 0.4915, 0.4136, 0.4114, 0.4563, 0.5128, 0.4976, 0.4757]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_panorama_euler(self): device = "cpu" # ensure determinism for the device-dependent torch.Generator components = self.get_dummy_components() From 2de9e2df368241cf13f859cf51514cea4e53aed5 Mon Sep 17 00:00:00 2001 From: "Jason C.H" Date: Wed, 7 Jun 2023 05:39:11 +0800 Subject: [PATCH 090/199] Fix from_ckpt for Stable Diffusion 2.x (#3662) --- src/diffusers/loaders.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 6ecc701f83e8..4b7bb69535f7 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -1442,23 +1442,25 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): # TODO: For now we only support stable diffusion stable_unclip = None + model_type = None controlnet = False if pipeline_name == "StableDiffusionControlNetPipeline": - model_type = "FrozenCLIPEmbedder" + # Model type will be inferred from the checkpoint. controlnet = True elif "StableDiffusion" in pipeline_name: - model_type = "FrozenCLIPEmbedder" + # Model type will be inferred from the checkpoint. + pass elif pipeline_name == "StableUnCLIPPipeline": - model_type == "FrozenOpenCLIPEmbedder" + model_type = "FrozenOpenCLIPEmbedder" stable_unclip = "txt2img" elif pipeline_name == "StableUnCLIPImg2ImgPipeline": - model_type == "FrozenOpenCLIPEmbedder" + model_type = "FrozenOpenCLIPEmbedder" stable_unclip = "img2img" elif pipeline_name == "PaintByExamplePipeline": - model_type == "PaintByExample" + model_type = "PaintByExample" elif pipeline_name == "LDMTextToImagePipeline": - model_type == "LDMTextToImage" + model_type = "LDMTextToImage" else: raise ValueError(f"Unhandled pipeline class: {pipeline_name}") From 74fd735eb073eb1d774b1ab4154a0876eb82f055 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 6 Jun 2023 23:47:46 +0200 Subject: [PATCH 091/199] Add draft for lora text encoder scale (#3626) * Add draft for lora text encoder scale * Improve naming * fix: training dreambooth lora script. 
* Apply suggestions from code review * Update examples/dreambooth/train_dreambooth_lora.py * Apply suggestions from code review * Apply suggestions from code review * add lora mixin when fit * add lora mixin when fit * add lora mixin when fit * fix more * fix more --------- Co-authored-by: Sayak Paul --- CONTRIBUTING.md | 8 +-- PHILOSOPHY.md | 20 +++---- README.md | 18 +++--- docs/source/_config.py | 2 +- docs/source/en/training/lora.mdx | 8 +++ src/diffusers/loaders.py | 12 +++- .../alt_diffusion/pipeline_alt_diffusion.py | 16 +++++- .../pipeline_alt_diffusion_img2img.py | 16 +++++- .../controlnet/pipeline_controlnet.py | 16 +++++- .../controlnet/pipeline_controlnet_img2img.py | 16 +++++- .../controlnet/pipeline_controlnet_inpaint.py | 16 +++++- .../pipeline_cycle_diffusion.py | 28 +++++++-- .../pipeline_stable_diffusion.py | 12 ++++ ...line_stable_diffusion_attend_and_excite.py | 10 +++- .../pipeline_stable_diffusion_depth2img.py | 29 ++++++++-- .../pipeline_stable_diffusion_diffedit.py | 13 +++++ .../pipeline_stable_diffusion_img2img.py | 12 ++++ .../pipeline_stable_diffusion_inpaint.py | 12 ++++ ...ipeline_stable_diffusion_inpaint_legacy.py | 29 ++++++++-- .../pipeline_stable_diffusion_k_diffusion.py | 10 +++- ...pipeline_stable_diffusion_model_editing.py | 16 +++++- .../pipeline_stable_diffusion_panorama.py | 16 +++++- .../pipeline_stable_diffusion_pix2pix_zero.py | 10 +++- .../pipeline_stable_diffusion_sag.py | 10 +++- .../pipeline_stable_diffusion_upscale.py | 24 +++++++- .../pipeline_stable_unclip.py | 16 +++++- .../pipeline_stable_unclip_img2img.py | 16 +++++- .../pipeline_text_to_video_synth.py | 16 +++++- tests/models/test_lora_layers.py | 57 ++++++++++++++----- 29 files changed, 406 insertions(+), 78 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5ce48793e9c2..9c5f0a10edd3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -125,14 +125,14 @@ Awesome! Tell us what problem it solved for you. You can open a feature request [here](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=). -#### 2.3 Feedback. +#### 2.3 Feedback. Feedback about the library design and why it is good or not good helps the core maintainers immensely to build a user-friendly library. To understand the philosophy behind the current design philosophy, please have a look [here](https://huggingface.co/docs/diffusers/conceptual/philosophy). If you feel like a certain design choice does not fit with the current design philosophy, please explain why and how it should be changed. If a certain design choice follows the design philosophy too much, hence restricting use cases, explain why and how it should be changed. If a certain design choice is very useful for you, please also leave a note as this is great feedback for future design decisions. You can open an issue about feedback [here](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=). -#### 2.4 Technical questions. +#### 2.4 Technical questions. Technical questions are mainly about why certain code of the library was written in a certain way, or what a certain part of the code does. Please make sure to link to the code in question and please provide detail on why this part of the code is difficult to understand. @@ -394,8 +394,8 @@ passes. You should run the tests impacted by your changes like this: ```bash $ pytest tests/.py ``` - -Before you run the tests, please make sure you install the dependencies required for testing. 
You can do so + +Before you run the tests, please make sure you install the dependencies required for testing. You can do so with this command: ```bash diff --git a/PHILOSOPHY.md b/PHILOSOPHY.md index fbad5948e17e..399cb0bfb47d 100644 --- a/PHILOSOPHY.md +++ b/PHILOSOPHY.md @@ -27,18 +27,18 @@ In a nutshell, Diffusers is built to be a natural extension of PyTorch. Therefor ## Simple over easy -As PyTorch states, **explicit is better than implicit** and **simple is better than complex**. This design philosophy is reflected in multiple parts of the library: +As PyTorch states, **explicit is better than implicit** and **simple is better than complex**. This design philosophy is reflected in multiple parts of the library: - We follow PyTorch's API with methods like [`DiffusionPipeline.to`](https://huggingface.co/docs/diffusers/main/en/api/diffusion_pipeline#diffusers.DiffusionPipeline.to) to let the user handle device management. - Raising concise error messages is preferred to silently correct erroneous input. Diffusers aims at teaching the user, rather than making the library as easy to use as possible. - Complex model vs. scheduler logic is exposed instead of magically handled inside. Schedulers/Samplers are separated from diffusion models with minimal dependencies on each other. This forces the user to write the unrolled denoising loop. However, the separation allows for easier debugging and gives the user more control over adapting the denoising process or switching out diffusion models or schedulers. -- Separately trained components of the diffusion pipeline, *e.g.* the text encoder, the unet, and the variational autoencoder, each have their own model class. This forces the user to handle the interaction between the different model components, and the serialization format separates the model components into different files. However, this allows for easier debugging and customization. Dreambooth or textual inversion training +- Separately trained components of the diffusion pipeline, *e.g.* the text encoder, the unet, and the variational autoencoder, each have their own model class. This forces the user to handle the interaction between the different model components, and the serialization format separates the model components into different files. However, this allows for easier debugging and customization. Dreambooth or textual inversion training is very simple thanks to diffusers' ability to separate single components of the diffusion pipeline. ## Tweakable, contributor-friendly over abstraction -For large parts of the library, Diffusers adopts an important design principle of the [Transformers library](https://github.com/huggingface/transformers), which is to prefer copy-pasted code over hasty abstractions. This design principle is very opinionated and stands in stark contrast to popular design principles such as [Don't repeat yourself (DRY)](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself). +For large parts of the library, Diffusers adopts an important design principle of the [Transformers library](https://github.com/huggingface/transformers), which is to prefer copy-pasted code over hasty abstractions. This design principle is very opinionated and stands in stark contrast to popular design principles such as [Don't repeat yourself (DRY)](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself). In short, just like Transformers does for modeling files, diffusers prefers to keep an extremely low level of abstraction and very self-contained code for pipelines and schedulers. 
-Functions, long code blocks, and even classes can be copied across multiple files which at first can look like a bad, sloppy design choice that makes the library unmaintainable. +Functions, long code blocks, and even classes can be copied across multiple files which at first can look like a bad, sloppy design choice that makes the library unmaintainable. **However**, this design has proven to be extremely successful for Transformers and makes a lot of sense for community-driven, open-source machine learning libraries because: - Machine Learning is an extremely fast-moving field in which paradigms, model architectures, and algorithms are changing rapidly, which therefore makes it very difficult to define long-lasting code abstractions. - Machine Learning practitioners like to be able to quickly tweak existing code for ideation and research and therefore prefer self-contained code over one that contains many abstractions. @@ -47,10 +47,10 @@ Functions, long code blocks, and even classes can be copied across multiple file At Hugging Face, we call this design the **single-file policy** which means that almost all of the code of a certain class should be written in a single, self-contained file. To read more about the philosophy, you can have a look at [this blog post](https://huggingface.co/blog/transformers-design-philosophy). -In diffusers, we follow this philosophy for both pipelines and schedulers, but only partly for diffusion models. The reason we don't follow this design fully for diffusion models is because almost all diffusion pipelines, such +In diffusers, we follow this philosophy for both pipelines and schedulers, but only partly for diffusion models. The reason we don't follow this design fully for diffusion models is because almost all diffusion pipelines, such as [DDPM](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/ddpm), [Stable Diffusion](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/stable_diffusion/overview#stable-diffusion-pipelines), [UnCLIP (Dalle-2)](https://huggingface.co/docs/diffusers/v0.12.0/en/api/pipelines/unclip#overview) and [Imagen](https://imagen.research.google/) all rely on the same diffusion model, the [UNet](https://huggingface.co/docs/diffusers/api/models#diffusers.UNet2DConditionModel). -Great, now you should have generally understood why 🧨 Diffusers is designed the way it is 🤗. +Great, now you should have generally understood why 🧨 Diffusers is designed the way it is 🤗. We try to apply these design principles consistently across the library. Nevertheless, there are some minor exceptions to the philosophy or some unlucky design choices. If you have feedback regarding the design, we would ❤️ to hear it [directly on GitHub](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=). ## Design Philosophy in Details @@ -89,7 +89,7 @@ The following design principles are followed: - Models should by default have the highest precision and lowest performance setting. - To integrate new model checkpoints whose general architecture can be classified as an architecture that already exists in Diffusers, the existing model architecture shall be adapted to make it work with the new checkpoint. One should only create a new file if the model architecture is fundamentally different. - Models should be designed to be easily extendable to future changes. 
This can be achieved by limiting public function arguments, configuration arguments, and "foreseeing" future changes, *e.g.* it is usually better to add `string` "...type" arguments that can easily be extended to new future types instead of boolean `is_..._type` arguments. Only the minimum amount of changes shall be made to existing architectures to make a new model checkpoint work. -- The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and +- The model design is a difficult trade-off between keeping code readable and concise and supporting many model checkpoints. For most parts of the modeling code, classes shall be adapted for new model checkpoints, while there are some exceptions where it is preferred to add new classes to make sure the code is kept concise and readable longterm, such as [UNet blocks](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py) and [Attention processors](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). ### Schedulers @@ -97,9 +97,9 @@ readable longterm, such as [UNet blocks](https://github.com/huggingface/diffuser Schedulers are responsible to guide the denoising process for inference as well as to define a noise schedule for training. They are designed as individual classes with loadable configuration files and strongly follow the **single-file policy**. The following design principles are followed: -- All schedulers are found in [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers). -- Schedulers are **not** allowed to import from large utils files and shall be kept very self-contained. -- One scheduler python file corresponds to one scheduler algorithm (as might be defined in a paper). +- All schedulers are found in [`src/diffusers/schedulers`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/schedulers). +- Schedulers are **not** allowed to import from large utils files and shall be kept very self-contained. +- One scheduler python file corresponds to one scheduler algorithm (as might be defined in a paper). - If schedulers share similar functionalities, we can make use of the `#Copied from` mechanism. - Schedulers all inherit from `SchedulerMixin` and `ConfigMixin`. - Schedulers can be easily swapped out with the [`ConfigMixin.from_config`](https://huggingface.co/docs/diffusers/main/en/api/configuration#diffusers.ConfigMixin.from_config) method as explained in detail [here](./using-diffusers/schedulers.mdx). diff --git a/README.md b/README.md index ab37c629102a..c2a3b04b57a8 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,7 @@ We recommend installing 🤗 Diffusers in a virtual environment from PyPi or Con ### PyTorch With `pip` (official package): - + ```bash pip install --upgrade diffusers[torch] ``` @@ -107,7 +107,7 @@ Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to l | [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques. | ## Contribution -We ❤️ contributions from the open-source community! +We ❤️ contributions from the open-source community! 
If you want to contribute to this library, please check out our [Contribution guide](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md). You can look out for [issues](https://github.com/huggingface/diffusers/issues) you'd like to tackle to contribute to the library. - See [Good first issues](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22good+first+issue%22) for general opportunities to contribute @@ -128,7 +128,7 @@ just hang out ☕. Unconditional Image Generation - DDPM + DDPM google/ddpm-ema-church-256 @@ -185,13 +185,13 @@ just hang out ☕. ## Popular libraries using 🧨 Diffusers -- https://github.com/microsoft/TaskMatrix -- https://github.com/invoke-ai/InvokeAI -- https://github.com/apple/ml-stable-diffusion -- https://github.com/Sanster/lama-cleaner +- https://github.com/microsoft/TaskMatrix +- https://github.com/invoke-ai/InvokeAI +- https://github.com/apple/ml-stable-diffusion +- https://github.com/Sanster/lama-cleaner - https://github.com/IDEA-Research/Grounded-Segment-Anything -- https://github.com/ashawkey/stable-dreamfusion -- https://github.com/deep-floyd/IF +- https://github.com/ashawkey/stable-dreamfusion +- https://github.com/deep-floyd/IF - https://github.com/bentoml/BentoML - https://github.com/bmaltais/kohya_ss - +3000 other amazing GitHub repositories 💪 diff --git a/docs/source/_config.py b/docs/source/_config.py index 9a4818ea8b1e..3d0d73dcb951 100644 --- a/docs/source/_config.py +++ b/docs/source/_config.py @@ -6,4 +6,4 @@ # ! pip install git+https://github.com/huggingface/diffusers.git """ -notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] \ No newline at end of file +notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx index 484b08ce950a..1208178810a5 100644 --- a/docs/source/en/training/lora.mdx +++ b/docs/source/en/training/lora.mdx @@ -260,6 +260,14 @@ pipe.load_lora_weights(lora_model_id) image = pipe("A picture of a sks dog in a bucket", num_inference_steps=25).images[0] ``` + + +If your LoRA parameters involve the UNet as well as the Text Encoder, then passing +`cross_attention_kwargs={"scale": 0.5}` will apply the `scale` value to both the UNet +and the Text Encoder. + + + Note that the use of [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] is preferred to [`~diffusers.loaders.UNet2DConditionLoadersMixin.load_attn_procs`] for loading LoRA parameters. This is because [`~diffusers.loaders.LoraLoaderMixin.load_lora_weights`] can handle the following situations: diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 4b7bb69535f7..6d273de5ca9d 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -852,6 +852,9 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di weight_name = kwargs.pop("weight_name", None) use_safetensors = kwargs.pop("use_safetensors", None) + # set lora scale to a reasonable default + self._lora_scale = 1.0 + if use_safetensors and not is_safetensors_available(): raise ValueError( "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" @@ -953,6 +956,12 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di warn_message = "You have saved the LoRA weights using the old format. 
To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`." warnings.warn(warn_message) + @property + def lora_scale(self) -> float: + # property function that returns the lora scale which can be set at run time by the pipeline. + # if _lora_scale has not been set, return 1 + return self._lora_scale if hasattr(self, "_lora_scale") else 1.0 + @property def text_encoder_lora_attn_procs(self): if hasattr(self, "_text_encoder_lora_attn_procs"): @@ -1000,7 +1009,8 @@ def _modify_text_encoder(self, attn_processors: Dict[str, LoRAAttnProcessor]): # for more detail, see https://github.com/huggingface/diffusers/pull/3490#issuecomment-1555059060 def make_new_forward(old_forward, lora_layer): def new_forward(x): - return old_forward(x) + lora_layer(x) + result = old_forward(x) + self.lora_scale * lora_layer(x) + return result return new_forward diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 8507684cf9b4..64ca06a53a7b 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -24,7 +24,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import deprecate, logging, randn_tensor, replace_example_docstring @@ -52,7 +52,7 @@ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Alt Diffusion. @@ -291,6 +291,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -315,7 +316,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -653,6 +661,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -661,6 +672,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Prepare timesteps diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index f0d4d91ce966..5903f97aca36 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -26,7 +26,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor, replace_example_docstring @@ -95,7 +95,7 @@ def preprocess(image): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image to image generation using Alt Diffusion. @@ -302,6 +302,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -326,7 +327,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -706,6 +714,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -714,6 +725,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Preprocess image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 4ac43377c82a..89398b6f01f9 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -25,7 +25,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -91,7 +91,7 @@ """ -class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionControlNetPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. @@ -291,6 +291,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -315,7 +316,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -838,6 +846,9 @@ def __call__( guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -846,6 +857,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare image diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 6667cf43ce46..0e984d8ae5e3 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -25,7 +25,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -117,7 +117,7 @@ def prepare_image(image): return image -class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. @@ -317,6 +317,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -341,7 +342,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -929,6 +937,9 @@ def __call__( guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -937,6 +948,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare image image = self.image_processor.preprocess(image).to(dtype=torch.float32) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index c20f2d518f96..5ce2fd5543b8 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -26,7 +26,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -223,7 +223,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image=False return mask, masked_image -class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion with ControlNet guidance. @@ -434,6 +434,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -458,7 +459,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -1131,6 +1139,9 @@ def __call__( guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -1139,6 +1150,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py index 6b6df0945943..b8360f512405 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_cycle_diffusion.py @@ -14,7 +14,7 @@ import inspect import warnings -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL @@ -26,7 +26,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor @@ -126,7 +126,7 @@ def compute_noise(scheduler, prev_latents, latents, timestep, noise_pred, eta): return noise -class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class CycleDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image to image generation using Stable Diffusion. @@ -315,6 +315,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -339,7 +340,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -629,6 +637,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -685,6 +694,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: @@ -705,12 +718,16 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, num_images_per_prompt, do_classifier_free_guidance, prompt_embeds=prompt_embeds, + lora_scale=text_encoder_lora_scale, ) source_prompt_embeds = self._encode_prompt( source_prompt, device, num_images_per_prompt, do_classifier_free_guidance, None @@ -764,7 +781,10 @@ def __call__( dim=0, ) concat_noise_pred = self.unet( - concat_latent_model_input, t, encoder_hidden_states=concat_prompt_embeds + concat_latent_model_input, + t, + cross_attention_kwargs=cross_attention_kwargs, + encoder_hidden_states=concat_prompt_embeds, ).sample # perform guidance diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 170002b2514e..f7374452a5f6 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -294,6 +294,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -318,7 +319,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -654,6 +662,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -662,6 +673,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py index 64e8577438ea..f76268463707 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_attend_and_excite.py @@ -23,7 +23,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention from ...schedulers import KarrasDiffusionSchedulers @@ -306,6 +306,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -330,7 +331,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py index 2fd4503a94ce..002014681040 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -15,7 +15,7 @@ import contextlib import inspect import warnings -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL @@ -183,6 +183,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -207,7 +208,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -546,6 +554,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -606,6 +615,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). Examples: @@ -665,6 +678,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -673,6 +689,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Prepare depth mask @@ -711,9 +728,13 @@ def __call__( latent_model_input = torch.cat([latent_model_input, depth_mask], dim=1) # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False)[ - 0 - ] + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py index 3c1ac58bcee4..837811baae64 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_diffedit.py @@ -487,6 +487,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -511,7 +512,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -1007,6 +1015,7 @@ def generate_mask( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompts + (cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None) target_prompt_embeds = self._encode_prompt( target_prompt, device, @@ -1458,6 +1467,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -1466,6 +1478,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess mask diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index 106b6528a982..e9e91b646ed5 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -309,6 +309,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -333,7 +334,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -714,6 +722,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -722,6 +733,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess image diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 9a6f5dbf9480..b07a5555f1c7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -378,6 +378,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -402,7 +403,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -898,6 +906,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -906,6 +917,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. set timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index c549d869e685..147d914fe6c1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -14,7 +14,7 @@ import inspect import warnings -from typing import Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL @@ -304,6 +304,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -328,7 +329,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -575,6 +583,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -639,6 +648,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). Returns: [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: @@ -665,6 +678,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -673,6 +689,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess image and mask @@ -708,9 +725,13 @@ def __call__( latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=prompt_embeds, return_dict=False)[ - 0 - ] + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index 39601ac36c33..ab613dd4dfe4 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -21,7 +21,7 @@ from k_diffusion.sampling import get_sigmas_karras from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...pipelines import DiffusionPipeline from ...schedulers import LMSDiscreteScheduler from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor @@ -210,6 +210,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -234,7 +235,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py index 7bc6b466b46f..1d30b9ee0347 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_model_editing.py @@ -20,7 +20,7 @@ from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import PNDMScheduler from ...schedulers.scheduling_utils import SchedulerMixin @@ -55,7 +55,7 @@ """ -class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionModelEditingPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image model editing using "Editing Implicit Assumptions in Text-to-Image Diffusion Models". @@ -237,6 +237,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -261,7 +262,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -719,6 +727,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -727,6 +738,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 35d57d048907..3826447576d4 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -20,7 +20,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import DDIMScheduler from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring @@ -51,7 +51,7 @@ """ -class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using "MultiDiffusion: Fusing Diffusion Paths for Controlled Image Generation". @@ -199,6 +199,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -223,7 +224,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -586,6 +594,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -594,6 +605,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare timesteps diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py index 3332cc89d96c..75ac4f777756 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_pix2pix_zero.py @@ -30,7 +30,7 @@ ) from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import Attention from ...schedulers import DDIMScheduler, DDPMScheduler, EulerAncestralDiscreteScheduler, LMSDiscreteScheduler @@ -447,6 +447,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -471,7 +472,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py index db3c148f04e5..ba1c0d2b9d49 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_sag.py @@ -21,7 +21,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import is_accelerate_available, is_accelerate_version, logging, randn_tensor, replace_example_docstring @@ -218,6 +218,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -242,7 +243,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
""" + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 4c4f3998cb91..0fda05ea5ec2 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -14,7 +14,7 @@ import inspect import warnings -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import numpy as np import PIL @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import AttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers @@ -60,7 +60,7 @@ def preprocess(image): return image -class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-guided image super-resolution using Stable Diffusion 2. @@ -224,6 +224,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -248,7 +249,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -514,6 +522,7 @@ def __call__( return_dict: bool = True, callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -568,6 +577,10 @@ def __call__( callback_steps (`int`, *optional*, defaults to 1): The frequency at which the `callback` function will be called. If not specified, the callback will be called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). 
Examples: ```py @@ -632,6 +645,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -640,6 +656,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess image @@ -703,6 +720,7 @@ def __call__( latent_model_input, t, encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, class_labels=noise_level, return_dict=False, )[0] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py index fb907f49553c..e36ebfbb70f1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip.py @@ -21,7 +21,7 @@ from transformers.models.clip.modeling_clip import CLIPTextModelOutput from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, PriorTransformer, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers @@ -50,7 +50,7 @@ """ -class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableUnCLIPPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): """ Pipeline for text-to-image generation using stable unCLIP. @@ -338,6 +338,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -362,7 +363,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -856,6 +864,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 8. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt=prompt, device=device, @@ -864,6 +875,7 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 9. 
Prepare image embeddings diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py index 44916049e29f..0187c86b4239 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_unclip_img2img.py @@ -23,7 +23,7 @@ from diffusers.utils.import_utils import is_accelerate_available from ...image_processor import VaeImageProcessor -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.embeddings import get_timestep_embedding from ...schedulers import KarrasDiffusionSchedulers @@ -63,7 +63,7 @@ """ -class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableUnCLIPImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): """ Pipeline for text-guided image to image generation using stable unCLIP. @@ -238,6 +238,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -262,7 +263,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -752,6 +760,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt=prompt, device=device, @@ -760,6 +771,7 @@ def __call__( negative_prompt=negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Encoder input image diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index a44b6f1d0744..8bf4bafa4fe5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -19,7 +19,7 @@ import torch from transformers import CLIPTextModel, CLIPTokenizer -from ...loaders import TextualInversionLoaderMixin +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet3DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -73,7 +73,7 @@ def tensor2vid(video: torch.Tensor, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) - return images -class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class TextToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-video generation. @@ -224,6 +224,7 @@ def _encode_prompt( negative_prompt=None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, ): r""" Encodes the prompt into text encoder hidden states. @@ -248,7 +249,14 @@ def _encode_prompt( Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + if prompt is not None and isinstance(prompt, str): batch_size = 1 elif prompt is not None and isinstance(prompt, list): @@ -591,6 +599,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -599,6 +610,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. 
Prepare timesteps diff --git a/tests/models/test_lora_layers.py b/tests/models/test_lora_layers.py index 2b10955d23f2..aaacf1e68f9f 100644 --- a/tests/models/test_lora_layers.py +++ b/tests/models/test_lora_layers.py @@ -173,6 +173,17 @@ def get_dummy_inputs(self): return noise, input_ids, pipeline_inputs + # copied from: https://colab.research.google.com/gist/sayakpaul/df2ef6e1ae6d8c10a49d859883b10860/scratchpad.ipynb + + def get_dummy_tokens(self): + max_seq_length = 77 + + inputs = torch.randint(2, 56, size=(1, max_seq_length), generator=torch.manual_seed(0)) + + prepared_inputs = {} + prepared_inputs["input_ids"] = inputs + return prepared_inputs + def create_lora_weight_file(self, tmpdirname): _, lora_components = self.get_dummy_components() LoraLoaderMixin.save_lora_weights( @@ -188,7 +199,7 @@ def test_lora_save_load(self): sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) - noise, input_ids, pipeline_inputs = self.get_dummy_inputs() + _, _, pipeline_inputs = self.get_dummy_inputs() original_images = sd_pipe(**pipeline_inputs).images orig_image_slice = original_images[0, -3:, -3:, -1] @@ -214,7 +225,7 @@ def test_lora_save_load_safetensors(self): sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) - noise, input_ids, pipeline_inputs = self.get_dummy_inputs() + _, _, pipeline_inputs = self.get_dummy_inputs() original_images = sd_pipe(**pipeline_inputs).images orig_image_slice = original_images[0, -3:, -3:, -1] @@ -242,7 +253,7 @@ def test_lora_save_load_legacy(self): sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) - noise, input_ids, pipeline_inputs = self.get_dummy_inputs() + _, _, pipeline_inputs = self.get_dummy_inputs() original_images = sd_pipe(**pipeline_inputs).images orig_image_slice = original_images[0, -3:, -3:, -1] @@ -260,16 +271,6 @@ def test_lora_save_load_legacy(self): # Outputs shouldn't match. 
self.assertFalse(torch.allclose(torch.from_numpy(orig_image_slice), torch.from_numpy(lora_image_slice))) - # copied from: https://colab.research.google.com/gist/sayakpaul/df2ef6e1ae6d8c10a49d859883b10860/scratchpad.ipynb - def get_dummy_tokens(self): - max_seq_length = 77 - - inputs = torch.randint(2, 56, size=(1, max_seq_length), generator=torch.manual_seed(0)) - - prepared_inputs = {} - prepared_inputs["input_ids"] = inputs - return prepared_inputs - def test_text_encoder_lora_monkey_patch(self): pipeline_components, _ = self.get_dummy_components() pipe = StableDiffusionPipeline(**pipeline_components) @@ -358,6 +359,34 @@ def test_text_encoder_lora_remove_monkey_patch(self): outputs_without_lora, outputs_without_lora_removed ), "remove lora monkey patch should restore the original outputs" + def test_text_encoder_lora_scale(self): + pipeline_components, lora_components = self.get_dummy_components() + sd_pipe = StableDiffusionPipeline(**pipeline_components) + sd_pipe = sd_pipe.to(torch_device) + sd_pipe.set_progress_bar_config(disable=None) + + _, _, pipeline_inputs = self.get_dummy_inputs() + + with tempfile.TemporaryDirectory() as tmpdirname: + LoraLoaderMixin.save_lora_weights( + save_directory=tmpdirname, + unet_lora_layers=lora_components["unet_lora_layers"], + text_encoder_lora_layers=lora_components["text_encoder_lora_layers"], + ) + self.assertTrue(os.path.isfile(os.path.join(tmpdirname, "pytorch_lora_weights.bin"))) + sd_pipe.load_lora_weights(tmpdirname) + + lora_images = sd_pipe(**pipeline_inputs).images + lora_image_slice = lora_images[0, -3:, -3:, -1] + + lora_images_with_scale = sd_pipe(**pipeline_inputs, cross_attention_kwargs={"scale": 0.5}).images + lora_image_with_scale_slice = lora_images_with_scale[0, -3:, -3:, -1] + + # Outputs shouldn't match. 
+ self.assertFalse( + torch.allclose(torch.from_numpy(lora_image_slice), torch.from_numpy(lora_image_with_scale_slice)) + ) + def test_lora_unet_attn_processors(self): with tempfile.TemporaryDirectory() as tmpdirname: self.create_lora_weight_file(tmpdirname) @@ -416,7 +445,7 @@ def test_lora_save_load_with_xformers(self): sd_pipe = sd_pipe.to(torch_device) sd_pipe.set_progress_bar_config(disable=None) - noise, input_ids, pipeline_inputs = self.get_dummy_inputs() + _, _, pipeline_inputs = self.get_dummy_inputs() # enable XFormers sd_pipe.enable_xformers_memory_efficient_attention() From 12a232efa99d7a8c33f54ae515c5a3d6fc5c8f34 Mon Sep 17 00:00:00 2001 From: Max-We <48329936+Max-We@users.noreply.github.com> Date: Wed, 7 Jun 2023 11:57:10 +0200 Subject: [PATCH 092/199] Fix schedulers zero SNR and rescale classifier free guidance (#3664) * Implement option for rescaling betas to zero terminal SNR * Implement rescale classifier free guidance in pipeline_stable_diffusion.py * focus on DDIM * make style * make style * make style * make style * Apply suggestions from Peter Lin * Apply suggestions from Peter Lin * make style * Apply suggestions from code review * Apply suggestions from code review * make style * make style --------- Co-authored-by: MaxWe00 Co-authored-by: Patrick von Platen --- .../stable_diffusion/stable_diffusion_2.mdx | 58 +++++++++++++++ docs/source/en/api/schedulers/ddim.mdx | 63 +++++++++++++++- examples/text_to_image/train_text_to_image.py | 10 +++ .../text_to_image/train_text_to_image_lora.py | 10 +++ .../alt_diffusion/pipeline_alt_diffusion.py | 25 +++++++ .../pipeline_stable_diffusion.py | 24 ++++++ src/diffusers/schedulers/scheduling_ddim.py | 74 +++++++++++++++++-- .../test_stable_diffusion.py | 21 ++++++ .../test_stable_diffusion_v_pred.py | 23 ++++++ tests/schedulers/test_scheduler_ddim.py | 8 ++ 10 files changed, 310 insertions(+), 6 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx index e922072e4e31..7162626ebbde 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx @@ -71,6 +71,64 @@ image = pipe(prompt, guidance_scale=9, num_inference_steps=25).images[0] image.save("astronaut.png") ``` +#### Experimental: "Common Diffusion Noise Schedules and Sample Steps are Flawed": + +The paper **[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/abs/2305.08891)** +claims that a mismatch between the training and inference settings leads to suboptimal inference generation results for Stable Diffusion. + +The abstract reads as follows: + +*We discover that common diffusion noise schedules do not enforce the last timestep to have zero signal-to-noise ratio (SNR), +and some implementations of diffusion samplers do not start from the last timestep. +Such designs are flawed and do not reflect the fact that the model is given pure Gaussian noise at inference, creating a discrepancy between training and inference. +We show that the flawed design causes real problems in existing implementations. +In Stable Diffusion, it severely limits the model to only generate images with medium brightness and +prevents it from generating very bright and dark samples. 
We propose a few simple fixes:
+- (1) rescale the noise schedule to enforce zero terminal SNR;
+- (2) train the model with v prediction;
+- (3) change the sampler to always start from the last timestep;
+- (4) rescale classifier-free guidance to prevent over-exposure.
+These simple changes ensure the diffusion process is congruent between training and inference and
+allow the model to generate samples more faithful to the original data distribution.*
+
+You can apply all of these changes in `diffusers` when using [`DDIMScheduler`]:
+- (1) rescale the noise schedule to enforce zero terminal SNR;
+```py
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, rescale_betas_zero_snr=True)
+```
+- (2) train the model with v prediction;
+Continue fine-tuning a checkpoint with [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py)
+and `--prediction_type="v_prediction"`.
+- (3) change the sampler to always start from the last timestep;
+```py
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
+```
+- (4) rescale classifier-free guidance to prevent over-exposure.
+```py
+pipe(..., guidance_rescale=0.7)
+```
+
+An example is to use [this checkpoint](https://huggingface.co/ptx0/pseudo-journey-v2)
+which has been fine-tuned using `"v_prediction"` as the prediction type.
+
+The checkpoint can then be run in inference as follows:
+
+```py
+from diffusers import DiffusionPipeline, DDIMScheduler
+
+pipe = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", torch_dtype=torch.float16)
+pipe.scheduler = DDIMScheduler.from_config(
+    pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
+)
+pipe.to("cuda")
+
+prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
+image = pipe(prompt, guidance_rescale=0.7).images[0]
+```
+
+## DDIMScheduler
+[[autodoc]] DDIMScheduler
+
 ### Image Inpainting

 - *Image Inpainting (512x512 resolution)*: [stabilityai/stable-diffusion-2-inpainting](https://huggingface.co/stabilityai/stable-diffusion-2-inpainting) with [`StableDiffusionInpaintPipeline`]
diff --git a/docs/source/en/api/schedulers/ddim.mdx b/docs/source/en/api/schedulers/ddim.mdx
index 51b0cc3e9a09..0db5e4f4e2b5 100644
--- a/docs/source/en/api/schedulers/ddim.mdx
+++ b/docs/source/en/api/schedulers/ddim.mdx
@@ -18,10 +18,71 @@ specific language governing permissions and limitations under the License.

 The abstract of the paper is the following:

-Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, yet they require simulating a Markov chain for many steps to produce a sample. To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. We construct a class of non-Markovian diffusion processes that lead to the same training objective, but whose reverse process can be much faster to sample from.
We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space. +*Denoising diffusion probabilistic models (DDPMs) have achieved high quality image generation without adversarial training, +yet they require simulating a Markov chain for many steps to produce a sample. +To accelerate sampling, we present denoising diffusion implicit models (DDIMs), a more efficient class of iterative implicit probabilistic models +with the same training procedure as DDPMs. In DDPMs, the generative process is defined as the reverse of a Markovian diffusion process. +We construct a class of non-Markovian diffusion processes that lead to the same training objective, but whose reverse process can be much faster to sample from. +We empirically demonstrate that DDIMs can produce high quality samples 10× to 50× faster in terms of wall-clock time compared to DDPMs, allow us to trade off +computation for sample quality, and can perform semantically meaningful image interpolation directly in the latent space.* The original codebase of this paper can be found here: [ermongroup/ddim](https://github.com/ermongroup/ddim). For questions, feel free to contact the author on [tsong.me](https://tsong.me/). +### Experimental: "Common Diffusion Noise Schedules and Sample Steps are Flawed": + +The paper **[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/abs/2305.08891)** +claims that a mismatch between the training and inference settings leads to suboptimal inference generation results for Stable Diffusion. + +The abstract reads as follows: + +*We discover that common diffusion noise schedules do not enforce the last timestep to have zero signal-to-noise ratio (SNR), +and some implementations of diffusion samplers do not start from the last timestep. +Such designs are flawed and do not reflect the fact that the model is given pure Gaussian noise at inference, creating a discrepancy between training and inference. +We show that the flawed design causes real problems in existing implementations. +In Stable Diffusion, it severely limits the model to only generate images with medium brightness and +prevents it from generating very bright and dark samples. We propose a few simple fixes: +- (1) rescale the noise schedule to enforce zero terminal SNR; +- (2) train the model with v prediction; +- (3) change the sampler to always start from the last timestep; +- (4) rescale classifier-free guidance to prevent over-exposure. +These simple changes ensure the diffusion process is congruent between training and inference and +allow the model to generate samples more faithful to the original data distribution.* + +You can apply all of these changes in `diffusers` when using [`DDIMScheduler`]: +- (1) rescale the noise schedule to enforce zero terminal SNR; +```py +pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, rescale_betas_zero_snr=True) +``` +- (2) train the model with v prediction; +Continue fine-tuning a checkpoint with [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) +and `--prediction_type="v_prediction"`. 
+- (3) change the sampler to always start from the last timestep;
+```py
+pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")
+```
+- (4) rescale classifier-free guidance to prevent over-exposure.
+```py
+pipe(..., guidance_rescale=0.7)
+```
+
+An example is to use [this checkpoint](https://huggingface.co/ptx0/pseudo-journey-v2)
+which has been fine-tuned using `"v_prediction"` as the prediction type.
+
+The checkpoint can then be run in inference as follows:
+
+```py
+from diffusers import DiffusionPipeline, DDIMScheduler
+
+pipe = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", torch_dtype=torch.float16)
+pipe.scheduler = DDIMScheduler.from_config(
+    pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
+)
+pipe.to("cuda")
+
+prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"
+image = pipe(prompt, guidance_rescale=0.7).images[0]
+```
+
 ## DDIMScheduler
 [[autodoc]] DDIMScheduler
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index bbf7bf9b85bb..0965c77eea96 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -307,6 +307,12 @@ def parse_args():
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
     parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.",
+    )
     parser.add_argument(
         "--hub_model_id",
         type=str,
@@ -848,6 +854,10 @@ def collate_fn(examples):
                 encoder_hidden_states = text_encoder(batch["input_ids"])[0]

                 # Get the target for loss depending on the prediction type
+                if args.prediction_type is not None:
+                    # set prediction_type of scheduler if defined
+                    noise_scheduler.register_to_config(prediction_type=args.prediction_type)
+
                 if noise_scheduler.config.prediction_type == "epsilon":
                     target = noise
                 elif noise_scheduler.config.prediction_type == "v_prediction":
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index 806637f04c53..30d527efd22d 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -272,6 +272,12 @@ def parse_args():
     parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
     parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
     parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
+    parser.add_argument(
+        "--prediction_type",
+        type=str,
+        default=None,
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`.
If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", + ) parser.add_argument( "--hub_model_id", type=str, @@ -749,6 +755,10 @@ def collate_fn(examples): encoder_hidden_states = text_encoder(batch["input_ids"])[0] # Get the target for loss depending on the prediction type + if args.prediction_type is not None: + # set prediction_type of scheduler if defined + noise_scheduler.register_to_config(prediction_type=args.prediction_type) + if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index 64ca06a53a7b..b79e4f72144b 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -51,6 +51,21 @@ """ +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" @@ -567,6 +582,7 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, ): r""" Function invoked when calling the pipeline for generation. @@ -627,6 +643,11 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + guidance_rescale (`float`, *optional*, defaults to 0.7): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. Examples: @@ -717,6 +738,10 @@ def __call__( noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index f7374452a5f6..8368668ebea7 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -55,6 +55,20 @@ """ +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -568,6 +582,7 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, ): r""" Function invoked when calling the pipeline for generation. @@ -628,6 +643,11 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + guidance_rescale (`float`, *optional*, defaults to 0.7): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. Examples: @@ -718,6 +738,10 @@ def __call__( noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 6b62d8893482..bab6f8acea03 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -76,6 +76,42 @@ def alpha_bar(time_step): return torch.tensor(betas, dtype=torch.float32) +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + class DDIMScheduler(SchedulerMixin, ConfigMixin): """ Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising @@ -122,6 +158,14 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin): (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. sample_max_value (`float`, default `1.0`): the threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, default `"leading"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + rescale_betas_zero_snr (`bool`, default `False`): + whether to rescale the betas to have zero terminal SNR (proposed by https://arxiv.org/pdf/2305.08891.pdf). + This can enable the model to generate very bright and dark samples instead of limiting it to samples with + medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). 
""" _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -143,6 +187,8 @@ def __init__( dynamic_thresholding_ratio: float = 0.995, clip_sample_range: float = 1.0, sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + rescale_betas_zero_snr: bool = False, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -159,6 +205,10 @@ def __init__( else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + self.alphas = 1.0 - self.betas self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) @@ -251,12 +301,26 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic ) self.num_inference_steps = num_inference_steps - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + + # "leading" and "trailing" corresponds to annotation of Table 1. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." 
+ ) + self.timesteps = torch.from_numpy(timesteps).to(device) - self.timesteps += self.config.steps_offset def step( self, diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 87a960c7d1a4..33cc7f638ec2 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -208,6 +208,27 @@ def test_stable_diffusion_k_euler(self): assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_unflawed(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = DDIMScheduler.from_config( + components["scheduler"].config, timestep_spacing="trailing" + ) + sd_pipe = StableDiffusionPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["guidance_rescale"] = 0.7 + inputs["num_inference_steps"] = 10 + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.4736, 0.5405, 0.4705, 0.4955, 0.5675, 0.4812, 0.5310, 0.4967, 0.5064]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + def test_stable_diffusion_long_prompt(self): components = self.get_dummy_components() components["scheduler"] = LMSDiscreteScheduler.from_config(components["scheduler"].config) diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index d1a2c856659f..21862ba6a216 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -384,6 +384,29 @@ def test_stable_diffusion_text2img_pipeline_v_pred_default(self): assert image.shape == (768, 768, 3) assert np.abs(expected_image - image).max() < 9e-1 + def test_stable_diffusion_text2img_pipeline_unflawed(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" + "sd2-text2img/lion_galaxy.npy" + ) + + pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1") + pipe.scheduler = DDIMScheduler.from_config( + pipe.scheduler.config, timestep_scaling="trailing", rescale_betas_zero_snr=True + ) + pipe.to(torch_device) + pipe.enable_attention_slicing() + pipe.set_progress_bar_config(disable=None) + + prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" + + generator = torch.manual_seed(0) + output = pipe(prompt=prompt, guidance_scale=7.5, guidance_rescale=0.7, generator=generator, output_type="np") + image = output.images[0] + + assert image.shape == (768, 768, 3) + assert np.abs(expected_image - image).max() < 5e-1 + def test_stable_diffusion_text2img_pipeline_v_pred_fp16(self): expected_image = load_numpy( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/" diff --git a/tests/schedulers/test_scheduler_ddim.py b/tests/schedulers/test_scheduler_ddim.py index e9c85314d558..156b02b2208e 100644 --- a/tests/schedulers/test_scheduler_ddim.py +++ b/tests/schedulers/test_scheduler_ddim.py @@ -69,6 +69,14 @@ def test_clip_sample(self): for clip_sample in [True, False]: self.check_over_configs(clip_sample=clip_sample) + def 
test_timestep_spacing(self): + for timestep_spacing in ["trailing", "leading"]: + self.check_over_configs(timestep_spacing=timestep_spacing) + + def test_rescale_betas_zero_snr(self): + for rescale_betas_zero_snr in [True, False]: + self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr) + def test_thresholding(self): self.check_over_configs(thresholding=False) for threshold in [0.5, 1.0, 2.0]: From fdec23188afd8821e3c441f01567e156e35a51ff Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Wed, 7 Jun 2023 12:01:35 +0200 Subject: [PATCH 093/199] [Tests] Run slow matrix sequentially (#3500) [tests] Run slow matrix sequentially. --- .github/workflows/push_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 749da4dea81a..567cd5f5b0d4 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -17,6 +17,7 @@ jobs: run_slow_tests: strategy: fail-fast: false + max-parallel: 1 matrix: config: - name: Slow PyTorch CUDA tests on Ubuntu From cd9d0913d90b52d3609427ea2ad5d07b557677dd Mon Sep 17 00:00:00 2001 From: Alex McKinney <44398246+vvvm23@users.noreply.github.com> Date: Wed, 7 Jun 2023 11:07:13 +0100 Subject: [PATCH 094/199] Fixes eval generator init in `train_text_to_image_lora.py` (#3678) --- examples/text_to_image/train_text_to_image_lora.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 30d527efd22d..8a8fdd7ad5d8 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -835,7 +835,9 @@ def collate_fn(examples): pipeline.set_progress_bar_config(disable=True) # run inference - generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + generator = torch.Generator(device=accelerator.device) + if args.seed is not None: + generator = generator.manual_seed(args.seed) images = [] for _ in range(args.num_validation_images): images.append( @@ -891,7 +893,9 @@ def collate_fn(examples): pipeline.unet.load_attn_procs(args.output_dir) # run inference - generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + generator = torch.Generator(device=accelerator.device) + if args.seed is not None: + generator = generator.manual_seed(args.seed) images = [] for _ in range(args.num_validation_images): images.append(pipeline(args.validation_prompt, num_inference_steps=30, generator=generator).images[0]) From 803d6537481fdd931aa3a09e64adaf093549ba67 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 7 Jun 2023 18:33:54 +0200 Subject: [PATCH 095/199] Fix custom releases (#3708) * Fix custom releases * make style --- src/diffusers/utils/dynamic_modules_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/diffusers/utils/dynamic_modules_utils.py b/src/diffusers/utils/dynamic_modules_utils.py index aa6c9c657a87..5b0952f0b514 100644 --- a/src/diffusers/utils/dynamic_modules_utils.py +++ b/src/diffusers/utils/dynamic_modules_utils.py @@ -21,12 +21,12 @@ import re import shutil import sys -from distutils.version import StrictVersion from pathlib import Path from typing import Dict, Optional, Union from urllib import request from huggingface_hub import HfFolder, cached_download, hf_hub_download, model_info +from packaging import version from .. import __version__ from . 
import DIFFUSERS_DYNAMIC_MODULE_NAME, HF_MODULES_CACHE, logging

@@ -43,7 +43,7 @@ def get_diffusers_versions():
     url = "https://pypi.org/pypi/diffusers/json"
     releases = json.loads(request.urlopen(url).read())["releases"].keys()

-    return sorted(releases, key=StrictVersion)
+    return sorted(releases, key=lambda x: version.Version(x))


 def init_hf_modules():

From cd6186907c2dab95ea56228738b003cf62a88d7b Mon Sep 17 00:00:00 2001
From: Kadir Nar
Date: Wed, 7 Jun 2023 19:43:33 +0300
Subject: [PATCH 096/199] [Community] Support StableDiffusionCanvasPipeline
 (#3590)

* added StableDiffusionCanvasPipeline pipeline
* Added utils codes to pipe_utils file.
* make style
* delete mixture.py and Text2ImageRegion class
* make style
* Added the codes to the readme.md file.
* Moved functions from pipeline_utils to mix_canvas
---
 examples/community/README.md         |  38 +-
 examples/community/mixture.py        | 401 ---------------------
 examples/community/mixture_canvas.py | 503 +++++++++++++++++++++++++++
 3 files changed, 539 insertions(+), 403 deletions(-)
 delete mode 100644 examples/community/mixture.py
 create mode 100644 examples/community/mixture_canvas.py

diff --git a/examples/community/README.md b/examples/community/README.md
index 065b46f5410c..17cd34a5182d 100755
--- a/examples/community/README.md
+++ b/examples/community/README.md
@@ -1601,7 +1601,7 @@ pipe_images = mixing_pipeline(

 ![image_mixing_result](https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/boromir_gigachad.png)

-### Stable Diffusion Mixture
+### Stable Diffusion Mixture Tiling

 This pipeline uses the Mixture of Diffusers approach. Refer to the [Mixture](https://arxiv.org/abs/2302.02412) paper for more details.

@@ -1672,4 +1672,38 @@ mask_image = Image.open(BytesIO(response.content)).convert("RGB")
 prompt = "a mecha robot sitting on a bench"
 image = pipe(prompt, image=input_image, mask_image=mask_image, strength=0.75,).images[0]
 image.save('tensorrt_inpaint_mecha_robot.png')
-```
\ No newline at end of file
+```
+
+### Stable Diffusion Mixture Canvas
+
+This pipeline uses the Mixture of Diffusers approach. Refer to the [Mixture](https://arxiv.org/abs/2302.02412) paper for more details.
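+
+Each region is a plain rectangle in pixel space that the pipeline maps onto the VAE latent grid by dividing every bound by 8, which is why all region coordinates must be multiples of 8. A minimal sketch of that mapping (illustration only, not part of the pipeline API; the complete usage example follows below):
+
+```python
+# Pixel-space rectangle (all bounds multiples of 8) and its latent-space footprint
+row_init, row_end, col_init, col_end = 0, 800, 0, 352
+latent_box = (row_init // 8, row_end // 8, col_init // 8, col_end // 8)
+print(latent_box)  # (0, 100, 0, 44)
+```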
+
+```python
+from PIL import Image
+from diffusers import LMSDiscreteScheduler, DiffusionPipeline
+from diffusers.pipelines.pipeline_utils import Image2ImageRegion, Text2ImageRegion, preprocess_image
+
+
+# Load and preprocess guide image
+iic_image = preprocess_image(Image.open("input_image.png").convert("RGB"))
+
+# Create scheduler and model (similar to StableDiffusionPipeline)
+scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
+pipeline = DiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler, custom_pipeline="mixture_canvas")
+pipeline.to("cuda")
+
+# Mixture of Diffusers generation
+output = pipeline(
+    canvas_height=800,
+    canvas_width=352,
+    regions=[
+        Text2ImageRegion(0, 800, 0, 352, guidance_scale=8,
+            prompt=f"best quality, masterpiece, WLOP, sakimichan, art contest winner on pixiv, 8K, intricate details, wet effects, rain drops, ethereal, mysterious, futuristic, UHD, HDR, cinematic lighting, in a beautiful forest, rainy day, award winning, trending on artstation, beautiful confident cheerful young woman, wearing a futuristic sleeveless dress, ultra beautiful detailed eyes, hyper-detailed face, complex, perfect, model,  textured, chiaroscuro, professional make-up, realistic, figure in frame, "),
+        Image2ImageRegion(800-352, 800, 0, 352, reference_image=iic_image, strength=1.0),
+    ],
+    num_inference_steps=100,
+    seed=5525475061,
+)["images"][0]
+```
+![Input_Image](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/input_image.png)
+![mixture_canvas_results](https://huggingface.co/datasets/kadirnar/diffusers_readme_images/resolve/main/canvas.png)
diff --git a/examples/community/mixture.py b/examples/community/mixture.py
deleted file mode 100644
index 845ad76b6a2e..000000000000
--- a/examples/community/mixture.py
+++ /dev/null
@@ -1,401 +0,0 @@
-import inspect
-from copy import deepcopy
-from enum import Enum
-from typing import List, Optional, Tuple, Union
-
-import torch
-from ligo.segments import segment
-from tqdm.auto import tqdm
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-
-from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipeline_utils import DiffusionPipeline
-from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import logging
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-    ```py
-    >>> from diffusers import LMSDiscreteScheduler
-    >>> from mixdiff import StableDiffusionTilingPipeline
-
-    >>> scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
-    >>> pipeline = StableDiffusionTilingPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", scheduler=scheduler)
-    >>> pipeline.to("cuda:0")
-
-    >>> image = pipeline(
-    >>>     prompt=[[
-    >>>         "A charming house in the countryside, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
-    >>>         "A dirt road in the countryside crossing pastures, by jakub rozalski, sunset lighting, elegant, highly detailed, smooth, sharp focus, artstation, stunning masterpiece",
-    >>>         "An old and rusty giant robot lying on a dirt road, by jakub rozalski, dark sunset lighting, elegant, highly detailed, smooth, sharp
focus, artstation, stunning masterpiece" - >>> ]], - >>> tile_height=640, - >>> tile_width=640, - >>> tile_row_overlap=0, - >>> tile_col_overlap=256, - >>> guidance_scale=8, - >>> seed=7178915308, - >>> num_inference_steps=50, - >>> )["images"][0] - ``` -""" - - -def _tile2pixel_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): - """Given a tile row and column numbers returns the range of pixels affected by that tiles in the overall image - - Returns a tuple with: - - Starting coordinates of rows in pixel space - - Ending coordinates of rows in pixel space - - Starting coordinates of columns in pixel space - - Ending coordinates of columns in pixel space - """ - px_row_init = 0 if tile_row == 0 else tile_row * (tile_height - tile_row_overlap) - px_row_end = px_row_init + tile_height - px_col_init = 0 if tile_col == 0 else tile_col * (tile_width - tile_col_overlap) - px_col_end = px_col_init + tile_width - return px_row_init, px_row_end, px_col_init, px_col_end - - -def _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end): - """Translates coordinates in pixel space to coordinates in latent space""" - return px_row_init // 8, px_row_end // 8, px_col_init // 8, px_col_end // 8 - - -def _tile2latent_indices(tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap): - """Given a tile row and column numbers returns the range of latents affected by that tiles in the overall image - - Returns a tuple with: - - Starting coordinates of rows in latent space - - Ending coordinates of rows in latent space - - Starting coordinates of columns in latent space - - Ending coordinates of columns in latent space - """ - px_row_init, px_row_end, px_col_init, px_col_end = _tile2pixel_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - return _pixel2latent_indices(px_row_init, px_row_end, px_col_init, px_col_end) - - -def _tile2latent_exclusive_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap, rows, columns -): - """Given a tile row and column numbers returns the range of latents affected only by that tile in the overall image - - Returns a tuple with: - - Starting coordinates of rows in latent space - - Ending coordinates of rows in latent space - - Starting coordinates of columns in latent space - - Ending coordinates of columns in latent space - """ - row_init, row_end, col_init, col_end = _tile2latent_indices( - tile_row, tile_col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - row_segment = segment(row_init, row_end) - col_segment = segment(col_init, col_end) - # Iterate over the rest of tiles, clipping the region for the current tile - for row in range(rows): - for column in range(columns): - if row != tile_row and column != tile_col: - clip_row_init, clip_row_end, clip_col_init, clip_col_end = _tile2latent_indices( - row, column, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - row_segment = row_segment - segment(clip_row_init, clip_row_end) - col_segment = col_segment - segment(clip_col_init, clip_col_end) - # return row_init, row_end, col_init, col_end - return row_segment[0], row_segment[1], col_segment[0], col_segment[1] - - -class StableDiffusionExtrasMixin: - """Mixin providing additional convenience method to Stable Diffusion pipelines""" - - def decode_latents(self, latents, cpu_vae=False): - """Decodes a given array of latents into pixel space""" - # scale and decode the image latents with vae - 
if cpu_vae: - lat = deepcopy(latents).cpu() - vae = deepcopy(self.vae).cpu() - else: - lat = latents - vae = self.vae - - lat = 1 / 0.18215 * lat - image = vae.decode(lat).sample - - image = (image / 2 + 0.5).clamp(0, 1) - image = image.cpu().permute(0, 2, 3, 1).numpy() - - return self.numpy_to_pil(image) - - -class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixin): - def __init__( - self, - vae: AutoencoderKL, - text_encoder: CLIPTextModel, - tokenizer: CLIPTokenizer, - unet: UNet2DConditionModel, - scheduler: Union[DDIMScheduler, PNDMScheduler], - safety_checker: StableDiffusionSafetyChecker, - feature_extractor: CLIPFeatureExtractor, - ): - super().__init__() - self.register_modules( - vae=vae, - text_encoder=text_encoder, - tokenizer=tokenizer, - unet=unet, - scheduler=scheduler, - safety_checker=safety_checker, - feature_extractor=feature_extractor, - ) - - class SeedTilesMode(Enum): - """Modes in which the latents of a particular tile can be re-seeded""" - - FULL = "full" - EXCLUSIVE = "exclusive" - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[List[str]]], - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - eta: Optional[float] = 0.0, - seed: Optional[int] = None, - tile_height: Optional[int] = 512, - tile_width: Optional[int] = 512, - tile_row_overlap: Optional[int] = 256, - tile_col_overlap: Optional[int] = 256, - guidance_scale_tiles: Optional[List[List[float]]] = None, - seed_tiles: Optional[List[List[int]]] = None, - seed_tiles_mode: Optional[Union[str, List[List[str]]]] = "full", - seed_reroll_regions: Optional[List[Tuple[int, int, int, int, int]]] = None, - cpu_vae: Optional[bool] = False, - ): - r""" - Function to run the diffusion pipeline with tiling support. - - Args: - prompt: either a single string (no tiling) or a list of lists with all the prompts to use (one list for each row of tiles). This will also define the tiling structure. - num_inference_steps: number of diffusions steps. - guidance_scale: classifier-free guidance. - seed: general random seed to initialize latents. - tile_height: height in pixels of each grid tile. - tile_width: width in pixels of each grid tile. - tile_row_overlap: number of overlap pixels between tiles in consecutive rows. - tile_col_overlap: number of overlap pixels between tiles in consecutive columns. - guidance_scale_tiles: specific weights for classifier-free guidance in each tile. - guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used. - seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter. - seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overrriden. - seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles. - cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues. 
- - Examples: - - Returns: - A PIL image with the generated image. - - """ - if not isinstance(prompt, list) or not all(isinstance(row, list) for row in prompt): - raise ValueError(f"`prompt` has to be a list of lists but is {type(prompt)}") - grid_rows = len(prompt) - grid_cols = len(prompt[0]) - if not all(len(row) == grid_cols for row in prompt): - raise ValueError("All prompt rows must have the same number of prompt columns") - if not isinstance(seed_tiles_mode, str) and ( - not isinstance(seed_tiles_mode, list) or not all(isinstance(row, list) for row in seed_tiles_mode) - ): - raise ValueError(f"`seed_tiles_mode` has to be a string or list of lists but is {type(prompt)}") - if isinstance(seed_tiles_mode, str): - seed_tiles_mode = [[seed_tiles_mode for _ in range(len(row))] for row in prompt] - modes = [mode.value for mode in self.SeedTilesMode] - if any(mode not in modes for row in seed_tiles_mode for mode in row): - raise ValueError(f"Seed tiles mode must be one of {modes}") - if seed_reroll_regions is None: - seed_reroll_regions = [] - batch_size = 1 - - # create original noisy latents using the timesteps - height = tile_height + (grid_rows - 1) * (tile_height - tile_row_overlap) - width = tile_width + (grid_cols - 1) * (tile_width - tile_col_overlap) - latents_shape = (batch_size, self.unet.config.in_channels, height // 8, width // 8) - generator = torch.Generator("cuda").manual_seed(seed) - latents = torch.randn(latents_shape, generator=generator, device=self.device) - - # overwrite latents for specific tiles if provided - if seed_tiles is not None: - for row in range(grid_rows): - for col in range(grid_cols): - if (seed_tile := seed_tiles[row][col]) is not None: - mode = seed_tiles_mode[row][col] - if mode == self.SeedTilesMode.FULL.value: - row_init, row_end, col_init, col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - else: - row_init, row_end, col_init, col_end = _tile2latent_exclusive_indices( - row, - col, - tile_width, - tile_height, - tile_row_overlap, - tile_col_overlap, - grid_rows, - grid_cols, - ) - tile_generator = torch.Generator("cuda").manual_seed(seed_tile) - tile_shape = (latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init) - latents[:, :, row_init:row_end, col_init:col_end] = torch.randn( - tile_shape, generator=tile_generator, device=self.device - ) - - # overwrite again for seed reroll regions - for row_init, row_end, col_init, col_end, seed_reroll in seed_reroll_regions: - row_init, row_end, col_init, col_end = _pixel2latent_indices( - row_init, row_end, col_init, col_end - ) # to latent space coordinates - reroll_generator = torch.Generator("cuda").manual_seed(seed_reroll) - region_shape = (latents_shape[0], latents_shape[1], row_end - row_init, col_end - col_init) - latents[:, :, row_init:row_end, col_init:col_end] = torch.randn( - region_shape, generator=reroll_generator, device=self.device - ) - - # Prepare scheduler - accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) - extra_set_kwargs = {} - if accepts_offset: - extra_set_kwargs["offset"] = 1 - self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) - # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas - if isinstance(self.scheduler, LMSDiscreteScheduler): - latents = latents * self.scheduler.sigmas[0] - - # get prompts text embeddings - text_input = [ - [ - self.tokenizer( - col, - padding="max_length", - 
max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="pt", - ) - for col in row - ] - for row in prompt - ] - text_embeddings = [[self.text_encoder(col.input_ids.to(self.device))[0] for col in row] for row in text_input] - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 # TODO: also active if any tile has guidance scale - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - for i in range(grid_rows): - for j in range(grid_cols): - max_length = text_input[i][j].input_ids.shape[-1] - uncond_input = self.tokenizer( - [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt" - ) - uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0] - - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings[i][j] = torch.cat([uncond_embeddings, text_embeddings[i][j]]) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # Mask for tile weights strenght - tile_weights = self._gaussian_weights(tile_width, tile_height, batch_size) - - # Diffusion timesteps - for i, t in tqdm(enumerate(self.scheduler.timesteps)): - # Diffuse each tile - noise_preds = [] - for row in range(grid_rows): - noise_preds_row = [] - for col in range(grid_cols): - px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - tile_latents = latents[:, :, px_row_init:px_row_end, px_col_init:px_col_end] - # expand the latents if we are doing classifier free guidance - latent_model_input = torch.cat([tile_latents] * 2) if do_classifier_free_guidance else tile_latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - # predict the noise residual - noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings[row][col])[ - "sample" - ] - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) - guidance = ( - guidance_scale - if guidance_scale_tiles is None or guidance_scale_tiles[row][col] is None - else guidance_scale_tiles[row][col] - ) - noise_pred_tile = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) - noise_preds_row.append(noise_pred_tile) - noise_preds.append(noise_preds_row) - # Stitch noise predictions for all tiles - noise_pred = torch.zeros(latents.shape, device=self.device) - contributors = torch.zeros(latents.shape, device=self.device) - # Add each tile contribution to overall latents - for row in range(grid_rows): - for col in range(grid_cols): - px_row_init, px_row_end, px_col_init, px_col_end = _tile2latent_indices( - row, col, tile_width, tile_height, tile_row_overlap, tile_col_overlap - ) - noise_pred[:, :, 
px_row_init:px_row_end, px_col_init:px_col_end] += ( - noise_preds[row][col] * tile_weights - ) - contributors[:, :, px_row_init:px_row_end, px_col_init:px_col_end] += tile_weights - # Average overlapping areas with more than 1 contributor - noise_pred /= contributors - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step(noise_pred, t, latents).prev_sample - - # scale and decode the image latents with vae - image = self.decode_latents(latents, cpu_vae) - - return {"images": image} - - def _gaussian_weights(self, tile_width, tile_height, nbatches): - """Generates a gaussian mask of weights for tile contributions""" - import numpy as np - from numpy import exp, pi, sqrt - - latent_width = tile_width // 8 - latent_height = tile_height // 8 - - var = 0.01 - midpoint = (latent_width - 1) / 2 # -1 because index goes from 0 to latent_width - 1 - x_probs = [ - exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var) - for x in range(latent_width) - ] - midpoint = latent_height / 2 - y_probs = [ - exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var) - for y in range(latent_height) - ] - - weights = np.outer(y_probs, x_probs) - return torch.tile(torch.tensor(weights, device=self.device), (nbatches, self.unet.config.in_channels, 1, 1)) diff --git a/examples/community/mixture_canvas.py b/examples/community/mixture_canvas.py new file mode 100644 index 000000000000..40139d1139ad --- /dev/null +++ b/examples/community/mixture_canvas.py @@ -0,0 +1,503 @@ +import re +from copy import deepcopy +from dataclasses import asdict, dataclass +from enum import Enum +from typing import List, Optional, Union + +import numpy as np +import torch +from numpy import exp, pi, sqrt +from torchvision.transforms.functional import resize +from tqdm.auto import tqdm +from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer + +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker +from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler + + +def preprocess_image(image): + from PIL import Image + + """Preprocess an input image + + Same as + https://github.com/huggingface/diffusers/blob/1138d63b519e37f0ce04e027b9f4a3261d27c628/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L44 + """ + w, h = image.size + w, h = (x - x % 32 for x in (w, h)) # resize to integer multiple of 32 + image = image.resize((w, h), resample=Image.LANCZOS) + image = np.array(image).astype(np.float32) / 255.0 + image = image[None].transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + return 2.0 * image - 1.0 + + +@dataclass +class CanvasRegion: + """Class defining a rectangular region in the canvas""" + + row_init: int # Region starting row in pixel space (included) + row_end: int # Region end row in pixel space (not included) + col_init: int # Region starting column in pixel space (included) + col_end: int # Region end column in pixel space (not included) + region_seed: int = None # Seed for random operations in this region + noise_eps: float = 0.0 # Deviation of a zero-mean gaussian noise to be applied over the latents in this region. 
Useful for slightly "rerolling" latents
+
+    def __post_init__(self):
+        # Initialize arguments if not specified
+        if self.region_seed is None:
+            self.region_seed = np.random.randint(9999999999)
+        # Check coordinates are non-negative
+        for coord in [self.row_init, self.row_end, self.col_init, self.col_end]:
+            if coord < 0:
+                raise ValueError(
+                    f"A CanvasRegion must be defined with non-negative indices, found ({self.row_init}, {self.row_end}, {self.col_init}, {self.col_end})"
+                )
+        # Check coordinates are divisible by 8, else we end up with nasty rounding error when mapping to latent space
+        for coord in [self.row_init, self.row_end, self.col_init, self.col_end]:
+            if coord // 8 != coord / 8:
+                raise ValueError(
+                    f"A CanvasRegion must be defined with locations divisible by 8, found ({self.row_init}-{self.row_end}, {self.col_init}-{self.col_end})"
+                )
+        # Check noise eps is non-negative
+        if self.noise_eps < 0:
+            raise ValueError(f"A CanvasRegion must be defined with a non-negative noise eps, found {self.noise_eps}")
+        # Compute coordinates for this region in latent space
+        self.latent_row_init = self.row_init // 8
+        self.latent_row_end = self.row_end // 8
+        self.latent_col_init = self.col_init // 8
+        self.latent_col_end = self.col_end // 8
+
+    @property
+    def width(self):
+        return self.col_end - self.col_init
+
+    @property
+    def height(self):
+        return self.row_end - self.row_init
+
+    def get_region_generator(self, device="cpu"):
+        """Creates a torch.Generator based on the random seed of this region"""
+        # Initialize region generator
+        return torch.Generator(device).manual_seed(self.region_seed)
+
+    @property
+    def __dict__(self):
+        return asdict(self)
+
+
+class MaskModes(Enum):
+    """Modes in which the influence of the diffuser is masked"""
+
+    CONSTANT = "constant"
+    GAUSSIAN = "gaussian"
+    QUARTIC = "quartic"  # See https://en.wikipedia.org/wiki/Kernel_(statistics)
+
+
+@dataclass
+class DiffusionRegion(CanvasRegion):
+    """Abstract class defining a region where some class of diffusion process is acting"""
+
+    pass
+
+
+@dataclass
+class Text2ImageRegion(DiffusionRegion):
+    """Class defining a region where a text guided diffusion process is acting"""
+
+    prompt: str = ""  # Text prompt guiding the diffuser in this region
+    guidance_scale: float = 7.5  # Guidance scale of the diffuser in this region. If None, randomize
+    mask_type: MaskModes = MaskModes.GAUSSIAN.value  # Kind of weight mask applied to this region
+    mask_weight: float = 1.0  # Global weights multiplier of the mask
+    tokenized_prompt = None  # Tokenized prompt
+    encoded_prompt = None  # Encoded prompt
+
+    def __post_init__(self):
+        super().__post_init__()
+        # Mask weight cannot be negative
+        if self.mask_weight < 0:
+            raise ValueError(
+                f"A Text2ImageRegion must be defined with non-negative mask weight, found {self.mask_weight}"
+            )
+        # Mask type must be an actual known mask
+        if self.mask_type not in [e.value for e in MaskModes]:
+            raise ValueError(
+                f"A Text2ImageRegion was defined with mask {self.mask_type}, which is not an accepted mask ({[e.value for e in MaskModes]})"
+            )
+        # Randomize arguments if given as None
+        if self.guidance_scale is None:
+            self.guidance_scale = np.random.randint(5, 30)
+        # Clean prompt
+        self.prompt = re.sub(" +", " ", self.prompt).replace("\n", " ")
+
+    def tokenize_prompt(self, tokenizer):
+        """Tokenizes the prompt for this diffusion region using a given tokenizer"""
+        self.tokenized_prompt = tokenizer(
+            self.prompt,
+            padding="max_length",
+            max_length=tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+
+    def encode_prompt(self, text_encoder, device):
+        """Encodes the previously tokenized prompt for this diffusion region using a given encoder"""
+        assert self.tokenized_prompt is not None, ValueError(
+            "Prompt in diffusion region must be tokenized before encoding"
+        )
+        self.encoded_prompt = text_encoder(self.tokenized_prompt.input_ids.to(device))[0]
+
+
+@dataclass
+class Image2ImageRegion(DiffusionRegion):
+    """Class defining a region where an image guided diffusion process is acting"""
+
+    reference_image: torch.FloatTensor = None
+    strength: float = 0.8  # Strength of the image
+
+    def __post_init__(self):
+        super().__post_init__()
+        if self.reference_image is None:
+            raise ValueError("Must provide a reference image when creating an Image2ImageRegion")
+        if self.strength < 0 or self.strength > 1:
+            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {self.strength}")
+        # Rescale image to region shape
+        self.reference_image = resize(self.reference_image, size=[self.height, self.width])
+
+    def encode_reference_image(self, encoder, device, generator, cpu_vae=False):
+        """Encodes the reference image for this Image2Image region into the latent space"""
+        # Place encoder in CPU or not following the parameter cpu_vae
+        if cpu_vae:
+            # Note here we use mean instead of sample, to avoid moving also generator to CPU, which is troublesome
+            self.reference_latents = encoder.cpu().encode(self.reference_image).latent_dist.mean.to(device)
+        else:
+            self.reference_latents = encoder.encode(self.reference_image.to(device)).latent_dist.sample(
+                generator=generator
+            )
+        self.reference_latents = 0.18215 * self.reference_latents
+
+    @property
+    def __dict__(self):
+        # This class requires special casting to dict because of the reference_image tensor. Otherwise it cannot be cast to JSON
+
+        # Get all basic fields from parent class
+        super_fields = {key: getattr(self, key) for key in DiffusionRegion.__dataclass_fields__.keys()}
+        # Pack other fields
+        return {**super_fields, "reference_image": self.reference_image.cpu().tolist(), "strength": self.strength}
+
+
+class RerollModes(Enum):
+    """Modes in which the reroll regions operate"""
+
+    RESET = "reset"  # Completely reset the random noise in the region
+    EPSILON = "epsilon"  # Alter slightly the latents in the region
+
+
+@dataclass
+class RerollRegion(CanvasRegion):
+    """Class defining a rectangular canvas region in which initial latent noise will be rerolled"""
+
+    reroll_mode: RerollModes = RerollModes.RESET.value
+
+
+@dataclass
+class MaskWeightsBuilder:
+    """Auxiliary class to compute a tensor of weights for a given diffusion region"""
+
+    latent_space_dim: int  # Size of the U-net latent space
+    nbatch: int = 1  # Batch size in the U-net
+
+    def compute_mask_weights(self, region: DiffusionRegion) -> torch.tensor:
+        """Computes a tensor of weights for a given diffusion region"""
+        MASK_BUILDERS = {
+            MaskModes.CONSTANT.value: self._constant_weights,
+            MaskModes.GAUSSIAN.value: self._gaussian_weights,
+            MaskModes.QUARTIC.value: self._quartic_weights,
+        }
+        return MASK_BUILDERS[region.mask_type](region)
+
+    def _constant_weights(self, region: DiffusionRegion) -> torch.tensor:
+        """Computes a tensor of constant weights for a given diffusion region"""
+        latent_width = region.latent_col_end - region.latent_col_init
+        latent_height = region.latent_row_end - region.latent_row_init
+        return torch.ones(self.nbatch, self.latent_space_dim, latent_height, latent_width) * region.mask_weight
+
+    def _gaussian_weights(self, region: DiffusionRegion) -> torch.tensor:
+        """Generates a gaussian mask of weights for tile contributions"""
+        latent_width = region.latent_col_end - region.latent_col_init
+        latent_height = region.latent_row_end - region.latent_row_init
+
+        var = 0.01
+        midpoint = (latent_width - 1) / 2  # -1 because index goes from 0 to latent_width - 1
+        x_probs = [
+            exp(-(x - midpoint) * (x - midpoint) / (latent_width * latent_width) / (2 * var)) / sqrt(2 * pi * var)
+            for x in range(latent_width)
+        ]
+        midpoint = (latent_height - 1) / 2
+        y_probs = [
+            exp(-(y - midpoint) * (y - midpoint) / (latent_height * latent_height) / (2 * var)) / sqrt(2 * pi * var)
+            for y in range(latent_height)
+        ]
+
+        weights = np.outer(y_probs, x_probs) * region.mask_weight
+        return torch.tile(torch.tensor(weights), (self.nbatch, self.latent_space_dim, 1, 1))
+
+    def _quartic_weights(self, region: DiffusionRegion) -> torch.tensor:
+        """Generates a quartic mask of weights for tile contributions
+
+        The quartic kernel has bounded support over the diffusion region, and a smooth decay to the region limits.
+ """ + quartic_constant = 15.0 / 16.0 + + support = (np.array(range(region.latent_col_init, region.latent_col_end)) - region.latent_col_init) / ( + region.latent_col_end - region.latent_col_init - 1 + ) * 1.99 - (1.99 / 2.0) + x_probs = quartic_constant * np.square(1 - np.square(support)) + support = (np.array(range(region.latent_row_init, region.latent_row_end)) - region.latent_row_init) / ( + region.latent_row_end - region.latent_row_init - 1 + ) * 1.99 - (1.99 / 2.0) + y_probs = quartic_constant * np.square(1 - np.square(support)) + + weights = np.outer(y_probs, x_probs) * region.mask_weight + return torch.tile(torch.tensor(weights), (self.nbatch, self.latent_space_dim, 1, 1)) + + +class StableDiffusionCanvasPipeline(DiffusionPipeline): + """Stable Diffusion pipeline that mixes several diffusers in the same canvas""" + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: Union[DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler], + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPFeatureExtractor, + ): + super().__init__() + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + + def decode_latents(self, latents, cpu_vae=False): + """Decodes a given array of latents into pixel space""" + # scale and decode the image latents with vae + if cpu_vae: + lat = deepcopy(latents).cpu() + vae = deepcopy(self.vae).cpu() + else: + lat = latents + vae = self.vae + + lat = 1 / 0.18215 * lat + image = vae.decode(lat).sample + + image = (image / 2 + 0.5).clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).numpy() + + return self.numpy_to_pil(image) + + def get_latest_timestep_img2img(self, num_inference_steps, strength): + """Finds the latest timesteps where an img2img strength does not impose latents anymore""" + # get the original timestep using init_timestep + offset = self.scheduler.config.get("steps_offset", 0) + init_timestep = int(num_inference_steps * (1 - strength)) + offset + init_timestep = min(init_timestep, num_inference_steps) + + t_start = min(max(num_inference_steps - init_timestep + offset, 0), num_inference_steps - 1) + latest_timestep = self.scheduler.timesteps[t_start] + + return latest_timestep + + @torch.no_grad() + def __call__( + self, + canvas_height: int, + canvas_width: int, + regions: List[DiffusionRegion], + num_inference_steps: Optional[int] = 50, + seed: Optional[int] = 12345, + reroll_regions: Optional[List[RerollRegion]] = None, + cpu_vae: Optional[bool] = False, + decode_steps: Optional[bool] = False, + ): + if reroll_regions is None: + reroll_regions = [] + batch_size = 1 + + if decode_steps: + steps_images = [] + + # Prepare scheduler + self.scheduler.set_timesteps(num_inference_steps, device=self.device) + + # Split diffusion regions by their kind + text2image_regions = [region for region in regions if isinstance(region, Text2ImageRegion)] + image2image_regions = [region for region in regions if isinstance(region, Image2ImageRegion)] + + # Prepare text embeddings + for region in text2image_regions: + region.tokenize_prompt(self.tokenizer) + region.encode_prompt(self.text_encoder, self.device) + + # Create original noisy latents using the timesteps + latents_shape = (batch_size, self.unet.config.in_channels, canvas_height // 8, canvas_width // 8) + generator = torch.Generator(self.device).manual_seed(seed) 
+ init_noise = torch.randn(latents_shape, generator=generator, device=self.device) + + # Reset latents in seed reroll regions, if requested + for region in reroll_regions: + if region.reroll_mode == RerollModes.RESET.value: + region_shape = ( + latents_shape[0], + latents_shape[1], + region.latent_row_end - region.latent_row_init, + region.latent_col_end - region.latent_col_init, + ) + init_noise[ + :, + :, + region.latent_row_init : region.latent_row_end, + region.latent_col_init : region.latent_col_end, + ] = torch.randn(region_shape, generator=region.get_region_generator(self.device), device=self.device) + + # Apply epsilon noise to regions: first diffusion regions, then reroll regions + all_eps_rerolls = regions + [r for r in reroll_regions if r.reroll_mode == RerollModes.EPSILON.value] + for region in all_eps_rerolls: + if region.noise_eps > 0: + region_noise = init_noise[ + :, + :, + region.latent_row_init : region.latent_row_end, + region.latent_col_init : region.latent_col_end, + ] + eps_noise = ( + torch.randn( + region_noise.shape, generator=region.get_region_generator(self.device), device=self.device + ) + * region.noise_eps + ) + init_noise[ + :, + :, + region.latent_row_init : region.latent_row_end, + region.latent_col_init : region.latent_col_end, + ] += eps_noise + + # scale the initial noise by the standard deviation required by the scheduler + latents = init_noise * self.scheduler.init_noise_sigma + + # Get unconditional embeddings for classifier free guidance in text2image regions + for region in text2image_regions: + max_length = region.tokenized_prompt.input_ids.shape[-1] + uncond_input = self.tokenizer( + [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt" + ) + uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0] + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + region.encoded_prompt = torch.cat([uncond_embeddings, region.encoded_prompt]) + + # Prepare image latents + for region in image2image_regions: + region.encode_reference_image(self.vae, device=self.device, generator=generator) + + # Prepare mask of weights for each region + mask_builder = MaskWeightsBuilder(latent_space_dim=self.unet.config.in_channels, nbatch=batch_size) + mask_weights = [mask_builder.compute_mask_weights(region).to(self.device) for region in text2image_regions] + + # Diffusion timesteps + for i, t in tqdm(enumerate(self.scheduler.timesteps)): + # Diffuse each region + noise_preds_regions = [] + + # text2image regions + for region in text2image_regions: + region_latents = latents[ + :, + :, + region.latent_row_init : region.latent_row_end, + region.latent_col_init : region.latent_col_end, + ] + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([region_latents] * 2) + # scale model input following scheduler rules + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + # predict the noise residual + noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=region.encoded_prompt)["sample"] + # perform guidance + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred_region = noise_pred_uncond + region.guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_preds_regions.append(noise_pred_region) + + # Merge noise predictions for all tiles + noise_pred = torch.zeros(latents.shape, device=self.device) + contributors = torch.zeros(latents.shape, device=self.device) + # Add each tile contribution to overall latents + for region, noise_pred_region, mask_weights_region in zip( + text2image_regions, noise_preds_regions, mask_weights + ): + noise_pred[ + :, + :, + region.latent_row_init : region.latent_row_end, + region.latent_col_init : region.latent_col_end, + ] += ( + noise_pred_region * mask_weights_region + ) + contributors[ + :, + :, + region.latent_row_init : region.latent_row_end, + region.latent_col_init : region.latent_col_end, + ] += mask_weights_region + # Average overlapping areas with more than 1 contributor + noise_pred /= contributors + noise_pred = torch.nan_to_num( + noise_pred + ) # Replace NaNs by zeros: NaN can appear if a position is not covered by any DiffusionRegion + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + # Image2Image regions: override latents generated by the scheduler + for region in image2image_regions: + influence_step = self.get_latest_timestep_img2img(num_inference_steps, region.strength) + # Only override in the timesteps before the last influence step of the image (given by its strength) + if t > influence_step: + timestep = t.repeat(batch_size) + region_init_noise = init_noise[ + :, + :, + region.latent_row_init : region.latent_row_end, + region.latent_col_init : region.latent_col_end, + ] + region_latents = self.scheduler.add_noise(region.reference_latents, region_init_noise, timestep) + latents[ + :, + :, + region.latent_row_init : region.latent_row_end, + region.latent_col_init : region.latent_col_end, + ] = region_latents + + if decode_steps: + steps_images.append(self.decode_latents(latents, cpu_vae)) + + # scale and decode the image latents with vae + image = self.decode_latents(latents, cpu_vae) + + output = {"images": image} + if 
decode_steps: + output = {**output, "steps_images": steps_images} + return output From 8caa530069afab4a0462218da96231eee19dcc24 Mon Sep 17 00:00:00 2001 From: Mishig Date: Wed, 7 Jun 2023 19:21:16 +0200 Subject: [PATCH 097/199] [doc build] Use secrets (#3707) Co-authored-by: Patrick von Platen --- .github/workflows/build_documentation.yml | 1 + .github/workflows/delete_doc_comment.yml | 13 +++++++------ .github/workflows/delete_doc_comment_trigger.yml | 12 ++++++++++++ .github/workflows/upload_pr_documentation.yml | 16 ++++++++++++++++ 4 files changed, 36 insertions(+), 6 deletions(-) create mode 100644 .github/workflows/delete_doc_comment_trigger.yml create mode 100644 .github/workflows/upload_pr_documentation.yml diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index c833bc0319e1..12ca212c49b4 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -17,3 +17,4 @@ jobs: languages: en ko secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/delete_doc_comment.yml b/.github/workflows/delete_doc_comment.yml index 238dc0bdbabf..8604019d76eb 100644 --- a/.github/workflows/delete_doc_comment.yml +++ b/.github/workflows/delete_doc_comment.yml @@ -1,13 +1,14 @@ -name: Delete dev documentation +name: Delete doc comment on: - pull_request: - types: [ closed ] + workflow_run: + workflows: ["Delete doc comment trigger"] + types: + - completed jobs: delete: uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main - with: - pr_number: ${{ github.event.number }} - package: diffusers + secrets: + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/delete_doc_comment_trigger.yml b/.github/workflows/delete_doc_comment_trigger.yml new file mode 100644 index 000000000000..f87d9bd4dca7 --- /dev/null +++ b/.github/workflows/delete_doc_comment_trigger.yml @@ -0,0 +1,12 @@ +name: Delete doc comment trigger + +on: + pull_request: + types: [ closed ] + + +jobs: + delete: + uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main + with: + pr_number: ${{ github.event.number }} diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml new file mode 100644 index 000000000000..3e8e21b42c8a --- /dev/null +++ b/.github/workflows/upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload PR Documentation + +on: + workflow_run: + workflows: ["Build PR Documentation"] + types: + - completed + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: diffusers + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file From 500a3ff9ef53fafc52a01e94e1d88b1f7c502928 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 7 Jun 2023 07:35:07 -1000 Subject: [PATCH 098/199] [docs] add image processor documentation (#3710) add image processor Co-authored-by: yiyixuxu --- docs/source/en/_toctree.yml | 2 ++ docs/source/en/api/image_processor.mdx | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 docs/source/en/api/image_processor.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 5084299bb0dd..d6c753056044 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -146,6 +146,8 @@ title: Loaders - 
local: api/utilities
    title: Utilities
+  - local: api/image_processor
+    title: Vae Image Processor
   title: Main Classes
- sections:
  - local: api/pipelines/overview
diff --git a/docs/source/en/api/image_processor.mdx b/docs/source/en/api/image_processor.mdx
new file mode 100644
index 000000000000..1964df214f94
--- /dev/null
+++ b/docs/source/en/api/image_processor.mdx
@@ -0,0 +1,22 @@
+
+
+# Image Processor for VAE
+
+The image processor provides a unified API for Stable Diffusion pipelines to prepare their image inputs for VAE encoding, as well as to post-process their outputs once decoded. This includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays.
+
+All pipelines with a VAE image processor accept image inputs in the form of a PIL Image, PyTorch tensor, or NumPy array, and are able to return outputs as a PIL Image, PyTorch tensor, or NumPy array based on the `output_type` argument from the user. Additionally, the user can pass encoded image latents directly to the pipeline, or ask the pipeline to return latents as output with the `output_type = 'pt'` argument. This allows you to take the generated latents from one pipeline and pass them to another pipeline as input without ever having to leave the latent space. It also makes it much easier to use multiple pipelines together by passing PyTorch tensors directly between different pipelines.
+
+
+## VaeImageProcessor
+
+[[autodoc]] image_processor.VaeImageProcessor
\ No newline at end of file

From a06317abeaad843d7a722562c2ec6fbb988b0ad8 Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 7 Jun 2023 19:57:28 +0200
Subject: [PATCH 099/199] [Actions] Fix actions (#3712)

---
 .github/workflows/build_documentation.yml     | 2 +-
 .github/workflows/upload_pr_documentation.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml
index 12ca212c49b4..4331c463097f 100644
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@@ -17,4 +17,4 @@ jobs:
       languages: en ko
     secrets:
       token: ${{ secrets.HUGGINGFACE_PUSH }}
-      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+      hf_token: ${{ secrets.HF_DOC_PUSH }}
diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml
index 3e8e21b42c8a..927670d666b4 100644
--- a/.github/workflows/upload_pr_documentation.yml
+++ b/.github/workflows/upload_pr_documentation.yml
@@ -12,5 +12,5 @@ jobs:
     with:
       package_name: diffusers
     secrets:
-      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+      hf_token: ${{ secrets.HF_DOC_PUSH }}
       comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
\ No newline at end of file

From 79fa94ea8b77845f8bb5bab3e8b7c9e80c81ef18 Mon Sep 17 00:00:00 2001
From: Zachary Mueller
Date: Thu, 8 Jun 2023 10:44:22 -0400
Subject: [PATCH 100/199] Apply deprecations from Accelerate (#3714)

Apply deprecations
---
 examples/controlnet/train_controlnet.py                    | 5 +++--
 examples/custom_diffusion/train_custom_diffusion.py        | 5 +++--
 examples/dreambooth/train_dreambooth.py                    | 5 +++--
 examples/dreambooth/train_dreambooth_lora.py               | 5 +++--
 examples/instruct_pix2pix/train_instruct_pix2pix.py        | 5 +++--
 .../dreambooth_inpaint/train_dreambooth_inpaint.py         | 5 +++--
 .../dreambooth_inpaint/train_dreambooth_inpaint_lora.py    | 5 +++--
 .../textual_inversion/textual_inversion_bf16.py            | 8 ++++----
 .../intel_opts/textual_inversion_dfq/textual_inversion.py  | 6 ++++--
.../research_projects/lora/train_text_to_image_lora.py | 5 +++-- .../mulit_token_textual_inversion/textual_inversion.py | 6 +++--- .../train_multi_subject_dreambooth.py | 7 +++---- .../onnxruntime/text_to_image/train_text_to_image.py | 6 +++--- .../onnxruntime/textual_inversion/textual_inversion.py | 6 +++--- .../unconditional_image_generation/train_unconditional.py | 8 ++++---- examples/text_to_image/train_text_to_image.py | 5 +++-- examples/text_to_image/train_text_to_image_lora.py | 7 ++++--- examples/textual_inversion/textual_inversion.py | 7 +++---- .../unconditional_image_generation/train_unconditional.py | 6 +++--- 19 files changed, 61 insertions(+), 51 deletions(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index b6eb98db711b..13937246911e 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -716,13 +716,14 @@ def collate_fn(examples): def main(args): logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py index 7060f8da4534..e619e037453f 100644 --- a/examples/custom_diffusion/train_custom_diffusion.py +++ b/examples/custom_diffusion/train_custom_diffusion.py @@ -637,13 +637,14 @@ def parse_args(input_args=None): def main(args): logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 97b7f334bc9f..37d97f175cc8 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -771,13 +771,14 @@ def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_atte def main(args): logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 3accc4265787..c3dea4920931 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -653,13 +653,14 @@ def encode_prompt(text_encoder, input_ids, 
attention_mask, text_encoder_use_atte def main(args): logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py index de555a50ba50..d3eb6ceabb8e 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py @@ -387,12 +387,13 @@ def main(): ), ) logging_dir = os.path.join(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py index 5158f9fc3bc0..a9ff5e90ed51 100644 --- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py +++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py @@ -405,13 +405,14 @@ def main(): args = parse_args() logging_dir = Path(args.output_dir, args.logging_dir) - project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with="tensorboard", - logging_dir=logging_dir, project_config=project_config, ) diff --git a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py index 821c66b7237a..b3f37c53eac8 100644 --- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py +++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py @@ -404,13 +404,14 @@ def main(): args = parse_args() logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with="tensorboard", - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py b/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py index 1580cb392e8d..3c2dd396ca47 100644 --- 
a/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py +++ b/examples/research_projects/intel_opts/textual_inversion/textual_inversion_bf16.py @@ -13,7 +13,7 @@ import torch.utils.checkpoint from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import set_seed +from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import create_repo, upload_folder # TODO: remove and import from diffusers.utils when the new version of diffusers is released @@ -363,12 +363,12 @@ def freeze_params(params): def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) - + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, - log_with="tensorboard", - logging_dir=logging_dir, + log_with=args.report_to, + project_config=accelerator_project_config, ) # If passed along, set the training seed now. diff --git a/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py b/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py index 7afb6c67ef8e..2188b17b68d7 100644 --- a/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py +++ b/examples/research_projects/intel_opts/textual_inversion_dfq/textual_inversion.py @@ -12,7 +12,7 @@ import torch.nn.functional as F import torch.utils.checkpoint from accelerate import Accelerator -from accelerate.utils import set_seed +from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import HfFolder, Repository, whoami from neural_compressor.utils import logger from packaging import version @@ -458,11 +458,13 @@ def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with="tensorboard", - logging_dir=logging_dir, + project_config=accelerator_project_config, ) # If passed along, set the training seed now. 
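Editor's note: every hunk in this patch applies the same migration, so a single sketch of the target pattern may help when reading the remaining files. The snippet below is a minimal illustration and not part of the patch itself; the `args` values are hypothetical stand-ins for the argparse namespace the example scripts build, and it assumes an `accelerate` version where `Accelerator` no longer takes `logging_dir` directly and instead reads it from `ProjectConfiguration`.

```py
import os
from types import SimpleNamespace

from accelerate import Accelerator
from accelerate.utils import ProjectConfiguration

# Hypothetical stand-in for the argparse namespace the example scripts parse.
args = SimpleNamespace(
    output_dir="sd-model-finetuned",
    logging_dir="logs",
    checkpoints_total_limit=None,
    gradient_accumulation_steps=1,
    mixed_precision="no",
    report_to="tensorboard",
)

logging_dir = os.path.join(args.output_dir, args.logging_dir)

# logging_dir is no longer passed to Accelerator directly; it is carried by
# ProjectConfiguration together with project_dir and the checkpoint total_limit,
# and the whole config object is handed to Accelerator via project_config.
accelerator_project_config = ProjectConfiguration(
    total_limit=args.checkpoints_total_limit,
    project_dir=args.output_dir,
    logging_dir=logging_dir,
)

accelerator = Accelerator(
    gradient_accumulation_steps=args.gradient_accumulation_steps,
    mixed_precision=args.mixed_precision,
    log_with=args.report_to,
    project_config=accelerator_project_config,
)
```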
diff --git a/examples/research_projects/lora/train_text_to_image_lora.py b/examples/research_projects/lora/train_text_to_image_lora.py index fd516fff9811..e2e0dbaaa4db 100644 --- a/examples/research_projects/lora/train_text_to_image_lora.py +++ b/examples/research_projects/lora/train_text_to_image_lora.py @@ -394,13 +394,14 @@ def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) if args.report_to == "wandb": diff --git a/examples/research_projects/mulit_token_textual_inversion/textual_inversion.py b/examples/research_projects/mulit_token_textual_inversion/textual_inversion.py index 622c51d2e52e..2aaebd729c80 100644 --- a/examples/research_projects/mulit_token_textual_inversion/textual_inversion.py +++ b/examples/research_projects/mulit_token_textual_inversion/textual_inversion.py @@ -549,14 +549,14 @@ def __getitem__(self, i): def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py b/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py index a1016b50e7b2..f24c6057fd8c 100644 --- a/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py +++ b/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py @@ -464,14 +464,13 @@ def __getitem__(self, index): def main(args): logging_dir = Path(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) - + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py index a5bfbbb7b12a..f1e28c7e2caa 100644 --- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py +++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py @@ -422,14 +422,14 @@ def main(): ), ) logging_dir = os.path.join(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + 
total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py index 7ff77118c38e..1013dfa8aba4 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py +++ b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py @@ -562,14 +562,14 @@ def __getitem__(self, i): def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py index 9dc46e864ae8..a42187fadea1 100644 --- a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py +++ b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py @@ -289,14 +289,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def main(args): logging_dir = os.path.join(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, - log_with=args.logger, - logging_dir=logging_dir, + log_with=args.report_to, project_config=accelerator_project_config, ) diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 0965c77eea96..da6d5b8e8c79 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -427,13 +427,14 @@ def main(): ) logging_dir = os.path.join(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 8a8fdd7ad5d8..fe5631c697dd 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -366,15 +366,16 @@ 
def parse_args(): def main(): args = parse_args() - logging_dir = os.path.join(args.output_dir, args.logging_dir) + logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) if args.report_to == "wandb": diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index 0bf76c166835..5a94a4802543 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -566,14 +566,13 @@ def __getitem__(self, i): def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) - + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, - logging_dir=logging_dir, project_config=accelerator_project_config, ) diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index 282f52101a3c..e4dc1601d3c2 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -287,14 +287,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def main(args): logging_dir = os.path.join(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration( + total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir + ) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.logger, - logging_dir=logging_dir, project_config=accelerator_project_config, ) From f523b11a1023a07d5aaa21a68d69ce6d9b71d36e Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 8 Jun 2023 16:48:06 +0200 Subject: [PATCH 101/199] Fix loading if unexpected keys are present (#3720) * Fix loading * make style --- src/diffusers/models/modeling_utils.py | 17 +++++++++++++++++ .../unidiffuser/modeling_text_decoder.py | 2 ++ 2 files changed, 19 insertions(+) diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index c9fabf93253b..f6d6bc5711cd 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -17,6 +17,7 @@ import inspect import itertools import os +import re from functools import partial from typing import Any, Callable, List, Optional, Tuple, Union @@ -162,6 +163,7 @@ class ModelMixin(torch.nn.Module): config_name = CONFIG_NAME _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] _supports_gradient_checkpointing = False + _keys_to_ignore_on_load_unexpected = None def 
__init__(self): super().__init__() @@ -608,6 +610,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize" " those weights or else make sure your checkpoint file is correct." ) + unexpected_keys = [] empty_state_dict = model.state_dict() for param_name, param in state_dict.items(): @@ -615,6 +618,10 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P inspect.signature(set_module_tensor_to_device).parameters.keys() ) + if param_name not in empty_state_dict: + unexpected_keys.append(param_name) + continue + if empty_state_dict[param_name].shape != param.shape: raise ValueError( f"Cannot load {pretrained_model_name_or_path} because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example." @@ -626,6 +633,16 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P ) else: set_module_tensor_to_device(model, param_name, param_device, value=param) + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pat in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warn( + f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + else: # else let accelerate handle loading and dispatching. # Load weights and dispatch according to the device_map # by default the device_map is None and the weights are loaded on the CPU diff --git a/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py index febc8e09e6ab..9b962f6e0656 100644 --- a/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py +++ b/src/diffusers/pipelines/unidiffuser/modeling_text_decoder.py @@ -61,6 +61,8 @@ class UniDiffuserTextDecoder(ModelMixin, ConfigMixin, ModuleUtilsMixin): dot-product/softmax to float() when training with mixed precision. 
""" + _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] + @register_to_config def __init__( self, From c42f6ee43e0408c5fe8a1d3dc3cdeb9eb3a02fa6 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 8 Jun 2023 18:08:49 +0200 Subject: [PATCH 102/199] Post 0.17.0 release (#3721) * Post release * Post release --- examples/controlnet/train_controlnet.py | 2 +- examples/controlnet/train_controlnet_flax.py | 2 +- .../train_custom_diffusion.py | 2 +- examples/dreambooth/train_dreambooth.py | 2 +- examples/dreambooth/train_dreambooth_flax.py | 2 +- examples/dreambooth/train_dreambooth_lora.py | 2 +- .../train_instruct_pix2pix.py | 2 +- examples/text_to_image/train_text_to_image.py | 2 +- .../text_to_image/train_text_to_image_flax.py | 2 +- .../text_to_image/train_text_to_image_lora.py | 2 +- .../textual_inversion/textual_inversion.py | 2 +- .../textual_inversion_flax.py | 2 +- .../train_unconditional.py | 2 +- setup.py | 2 +- src/diffusers/__init__.py | 2 +- src/diffusers/loaders.py | 2 +- src/diffusers/models/cross_attention.py | 34 +++++++++---------- src/diffusers/pipelines/pipeline_utils.py | 9 +---- src/diffusers/utils/hub_utils.py | 2 +- tests/pipelines/test_pipelines.py | 21 ------------ 20 files changed, 35 insertions(+), 63 deletions(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index 13937246911e..fc358783b5f9 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -55,7 +55,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = get_logger(__name__) diff --git a/examples/controlnet/train_controlnet_flax.py b/examples/controlnet/train_controlnet_flax.py index 18d97502c7c4..e28d9c5ff368 100644 --- a/examples/controlnet/train_controlnet_flax.py +++ b/examples/controlnet/train_controlnet_flax.py @@ -59,7 +59,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py index e619e037453f..421532602137 100644 --- a/examples/custom_diffusion/train_custom_diffusion.py +++ b/examples/custom_diffusion/train_custom_diffusion.py @@ -56,7 +56,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = get_logger(__name__) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 37d97f175cc8..7f6c27dec54c 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -58,7 +58,7 @@ import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = get_logger(__name__) diff --git a/examples/dreambooth/train_dreambooth_flax.py b/examples/dreambooth/train_dreambooth_flax.py index 2a2200181d8a..a336c2b787b7 100644 --- a/examples/dreambooth/train_dreambooth_flax.py +++ b/examples/dreambooth/train_dreambooth_flax.py @@ -36,7 +36,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
-check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") # Cache compiled models across invocations of this script. cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache")) diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index c3dea4920931..1c64523caf45 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -64,7 +64,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = get_logger(__name__) diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py index d3eb6ceabb8e..08dd5cd42701 100644 --- a/examples/instruct_pix2pix/train_instruct_pix2pix.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py @@ -51,7 +51,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index da6d5b8e8c79..f4ce1d96a82c 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -52,7 +52,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index 2b2255b46353..07cc250301c2 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -33,7 +33,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index fe5631c697dd..990695fa0ece 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -47,7 +47,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index 5a94a4802543..d5988a9b1707 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -77,7 +77,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = get_logger(__name__) diff --git a/examples/textual_inversion/textual_inversion_flax.py b/examples/textual_inversion/textual_inversion_flax.py index af167c53f275..a7be35c9eff1 100644 --- a/examples/textual_inversion/textual_inversion_flax.py +++ b/examples/textual_inversion/textual_inversion_flax.py @@ -56,7 +56,7 @@ # ------------------------------------------------------------------------------ # Will error if the minimal version of diffusers is not installed. 
Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index e4dc1601d3c2..b07143f8b267 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -28,7 +28,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.17.0.dev0") +check_min_version("0.18.0.dev0") logger = get_logger(__name__, log_level="INFO") diff --git a/setup.py b/setup.py index a972df80b509..9dab0b903f24 100644 --- a/setup.py +++ b/setup.py @@ -227,7 +227,7 @@ def run(self): setup( name="diffusers", - version="0.17.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="0.18.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) description="Diffusers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 402f6eaa749a..a322903ab5d1 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.17.0.dev0" +__version__ = "0.18.0.dev0" from .configuration_utils import ConfigMixin from .utils import ( diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 6d273de5ca9d..5cddb7690e52 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -366,7 +366,7 @@ def save_attn_procs( """ weight_name = weight_name or deprecate( "weights_name", - "0.18.0", + "0.20.0", "`weights_name` is deprecated, please use `weight_name` instead.", take_from=kwargs, ) diff --git a/src/diffusers/models/cross_attention.py b/src/diffusers/models/cross_attention.py index 4fdb2acaabed..44bc156b34cf 100644 --- a/src/diffusers/models/cross_attention.py +++ b/src/diffusers/models/cross_attention.py @@ -29,7 +29,7 @@ deprecate( "cross_attention", - "0.18.0", + "0.20.0", "Importing from cross_attention is deprecated. Please import from diffusers.models.attention_processor instead.", standard_warn=False, ) @@ -40,55 +40,55 @@ class CrossAttention(Attention): def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) + deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." + deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class CrossAttnProcessor(AttnProcessorRename): def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) + deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. 
Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." + deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class LoRACrossAttnProcessor(LoRAAttnProcessor): def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) + deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." + deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class CrossAttnAddedKVProcessor(AttnAddedKVProcessor): def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) + deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." + deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class XFormersCrossAttnProcessor(XFormersAttnProcessor): def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) + deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." + deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class LoRAXFormersCrossAttnProcessor(LoRAXFormersAttnProcessor): def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) + deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." + deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class SlicedCrossAttnProcessor(SlicedAttnProcessor): def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." 
- deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) + deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." + deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) class SlicedCrossAttnAddedKVProcessor(SlicedAttnAddedKVProcessor): def __init__(self, *args, **kwargs): - deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.18.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." - deprecate("cross_attention", "0.18.0", deprecation_message, standard_warn=False) + deprecation_message = f"{self.__class__.__name__} is deprecated and will be removed in `0.20.0`. Please use `from diffusers.models.attention_processor import {''.join(self.__class__.__name__.split('Cross'))} instead." + deprecate("cross_attention", "0.20.0", deprecation_message, standard_warn=False) super().__init__(*args, **kwargs) diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index ed95163087a8..d32c240dedaf 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1099,13 +1099,6 @@ def load_module(name, value): # 8. Instantiate the pipeline model = pipeline_class(**init_kwargs) - - return_cached_folder = kwargs.pop("return_cached_folder", False) - if return_cached_folder: - message = f"Passing `return_cached_folder=True` is deprecated and will be removed in `diffusers=0.18.0`. Please do the following instead: \n 1. Load the cached_folder via `cached_folder={cls}.download({pretrained_model_name_or_path})`. \n 2. Load the pipeline by loading from the cached folder: `pipeline={cls}.from_pretrained(cached_folder)`." 
- deprecate("return_cached_folder", "0.18.0", message) - return model, cached_folder - return model @classmethod @@ -1254,7 +1247,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: # if the whole pipeline is cached we don't have to ping the Hub if revision in DEPRECATED_REVISION_ARGS and version.parse( version.parse(__version__).base_version - ) >= version.parse("0.18.0"): + ) >= version.parse("0.20.0"): warn_deprecated_model_variant( pretrained_model_name, use_auth_token, variant, revision, model_filenames ) diff --git a/src/diffusers/utils/hub_utils.py b/src/diffusers/utils/hub_utils.py index 6e44370a378a..4f0cf00a5c5d 100644 --- a/src/diffusers/utils/hub_utils.py +++ b/src/diffusers/utils/hub_utils.py @@ -280,7 +280,7 @@ def _get_model_file( if ( revision in DEPRECATED_REVISION_ARGS and (weights_name == WEIGHTS_NAME or weights_name == SAFETENSORS_WEIGHTS_NAME) - and version.parse(version.parse(__version__).base_version) >= version.parse("0.18.0") + and version.parse(version.parse(__version__).base_version) >= version.parse("0.20.0") ): try: model_file = hf_hub_download( diff --git a/tests/pipelines/test_pipelines.py b/tests/pipelines/test_pipelines.py index cd3700d0ccdf..7ee2c632e613 100644 --- a/tests/pipelines/test_pipelines.py +++ b/tests/pipelines/test_pipelines.py @@ -244,27 +244,6 @@ def test_force_safetensors_error(self): use_safetensors=True, ) - def test_returned_cached_folder(self): - prompt = "hello" - pipe = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None - ) - _, local_path = StableDiffusionPipeline.from_pretrained( - "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None, return_cached_folder=True - ) - pipe_2 = StableDiffusionPipeline.from_pretrained(local_path) - - pipe = pipe.to(torch_device) - pipe_2 = pipe_2.to(torch_device) - - generator = torch.manual_seed(0) - out = pipe(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - generator = torch.manual_seed(0) - out_2 = pipe_2(prompt, num_inference_steps=2, generator=generator, output_type="numpy").images - - assert np.max(np.abs(out - out_2)) < 1e-3 - def test_download_safetensors(self): with tempfile.TemporaryDirectory() as tmpdirname: # pipeline has Flax weights From 787e65af93e226316ffdb163a45929cba92c5a8d Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Fri, 9 Jun 2023 12:40:57 +0530 Subject: [PATCH 103/199] Minor --- examples/consistency_models/script.sh | 3 + .../train_consistency_distillation.py | 59 ++++++++++--------- 2 files changed, 33 insertions(+), 29 deletions(-) create mode 100644 examples/consistency_models/script.sh diff --git a/examples/consistency_models/script.sh b/examples/consistency_models/script.sh new file mode 100644 index 000000000000..5b5f6b4710e9 --- /dev/null +++ b/examples/consistency_models/script.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +accelerate launch train_consistency_distillation.py --dataset_name="huggan/flowers-102-categories" --resolution=64 --center_crop --random_flip --output_dir="ddpm-ema-flowers-64" --train_batch_size=16 --num_epochs=100 --gradient_accumulation_steps=1 --use_ema --learning_rate=1e-4 --lr_warmup_steps=500 --mixed_precision=no --push_to_hub \ No newline at end of file diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index a14d652c208b..022d4e5a63ef 100644 --- 
a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -75,7 +75,7 @@ def parse_args(): "--model_config_name_or_path", type=str, default=None, - help="The config of the UNet model to train, leave as None to use standard DDPM configuration.", + help="The config of the UNet model to train, leave as None to use standard Consistency configuration.", ) parser.add_argument( "--train_data_dir", @@ -378,31 +378,31 @@ def load_model_hook(models, input_dir): # Initialize the model if args.model_config_name_or_path is None: model = UNet2DModel( - sample_size=args.resolution, - in_channels=3, - out_channels=3, - layers_per_block=2, - block_out_channels=(128, 128, 256, 256, 512, 512), - down_block_types=( - "DownBlock2D", - "DownBlock2D", - "DownBlock2D", - "DownBlock2D", - "AttnDownBlock2D", - "DownBlock2D", - ), - up_block_types=( - "UpBlock2D", - "AttnUpBlock2D", - "UpBlock2D", - "UpBlock2D", - "UpBlock2D", - "UpBlock2D", - ), - ) + sample_size= args.resolution, + in_channels=3, + out_channels=3, + layers_per_block=2, + num_class_embeds=1000, + block_out_channels= [32, 64], + attention_head_dim=8, + down_block_types= [ + "ResnetDownsampleBlock2D", + "AttnDownsampleBlock2D", + ], + up_block_types= [ + "AttnUpsampleBlock2D", + "ResnetUpsampleBlock2D", + ], + resnet_time_scale_shift="scale_shift", + + ) else: config = UNet2DModel.load_config(args.model_config_name_or_path) model = UNet2DModel.from_config(config) + + teacher_model = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32").unet + # print(teacher_model) + # Create EMA for the model. if args.use_ema: @@ -478,11 +478,11 @@ def load_model_hook(models, input_dir): ) def transform_images(examples): - images = [augmentations(image.convert("RGB")) for image in examples["image"]] - return {"input": images} + images = [augmentations(image.convert("RGB")) for image in examples["img"]] + labels = [torch.tensor(label) for label in examples["label"]] + return {"input": images, "labels": labels} logger.info(f"Dataset size: {len(dataset)}") - dataset.set_transform(transform_images) train_dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers @@ -497,8 +497,8 @@ def transform_images(examples): ) # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - model, optimizer, train_dataloader, lr_scheduler + model, optimizer, train_dataloader, lr_scheduler, teacher_model = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler, teacher_model ) if args.use_ema: @@ -563,6 +563,7 @@ def transform_images(examples): continue clean_images = batch["input"] + labels = batch["labels"] # Sample noise that we'll add to the images noise = torch.randn(clean_images.shape).to(clean_images.device) bsz = clean_images.shape[0] @@ -577,7 +578,7 @@ def transform_images(examples): with accelerator.accumulate(model): # Predict the noise residual - model_output = model(noisy_images, timesteps).sample + model_output = model(noisy_images, timesteps, labels).sample if args.prediction_type == "epsilon": loss = F.mse_loss(model_output, noise) # this could have different weights! 
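Editor's note: the next two patches in the consistency-distillation example lean heavily on a small `append_dims` helper to broadcast per-sample timestep/sigma values against image batches. The sketch below mirrors the helper those patches add; the tensor shapes and sigma values are illustrative assumptions, not taken from the training script.

```py
import torch


def append_dims(x, target_dims):
    """Appends singleton dimensions to the end of a tensor until it has target_dims dimensions."""
    dims_to_append = target_dims - x.ndim
    if dims_to_append < 0:
        raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
    return x[(...,) + (None,) * dims_to_append]


# Illustrative shapes: a batch of 4 RGB 64x64 images and one noise level per sample.
clean_images = torch.randn(4, 3, 64, 64)
noise = torch.randn_like(clean_images)
sigmas = torch.tensor([0.5, 1.0, 2.0, 4.0])

# (4,) -> (4, 1, 1, 1) so each sample's noise level broadcasts over C, H, W,
# matching the `clean_images + noise * append_dims(...)` usage in the training loop.
noised_images = clean_images + noise * append_dims(sigmas, clean_images.ndim)
print(noised_images.shape)  # torch.Size([4, 3, 64, 64])
```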
From 6530b17e6af1452dae45966d492eb6f251fa72cd Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Fri, 9 Jun 2023 17:17:25 +0530 Subject: [PATCH 104/199] Add training code --- .../train_consistency_distillation.py | 97 ++++++++++++------- 1 file changed, 64 insertions(+), 33 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index 022d4e5a63ef..da362e3c55b3 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -20,7 +20,7 @@ from tqdm.auto import tqdm import diffusers -from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel +from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel, CMStochasticIterativeScheduler from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available @@ -34,6 +34,29 @@ logger = get_logger(__name__, log_level="INFO") +def append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 0: + raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less") + return x[(...,) + (None,) * dims_to_append] + + +def heun_solver(samples, t, next_t, x0): + dims = samples.ndim + x = samples + denoiser = teacher_denoise_fn(x, t) + + d = (x - denoiser) / append_dims(t, dims) + samples = x + d * append_dims(next_t - t, dims) + denoiser = teacher_denoise_fn(samples, next_t) + + next_d = (samples - denoiser) / append_dims(next_t, dims) + samples = x + (d + next_d) * append_dims((next_t - t) / 2, dims) + + return samples + + def _extract_into_tensor(arr, timesteps, broadcast_shape): """ @@ -401,10 +424,13 @@ def load_model_hook(models, input_dir): model = UNet2DModel.from_config(config) teacher_model = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32").unet + noise_scheduler = CMStochasticIterativeScheduler() + num_scales = 40 + noise_scheduler.set_timesteps(num_scales) # print(teacher_model) - # Create EMA for the model. + # Create EMA for the model, this is the target model in the paper if args.use_ema: ema_model = EMAModel( model.parameters(), @@ -429,17 +455,6 @@ def load_model_hook(models, input_dir): else: raise ValueError("xformers is not available. 
Make sure it is installed correctly") - # Initialize the scheduler - accepts_prediction_type = "prediction_type" in set(inspect.signature(DDPMScheduler.__init__).parameters.keys()) - if accepts_prediction_type: - noise_scheduler = DDPMScheduler( - num_train_timesteps=args.ddpm_num_steps, - beta_schedule=args.ddpm_beta_schedule, - prediction_type=args.prediction_type, - ) - else: - noise_scheduler = DDPMScheduler(num_train_timesteps=args.ddpm_num_steps, beta_schedule=args.ddpm_beta_schedule) - # Initialize the optimizer optimizer = torch.optim.AdamW( model.parameters(), @@ -569,30 +584,46 @@ def transform_images(examples): bsz = clean_images.shape[0] # Sample a random timestep for each image timesteps = torch.randint( - 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=clean_images.device + 0, noise_scheduler.config.num_train_timesteps-1, (bsz,), device=clean_images.device ).long() - - # Add noise to the clean images according to the noise magnitude at each timestep - # (this is the forward diffusion process) - noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps) + timesteps_prev = timesteps + 1 + # TO-DO, we should have an add noise in the scheduler maybe? + noised_image = clean_images + noise*append_dims(timesteps, clean_images.ndims) + scaled_timesteps = noise_scheduler.scale_timesteps(timesteps) + scaled_timesteps_prev = noise_scheduler.scale_timesteps(timesteps_prev) with accelerator.accumulate(model): # Predict the noise residual - model_output = model(noisy_images, timesteps, labels).sample - - if args.prediction_type == "epsilon": - loss = F.mse_loss(model_output, noise) # this could have different weights! - elif args.prediction_type == "sample": - alpha_t = _extract_into_tensor( - noise_scheduler.alphas_cumprod, timesteps, (clean_images.shape[0], 1, 1, 1) - ) - snr_weights = alpha_t / (1 - alpha_t) - loss = snr_weights * F.mse_loss( - model_output, clean_images, reduction="none" - ) # use SNR weighting from distillation paper - loss = loss.mean() - else: - raise ValueError(f"Unsupported prediction type: {args.prediction_type}") + model_output = model(noised_image, scaled_timesteps, class_labels=labels).sample + distiller = noise_scheduler.step( + model_output, timesteps, noised_image, use_noise=False + ).prev_sample + + # Heun Solver to get previous timestep image + samples = noised_image + x = samples + model_output = teacher_model(x, scaled_timesteps, class_labels=labels).sample + teacher_denoiser = noise_scheduler.step( + model_output, timesteps, x, use_noise=False + ).prev_sample + d = (x - teacher_denoiser) / append_dims(scaled_timesteps, x.ndims) + samples = x + d * append_dims(scaled_timesteps_prev - scaled_timesteps, x.ndims) + model_output = teacher_model(samples, scaled_timesteps_prev, class_labels=labels).sample + teacher_denoiser = noise_scheduler.step( + model_output, timesteps_prev, samples, use_noise=False + ).prev_sample + + next_d = (samples - teacher_denoiser) / append_dims(scaled_timesteps_prev, x.ndims) + denoised_image = x + (d + next_d) * append_dims((scaled_timesteps_prev - scaled_timesteps) /2, x.ndims) + + # get output from target model + model_output = ema_model(denoised_image, scaled_timesteps_prev, class_labels=labels).sample + distiller_target = noise_scheduler.step( + model_output, timesteps_prev, denoised_image, use_noise=False + ).prev_sample + + loss = F.mse_loss(distiller, distiller_target) # this could have different weights! 
+ loss = loss.mean() accelerator.backward(loss) From 8f858cb6d4d7f160e0b9378f7e331e2b689068d5 Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Fri, 9 Jun 2023 18:40:47 +0530 Subject: [PATCH 105/199] Fix bugs in training --- .../train_consistency_distillation.py | 46 ++++++++----------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index da362e3c55b3..a591542a831c 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -42,19 +42,7 @@ def append_dims(x, target_dims): return x[(...,) + (None,) * dims_to_append] -def heun_solver(samples, t, next_t, x0): - dims = samples.ndim - x = samples - denoiser = teacher_denoise_fn(x, t) - d = (x - denoiser) / append_dims(t, dims) - samples = x + d * append_dims(next_t - t, dims) - denoiser = teacher_denoise_fn(samples, next_t) - - next_d = (samples - denoiser) / append_dims(next_t, dims) - samples = x + (d + next_d) * append_dims((next_t - t) / 2, dims) - - return samples @@ -424,9 +412,12 @@ def load_model_hook(models, input_dir): model = UNet2DModel.from_config(config) teacher_model = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32").unet + model = model.double() + teacher_model = teacher_model.double() noise_scheduler = CMStochasticIterativeScheduler() num_scales = 40 noise_scheduler.set_timesteps(num_scales) + timesteps = noise_scheduler.timesteps # print(teacher_model) @@ -583,20 +574,21 @@ def transform_images(examples): noise = torch.randn(clean_images.shape).to(clean_images.device) bsz = clean_images.shape[0] # Sample a random timestep for each image - timesteps = torch.randint( - 0, noise_scheduler.config.num_train_timesteps-1, (bsz,), device=clean_images.device + index = torch.randint( + 0, noise_scheduler.config.num_train_timesteps-1, (1,), device=clean_images.device ).long() - timesteps_prev = timesteps + 1 + timestep = timesteps[index] + timestep_prev = timestep + 1 # TO-DO, we should have an add noise in the scheduler maybe? 
- noised_image = clean_images + noise*append_dims(timesteps, clean_images.ndims) - scaled_timesteps = noise_scheduler.scale_timesteps(timesteps) - scaled_timesteps_prev = noise_scheduler.scale_timesteps(timesteps_prev) + noised_image = clean_images + noise*append_dims(timestep, clean_images.ndim) + scaled_timesteps = noise_scheduler.scale_timestep(timestep) + scaled_timesteps_prev = noise_scheduler.scale_timestep(timestep_prev) with accelerator.accumulate(model): # Predict the noise residual model_output = model(noised_image, scaled_timesteps, class_labels=labels).sample distiller = noise_scheduler.step( - model_output, timesteps, noised_image, use_noise=False + model_output, timestep, noised_image, use_noise=False ).prev_sample # Heun Solver to get previous timestep image @@ -604,22 +596,22 @@ def transform_images(examples): x = samples model_output = teacher_model(x, scaled_timesteps, class_labels=labels).sample teacher_denoiser = noise_scheduler.step( - model_output, timesteps, x, use_noise=False + model_output, timestep, x, use_noise=False ).prev_sample - d = (x - teacher_denoiser) / append_dims(scaled_timesteps, x.ndims) - samples = x + d * append_dims(scaled_timesteps_prev - scaled_timesteps, x.ndims) + d = (x - teacher_denoiser) / append_dims(scaled_timesteps, x.ndim) + samples = x + d * append_dims(scaled_timesteps_prev - scaled_timesteps, x.ndim) model_output = teacher_model(samples, scaled_timesteps_prev, class_labels=labels).sample teacher_denoiser = noise_scheduler.step( - model_output, timesteps_prev, samples, use_noise=False + model_output, timestep_prev, samples, use_noise=False ).prev_sample - next_d = (samples - teacher_denoiser) / append_dims(scaled_timesteps_prev, x.ndims) - denoised_image = x + (d + next_d) * append_dims((scaled_timesteps_prev - scaled_timesteps) /2, x.ndims) + next_d = (samples - teacher_denoiser) / append_dims(scaled_timesteps_prev, x.ndim) + denoised_image = x + (d + next_d) * append_dims((scaled_timesteps_prev - scaled_timesteps) /2, x.ndim) # get output from target model - model_output = ema_model(denoised_image, scaled_timesteps_prev, class_labels=labels).sample + model_output = model(denoised_image, scaled_timesteps_prev, class_labels=labels).sample distiller_target = noise_scheduler.step( - model_output, timesteps_prev, denoised_image, use_noise=False + model_output, timestep_prev, denoised_image, use_noise=False ).prev_sample loss = F.mse_loss(distiller, distiller_target) # this could have different weights! From e56b870d2f3acd63dcccf4c770005a87247246d6 Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Fri, 9 Jun 2023 23:08:57 +0530 Subject: [PATCH 106/199] Remove some args, add target model --- .../train_consistency_distillation.py | 38 ++++++++++++------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index a591542a831c..c4712fa4efcc 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -235,16 +235,6 @@ def parse_args(): "and an Nvidia Ampere GPU." 
), ) - parser.add_argument( - "--prediction_type", - type=str, - default="epsilon", - choices=["epsilon", "sample"], - help="Whether the model should predict the 'epsilon'/noise error or directly the reconstructed image 'x0'.", - ) - parser.add_argument("--ddpm_num_steps", type=int, default=1000) - parser.add_argument("--ddpm_num_inference_steps", type=int, default=1000) - parser.add_argument("--ddpm_beta_schedule", type=str, default="linear") parser.add_argument( "--checkpointing_steps", type=int, @@ -406,13 +396,34 @@ def load_model_hook(models, input_dir): ], resnet_time_scale_shift="scale_shift", + ) + target_model = UNet2DModel( + sample_size= args.resolution, + in_channels=3, + out_channels=3, + layers_per_block=2, + num_class_embeds=1000, + block_out_channels= [32, 64], + attention_head_dim=8, + down_block_types= [ + "ResnetDownsampleBlock2D", + "AttnDownsampleBlock2D", + ], + up_block_types= [ + "AttnUpsampleBlock2D", + "ResnetUpsampleBlock2D", + ], + resnet_time_scale_shift="scale_shift", + ) else: config = UNet2DModel.load_config(args.model_config_name_or_path) model = UNet2DModel.from_config(config) + target_model = UNet2DModel.from_config(config) teacher_model = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32").unet model = model.double() + target_model = target_model.double() teacher_model = teacher_model.double() noise_scheduler = CMStochasticIterativeScheduler() num_scales = 40 @@ -503,8 +514,8 @@ def transform_images(examples): ) # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, lr_scheduler, teacher_model = accelerator.prepare( - model, optimizer, train_dataloader, lr_scheduler, teacher_model + model, optimizer, train_dataloader, lr_scheduler, teacher_model, target_model, ema_model = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler, teacher_model, target_model, ema_model ) if args.use_ema: @@ -583,6 +594,7 @@ def transform_images(examples): noised_image = clean_images + noise*append_dims(timestep, clean_images.ndim) scaled_timesteps = noise_scheduler.scale_timestep(timestep) scaled_timesteps_prev = noise_scheduler.scale_timestep(timestep_prev) + ema_model.copy_to(target_model.parameters()) with accelerator.accumulate(model): # Predict the noise residual @@ -609,7 +621,7 @@ def transform_images(examples): denoised_image = x + (d + next_d) * append_dims((scaled_timesteps_prev - scaled_timesteps) /2, x.ndim) # get output from target model - model_output = model(denoised_image, scaled_timesteps_prev, class_labels=labels).sample + model_output = target_model(denoised_image, scaled_timesteps_prev, class_labels=labels).sample distiller_target = noise_scheduler.step( model_output, timestep_prev, denoised_image, use_noise=False ).prev_sample From 05361960f252b4d2ff0432a2bbc7b893cf338baf Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Fri, 9 Jun 2023 08:27:02 -1000 Subject: [PATCH 107/199] remove seed (#3734) * remove seed * style --------- Co-authored-by: yiyixuxu --- docs/source/en/api/pipelines/kandinsky.mdx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index b94937e4af85..1cac9810980f 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -60,8 +60,7 @@ returns both the image embeddings corresponding to the prompt and negative/uncon embeddings corresponding to an empty string. 
```py -generator = torch.Generator(device="cuda").manual_seed(12) -image_embeds, negative_image_embeds = pipe_prior(prompt, generator=generator).to_tuple() +image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple() ``` @@ -78,7 +77,7 @@ of the prior by a factor of 2. prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting" negative_prompt = "low quality, bad quality" -image_embeds, negative_image_embeds = pipe_prior(prompt, negative_prompt, generator=generator).to_tuple() +image_embeds, negative_image_embeds = pipe_prior(prompt, negative_prompt, guidance_scale=1.0).to_tuple() ``` @@ -89,7 +88,9 @@ in case you are using a customized negative prompt, that you should pass this on with `negative_prompt=negative_prompt`: ```py -image = t2i_pipe(prompt, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds).images[0] +image = t2i_pipe( + prompt, image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, height=768, width=768 +).images[0] image.save("cheeseburger_monster.png") ``` @@ -160,8 +161,7 @@ pipe.to("cuda") prompt = "A fantasy landscape, Cinematic lighting" negative_prompt = "low quality, bad quality" -generator = torch.Generator(device="cuda").manual_seed(30) -image_embeds, negative_image_embeds = pipe_prior(prompt, negative_prompt, generator=generator).to_tuple() +image_embeds, negative_image_embeds = pipe_prior(prompt, negative_prompt).to_tuple() out = pipe( prompt, From 27af55d1b4e5f27771d1b91e59abe3004e8733fd Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 10 Jun 2023 16:56:41 +0200 Subject: [PATCH 108/199] build docs --- .github/workflows/build_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 4331c463097f..4e797fa31a51 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -5,6 +5,7 @@ on: branches: - main - doc-builder* + - v*-release - v*-patch jobs: From e891b00dfceb2d3f8ca3d1face017fe91f7d48d8 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 10 Jun 2023 16:58:59 +0200 Subject: [PATCH 109/199] build docs --- .github/workflows/build_documentation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 4e797fa31a51..e8741c81f5bb 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -15,7 +15,7 @@ jobs: commit_sha: ${{ github.sha }} package: diffusers notebook_folder: diffusers_doc - languages: en ko + languages: en ko zh secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} hf_token: ${{ secrets.HF_DOC_PUSH }} From abbfe4b5b7a9508169e69476a951a1a6dd86c927 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sat, 10 Jun 2023 17:54:55 +0200 Subject: [PATCH 110/199] fix zh --- docs/source/ko/_toctree.yml | 2 +- docs/source/zh/_toctree.yml | 266 ------------------------------------ 2 files changed, 1 insertion(+), 267 deletions(-) diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 2fec3af66525..3724c238f218 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -45,4 +45,4 @@ title: MPS - local: optimization/habana title: Habana Gaudi - title: 최적화/특수 하드웨어 \ No newline at end of file + title: 최적화/특수 하드웨어 diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index 58f6ac09faef..895273d851f3 
100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -3,272 +3,6 @@ title: 🧨 Diffusers - local: quicktour title: 快速入门 - - local: stable_diffusion - title: Effective and efficient diffusion - local: installation title: 安装 title: 开始 -- sections: - - local: tutorials/tutorial_overview - title: Overview - - local: using-diffusers/write_own_pipeline - title: Understanding models and schedulers - - local: tutorials/basic_training - title: Train a diffusion model - title: Tutorials -- sections: - - sections: - - local: using-diffusers/loading_overview - title: Overview - - local: using-diffusers/loading - title: Load pipelines, models, and schedulers - - local: using-diffusers/schedulers - title: Load and compare different schedulers - - local: using-diffusers/custom_pipeline_overview - title: Load community pipelines - - local: using-diffusers/kerascv - title: Load KerasCV Stable Diffusion checkpoints - title: Loading & Hub - - sections: - - local: using-diffusers/pipeline_overview - title: Overview - - local: using-diffusers/unconditional_image_generation - title: Unconditional image generation - - local: using-diffusers/conditional_image_generation - title: Text-to-image generation - - local: using-diffusers/img2img - title: Text-guided image-to-image - - local: using-diffusers/inpaint - title: Text-guided image-inpainting - - local: using-diffusers/depth2img - title: Text-guided depth-to-image - - local: using-diffusers/reusing_seeds - title: Improve image quality with deterministic generation - - local: using-diffusers/reproducibility - title: Create reproducible pipelines - - local: using-diffusers/custom_pipeline_examples - title: Community pipelines - - local: using-diffusers/contribute_pipeline - title: How to contribute a community pipeline - - local: using-diffusers/using_safetensors - title: Using safetensors - - local: using-diffusers/stable_diffusion_jax_how_to - title: Stable Diffusion in JAX/Flax - - local: using-diffusers/weighted_prompts - title: Weighting Prompts - title: Pipelines for Inference - - sections: - - local: training/overview - title: Overview - - local: training/unconditional_training - title: Unconditional image generation - - local: training/text_inversion - title: Textual Inversion - - local: training/dreambooth - title: DreamBooth - - local: training/text2image - title: Text-to-image - - local: training/lora - title: Low-Rank Adaptation of Large Language Models (LoRA) - - local: training/controlnet - title: ControlNet - - local: training/instructpix2pix - title: InstructPix2Pix Training - - local: training/custom_diffusion - title: Custom Diffusion - title: Training - - sections: - - local: using-diffusers/rl - title: Reinforcement Learning - - local: using-diffusers/audio - title: Audio - - local: using-diffusers/other-modalities - title: Other Modalities - title: Taking Diffusers Beyond Images - title: Using Diffusers -- sections: - - local: optimization/opt_overview - title: Overview - - local: optimization/fp16 - title: Memory and Speed - - local: optimization/torch2.0 - title: Torch2.0 support - - local: optimization/xformers - title: xFormers - - local: optimization/onnx - title: ONNX - - local: optimization/open_vino - title: OpenVINO - - local: optimization/coreml - title: Core ML - - local: optimization/mps - title: MPS - - local: optimization/habana - title: Habana Gaudi - - local: optimization/tome - title: Token Merging - title: Optimization/Special Hardware -- sections: - - local: conceptual/philosophy - title: Philosophy - 
- local: using-diffusers/controlling_generation - title: Controlled generation - - local: conceptual/contribution - title: How to contribute? - - local: conceptual/ethical_guidelines - title: Diffusers' Ethical Guidelines - - local: conceptual/evaluation - title: Evaluating Diffusion Models - title: Conceptual Guides -- sections: - - sections: - - local: api/models - title: Models - - local: api/diffusion_pipeline - title: Diffusion Pipeline - - local: api/logging - title: Logging - - local: api/configuration - title: Configuration - - local: api/outputs - title: Outputs - - local: api/loaders - title: Loaders - title: Main Classes - - sections: - - local: api/pipelines/overview - title: Overview - - local: api/pipelines/alt_diffusion - title: AltDiffusion - - local: api/pipelines/audio_diffusion - title: Audio Diffusion - - local: api/pipelines/audioldm - title: AudioLDM - - local: api/pipelines/cycle_diffusion - title: Cycle Diffusion - - local: api/pipelines/dance_diffusion - title: Dance Diffusion - - local: api/pipelines/ddim - title: DDIM - - local: api/pipelines/ddpm - title: DDPM - - local: api/pipelines/dit - title: DiT - - local: api/pipelines/if - title: IF - - local: api/pipelines/latent_diffusion - title: Latent Diffusion - - local: api/pipelines/paint_by_example - title: PaintByExample - - local: api/pipelines/pndm - title: PNDM - - local: api/pipelines/repaint - title: RePaint - - local: api/pipelines/stable_diffusion_safe - title: Safe Stable Diffusion - - local: api/pipelines/score_sde_ve - title: Score SDE VE - - local: api/pipelines/semantic_stable_diffusion - title: Semantic Guidance - - local: api/pipelines/spectrogram_diffusion - title: "Spectrogram Diffusion" - - sections: - - local: api/pipelines/stable_diffusion/overview - title: Overview - - local: api/pipelines/stable_diffusion/text2img - title: Text-to-Image - - local: api/pipelines/stable_diffusion/img2img - title: Image-to-Image - - local: api/pipelines/stable_diffusion/inpaint - title: Inpaint - - local: api/pipelines/stable_diffusion/depth2img - title: Depth-to-Image - - local: api/pipelines/stable_diffusion/image_variation - title: Image-Variation - - local: api/pipelines/stable_diffusion/upscale - title: Super-Resolution - - local: api/pipelines/stable_diffusion/latent_upscale - title: Stable-Diffusion-Latent-Upscaler - - local: api/pipelines/stable_diffusion/pix2pix - title: InstructPix2Pix - - local: api/pipelines/stable_diffusion/attend_and_excite - title: Attend and Excite - - local: api/pipelines/stable_diffusion/pix2pix_zero - title: Pix2Pix Zero - - local: api/pipelines/stable_diffusion/self_attention_guidance - title: Self-Attention Guidance - - local: api/pipelines/stable_diffusion/panorama - title: MultiDiffusion Panorama - - local: api/pipelines/stable_diffusion/controlnet - title: Text-to-Image Generation with ControlNet Conditioning - - local: api/pipelines/stable_diffusion/model_editing - title: Text-to-Image Model Editing - title: Stable Diffusion - - local: api/pipelines/stable_diffusion_2 - title: Stable Diffusion 2 - - local: api/pipelines/stable_unclip - title: Stable unCLIP - - local: api/pipelines/stochastic_karras_ve - title: Stochastic Karras VE - - local: api/pipelines/text_to_video - title: Text-to-Video - - local: api/pipelines/text_to_video_zero - title: Text-to-Video Zero - - local: api/pipelines/unclip - title: UnCLIP - - local: api/pipelines/latent_diffusion_uncond - title: Unconditional Latent Diffusion - - local: api/pipelines/versatile_diffusion - title: Versatile Diffusion 
- - local: api/pipelines/vq_diffusion - title: VQ Diffusion - title: Pipelines - - sections: - - local: api/schedulers/overview - title: Overview - - local: api/schedulers/ddim - title: DDIM - - local: api/schedulers/ddim_inverse - title: DDIMInverse - - local: api/schedulers/ddpm - title: DDPM - - local: api/schedulers/deis - title: DEIS - - local: api/schedulers/dpm_discrete - title: DPM Discrete Scheduler - - local: api/schedulers/dpm_discrete_ancestral - title: DPM Discrete Scheduler with ancestral sampling - - local: api/schedulers/euler_ancestral - title: Euler Ancestral Scheduler - - local: api/schedulers/euler - title: Euler scheduler - - local: api/schedulers/heun - title: Heun Scheduler - - local: api/schedulers/ipndm - title: IPNDM - - local: api/schedulers/lms_discrete - title: Linear Multistep - - local: api/schedulers/multistep_dpm_solver - title: Multistep DPM-Solver - - local: api/schedulers/pndm - title: PNDM - - local: api/schedulers/repaint - title: RePaint Scheduler - - local: api/schedulers/singlestep_dpm_solver - title: Singlestep DPM-Solver - - local: api/schedulers/stochastic_karras_ve - title: Stochastic Kerras VE - - local: api/schedulers/unipc - title: UniPCMultistepScheduler - - local: api/schedulers/score_sde_ve - title: VE-SDE - - local: api/schedulers/score_sde_vp - title: VP-SDE - - local: api/schedulers/vq_diffusion - title: VQDiffusionScheduler - title: Schedulers - - sections: - - local: api/experimental/rl - title: RL Planning - title: Experimental Features - title: API \ No newline at end of file From 11aa105077c4877c03bc2e49b67876222d9c5fad Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 12 Jun 2023 10:04:45 +0200 Subject: [PATCH 111/199] Correct Token to upload docs (#3744) clean up more --- .github/workflows/upload_pr_documentation.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/upload_pr_documentation.yml b/.github/workflows/upload_pr_documentation.yml index 927670d666b4..fc102df8103e 100644 --- a/.github/workflows/upload_pr_documentation.yml +++ b/.github/workflows/upload_pr_documentation.yml @@ -12,5 +12,5 @@ jobs: with: package_name: diffusers secrets: - hf_token: ${{ secrets.HF_DOC_PUSH }} - comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} From 790212f4d992641f2cb17d2d6c4eaeff0c91e741 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 12 Jun 2023 10:29:23 +0200 Subject: [PATCH 112/199] Correct another push token (#3745) clean up more --- .github/workflows/build_documentation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index e8741c81f5bb..6fc8d343cd91 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -18,4 +18,4 @@ jobs: languages: en ko zh secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} - hf_token: ${{ secrets.HF_DOC_PUSH }} + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} From 38adcd21bd9cc69f84710deb161c578b6478944f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 12 Jun 2023 13:59:38 +0200 Subject: [PATCH 113/199] [Stable Diffusion Inpaint & ControlNet inpaint] Correct timestep inpaint (#3749) * Correct timestep inpaint * make style * Fix * Apply suggestions from code review * make style --- src/diffusers/pipelines/controlnet/pipeline_controlnet.py | 2 +- 
.../pipelines/controlnet/pipeline_controlnet_img2img.py | 2 +- .../pipelines/controlnet/pipeline_controlnet_inpaint.py | 7 +++++-- .../stable_diffusion/pipeline_stable_diffusion_inpaint.py | 5 ++++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 89398b6f01f9..2a86ee0dfe1e 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -611,7 +611,7 @@ def check_image(self, image, prompt, prompt_embeds): and not image_is_np_list ): raise TypeError( - "image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors" + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" ) if image_is_pil: diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index 0e984d8ae5e3..e42b27958446 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -638,7 +638,7 @@ def check_image(self, image, prompt, prompt_embeds): and not image_is_np_list ): raise TypeError( - "image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors" + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" ) if image_is_pil: diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 5ce2fd5543b8..165e2d88dca6 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -770,7 +770,7 @@ def check_image(self, image, prompt, prompt_embeds): and not image_is_np_list ): raise TypeError( - "image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors" + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" ) if image_is_pil: @@ -1306,7 +1306,10 @@ def __call__( init_mask = mask[:1] if i < len(timesteps) - 1: - init_latents_proper = self.scheduler.add_noise(init_latents_proper, noise, torch.tensor([t])) + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) latents = (1 - init_mask) * init_latents_proper + init_mask * latents diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index b07a5555f1c7..d958f0e3fb72 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -1038,7 +1038,10 @@ def __call__( init_mask = mask[:1] if i < len(timesteps) - 1: - init_latents_proper = self.scheduler.add_noise(init_latents_proper, noise, torch.tensor([t])) + noise_timestep = timesteps[i + 1] + 
init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) latents = (1 - init_mask) * init_latents_proper + init_mask * latents From 1488180a4710e9cdcead5295ebd8ff1e4eb8443a Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Mon, 12 Jun 2023 17:59:03 +0530 Subject: [PATCH 114/199] attention weight loading fix --- .../train_consistency_distillation.py | 10 +++--- scripts/convert_consistency_to_diffusers.py | 34 ++++++------------- 2 files changed, 15 insertions(+), 29 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index c4712fa4efcc..b664cbe8bb39 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -20,7 +20,7 @@ from tqdm.auto import tqdm import diffusers -from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel, CMStochasticIterativeScheduler +from diffusers import DDPMPipeline, UNet2DModel, CMStochasticIterativeScheduler, ConsistencyModelPipeline from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available @@ -584,7 +584,7 @@ def transform_images(examples): # Sample noise that we'll add to the images noise = torch.randn(clean_images.shape).to(clean_images.device) bsz = clean_images.shape[0] - # Sample a random timestep for each image + # Sample a random timestep for each image, TODO - allow different timesteps in a batch index = torch.randint( 0, noise_scheduler.config.num_train_timesteps-1, (1,), device=clean_images.device ).long() @@ -668,7 +668,7 @@ def transform_images(examples): ema_model.store(unet.parameters()) ema_model.copy_to(unet.parameters()) - pipeline = DDPMPipeline( + pipeline = ConsistencyModelPipeline( unet=unet, scheduler=noise_scheduler, ) @@ -678,7 +678,7 @@ def transform_images(examples): images = pipeline( generator=generator, batch_size=args.eval_batch_size, - num_inference_steps=args.ddpm_num_inference_steps, + num_inference_steps=1, output_type="numpy", ).images @@ -709,7 +709,7 @@ def transform_images(examples): ema_model.store(unet.parameters()) ema_model.copy_to(unet.parameters()) - pipeline = DDPMPipeline( + pipeline = ConsistencyModelPipeline( unet=unet, scheduler=noise_scheduler, ) diff --git a/scripts/convert_consistency_to_diffusers.py b/scripts/convert_consistency_to_diffusers.py index 6a8e8eb938e0..923b72815e66 100644 --- a/scripts/convert_consistency_to_diffusers.py +++ b/scripts/convert_consistency_to_diffusers.py @@ -81,35 +81,21 @@ def convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip= return new_checkpoint -def convert_attention(checkpoint, new_checkpoint, old_prefix, new_prefix, attention_head_dim=64): - c, _, _, _ = checkpoint[f"{old_prefix}.qkv.weight"].shape - n_heads = c // (attention_head_dim * 3) - old_weights = checkpoint[f"{old_prefix}.qkv.weight"].reshape(n_heads, attention_head_dim * 3, -1, 1, 1) - old_biases = checkpoint[f"{old_prefix}.qkv.bias"].reshape(n_heads, attention_head_dim * 3, -1, 1, 1) - - weight_q, weight_k, weight_v = old_weights.chunk(3, dim=1) - weight_q = weight_q.reshape(n_heads * attention_head_dim, -1, 1, 1) - weight_k = weight_k.reshape(n_heads * attention_head_dim, -1, 1, 1) - weight_v = weight_v.reshape(n_heads * attention_head_dim, -1, 1, 1) - 
- bias_q, bias_k, bias_v = old_biases.chunk(3, dim=1) - bias_q = bias_q.reshape(n_heads * attention_head_dim, -1, 1, 1) - bias_k = bias_k.reshape(n_heads * attention_head_dim, -1, 1, 1) - bias_v = bias_v.reshape(n_heads * attention_head_dim, -1, 1, 1) +def convert_attention(checkpoint, new_checkpoint, old_prefix, new_prefix, attention_dim=None): + weight_q, weight_k, weight_v = checkpoint[f"{old_prefix}.qkv.weight"].chunk(3, dim=0) + bias_q, bias_k, bias_v = checkpoint[f"{old_prefix}.qkv.bias"].chunk(3, dim=0) new_checkpoint[f"{new_prefix}.group_norm.weight"] = checkpoint[f"{old_prefix}.norm.weight"] new_checkpoint[f"{new_prefix}.group_norm.bias"] = checkpoint[f"{old_prefix}.norm.bias"] - new_checkpoint[f"{new_prefix}.to_q.weight"] = torch.squeeze(weight_q) - new_checkpoint[f"{new_prefix}.to_q.bias"] = torch.squeeze(bias_q) - new_checkpoint[f"{new_prefix}.to_k.weight"] = torch.squeeze(weight_k) - new_checkpoint[f"{new_prefix}.to_k.bias"] = torch.squeeze(bias_k) - new_checkpoint[f"{new_prefix}.to_v.weight"] = torch.squeeze(weight_v) - new_checkpoint[f"{new_prefix}.to_v.bias"] = torch.squeeze(bias_v) + new_checkpoint[f"{new_prefix}.to_q.weight"] = weight_q.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_q.bias"] = bias_q.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_k.weight"] = weight_k.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_k.bias"] = bias_k.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_v.weight"] = weight_v.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_v.bias"] = bias_v.squeeze(-1).squeeze(-1) - new_checkpoint[f"{new_prefix}.to_out.0.weight"] = ( - checkpoint[f"{old_prefix}.proj_out.weight"].squeeze(-1).squeeze(-1) - ) + new_checkpoint[f"{new_prefix}.to_out.0.weight"] = checkpoint[f"{old_prefix}.proj_out.weight"].squeeze(-1).squeeze(-1) new_checkpoint[f"{new_prefix}.to_out.0.bias"] = checkpoint[f"{old_prefix}.proj_out.bias"].squeeze(-1).squeeze(-1) return new_checkpoint From b2b13cd3150d8a2f12c637263d822e51d9fc5b5c Mon Sep 17 00:00:00 2001 From: JeLuF Date: Mon, 12 Jun 2023 17:40:48 +0200 Subject: [PATCH 115/199] [Documentation] Replace dead link to Flax install guide (#3739) Replace dead link to Flax documentation Replace the dead link to the Flax installation guide by a working one: https://flax.readthedocs.io/en/latest/#installation --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c2a3b04b57a8..6c66be9f2463 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ ## Installation -We recommend installing 🤗 Diffusers in a virtual environment from PyPi or Conda. For more details about installing [PyTorch](https://pytorch.org/get-started/locally/) and [Flax](https://flax.readthedocs.io/en/latest/installation.html), please refer to their official documentation. +We recommend installing 🤗 Diffusers in a virtual environment from PyPi or Conda. For more details about installing [PyTorch](https://pytorch.org/get-started/locally/) and [Flax](https://flax.readthedocs.io/en/latest/#installation), please refer to their official documentation. 
### PyTorch From f46b22ba134f21a3a62154bf0528ecda290029fd Mon Sep 17 00:00:00 2001 From: Liam Swayne <108629034+LiamSwayne@users.noreply.github.com> Date: Mon, 12 Jun 2023 11:42:01 -0400 Subject: [PATCH 116/199] [documentation] grammatical fixes in installation.mdx (#3735) Update installation.mdx --- docs/source/en/installation.mdx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/installation.mdx b/docs/source/en/installation.mdx index 218ccd7bc4f6..50df14be3f77 100644 --- a/docs/source/en/installation.mdx +++ b/docs/source/en/installation.mdx @@ -23,7 +23,7 @@ Install 🤗 Diffusers for whichever deep learning library you're working with. You should install 🤗 Diffusers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). -A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies. +A virtual environment makes it easier to manage different projects and avoid compatibility issues between dependencies. Start by creating a virtual environment in your project directory: @@ -127,7 +127,7 @@ Your Python environment will find the `main` version of 🤗 Diffusers on the ne Our library gathers telemetry information during `from_pretrained()` requests. This data includes the version of Diffusers and PyTorch/Flax, the requested model or pipeline class, -and the path to a pretrained checkpoint if it is hosted on the Hub. +and the path to a pre-trained checkpoint if it is hosted on the Hub. This usage data helps us debug issues and prioritize new features. Telemetry is only sent when loading models and pipelines from the HuggingFace Hub, and is not collected during local usage. @@ -143,4 +143,4 @@ export DISABLE_TELEMETRY=YES On Windows: ```bash set DISABLE_TELEMETRY=YES -``` \ No newline at end of file +``` From a812fb6f5c2f147a6d98c994effdaa5c2087e53b Mon Sep 17 00:00:00 2001 From: Andranik Movsisyan <48154088+19and99@users.noreply.github.com> Date: Mon, 12 Jun 2023 20:03:18 +0400 Subject: [PATCH 117/199] Text2video zero refinements (#3733) * fix docs typos. add frame_ids argument to text2video-zero pipeline call * make style && make quality * add support of pytorch 2.0 scaled_dot_product_attention for CrossFrameAttnProcessor * add chunk-by-chunk processing to text2video-zero docs * make style && make quality * Update docs/source/en/api/pipelines/text_to_video_zero.mdx Co-authored-by: Sayak Paul --------- Co-authored-by: Sayak Paul --- .../en/api/pipelines/text_to_video_zero.mdx | 41 +++++++- .../pipeline_text_to_video_zero.py | 98 +++++++++++++++++-- 2 files changed, 130 insertions(+), 9 deletions(-) diff --git a/docs/source/en/api/pipelines/text_to_video_zero.mdx b/docs/source/en/api/pipelines/text_to_video_zero.mdx index 3ee10f01c377..3c3dcf5bb1ad 100644 --- a/docs/source/en/api/pipelines/text_to_video_zero.mdx +++ b/docs/source/en/api/pipelines/text_to_video_zero.mdx @@ -80,6 +80,41 @@ You can change these parameters in the pipeline call: * Video length: * `video_length`, the number of frames video_length to be generated. 
Default: `video_length=8` +We an also generate longer videos by doing the processing in a chunk-by-chunk manner: +```python +import torch +import imageio +from diffusers import TextToVideoZeroPipeline +import numpy as np + +model_id = "runwayml/stable-diffusion-v1-5" +pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda") +seed = 0 +video_length = 8 +chunk_size = 4 +prompt = "A panda is playing guitar on times square" + +# Generate the video chunk-by-chunk +result = [] +chunk_ids = np.arange(0, video_length, chunk_size - 1) +generator = torch.Generator(device="cuda") +for i in range(len(chunk_ids)): + print(f"Processing chunk {i + 1} / {len(chunk_ids)}") + ch_start = chunk_ids[i] + ch_end = video_length if i == len(chunk_ids) - 1 else chunk_ids[i + 1] + # Attach the first frame for Cross Frame Attention + frame_ids = [0] + list(range(ch_start, ch_end)) + # Fix the seed for the temporal consistency + generator.manual_seed(seed) + output = pipe(prompt=prompt, video_length=len(frame_ids), generator=generator, frame_ids=frame_ids) + result.append(output.images[1:]) + +# Concatenate chunks and save +result = np.concatenate(result) +result = [(r * 255).astype("uint8") for r in result] +imageio.mimsave("video.mp4", result, fps=4) +``` + ### Text-To-Video with Pose Control To generate a video from prompt with additional pose control @@ -202,7 +237,7 @@ can run with custom [DreamBooth](../training/dreambooth) models, as shown below reader = imageio.get_reader(video_path, "ffmpeg") frame_count = 8 - video = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)] + canny_edges = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)] ``` 3. Run `StableDiffusionControlNetPipeline` with custom trained DreamBooth model @@ -223,10 +258,10 @@ can run with custom [DreamBooth](../training/dreambooth) models, as shown below pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) # fix latents for all frames - latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) + latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(canny_edges), 1, 1, 1) prompt = "oil painting of a beautiful girl avatar style" - result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images + result = pipe(prompt=[prompt] * len(canny_edges), image=canny_edges, latents=latents).images imageio.mimsave("video.mp4", result, fps=4) ``` diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py index 5b163bbbc8f5..fe7207f904f0 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_zero.py @@ -38,12 +38,12 @@ def rearrange_4(tensor): class CrossFrameAttnProcessor: """ - Cross frame attention processor. For each frame the self-attention is replaced with attention with first frame + Cross frame attention processor. Each frame attends the first frame. Args: batch_size: The number that represents actual batch size, other than the frames. - For example, using calling unet with a single prompt and num_images_per_prompt=1, batch_size should be - equal to 2, due to classifier-free guidance. + For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to + 2, due to classifier-free guidance. 
""" def __init__(self, batch_size=2): @@ -63,7 +63,7 @@ def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_ma key = attn.to_k(encoder_hidden_states) value = attn.to_v(encoder_hidden_states) - # Sparse Attention + # Cross Frame Attention if not is_cross_attention: video_length = key.size()[0] // self.batch_size first_frame_index = [0] * video_length @@ -95,6 +95,81 @@ def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_ma return hidden_states +class CrossFrameAttnProcessor2_0: + """ + Cross frame attention processor with scaled_dot_product attention of Pytorch 2.0. + + Args: + batch_size: The number that represents actual batch size, other than the frames. + For example, calling unet with a single prompt and num_images_per_prompt=1, batch_size should be equal to + 2, due to classifier-free guidance. + """ + + def __init__(self, batch_size=2): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + self.batch_size = batch_size + + def __call__(self, attn, hidden_states, encoder_hidden_states=None, attention_mask=None): + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + inner_dim = hidden_states.shape[-1] + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + query = attn.to_q(hidden_states) + + is_cross_attention = encoder_hidden_states is not None + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + # Cross Frame Attention + if not is_cross_attention: + video_length = key.size()[0] // self.batch_size + first_frame_index = [0] * video_length + + # rearrange keys to have batch and frames in the 1st and 2nd dims respectively + key = rearrange_3(key, video_length) + key = key[:, first_frame_index] + # rearrange values to have batch and frames in the 1st and 2nd dims respectively + value = rearrange_3(value, video_length) + value = value[:, first_frame_index] + + # rearrange back to original shape + key = rearrange_4(key) + value = rearrange_4(value) + + head_dim = inner_dim // attn.heads + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + return hidden_states + + @dataclass class TextToVideoPipelineOutput(BaseOutput): images: Union[List[PIL.Image.Image], np.ndarray] @@ -227,7 
+302,12 @@ def __init__( super().__init__( vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker ) - self.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) + processor = ( + CrossFrameAttnProcessor2_0(batch_size=2) + if hasattr(F, "scaled_dot_product_attention") + else CrossFrameAttnProcessor(batch_size=2) + ) + self.unet.set_attn_processor(processor) def forward_loop(self, x_t0, t0, t1, generator): """ @@ -338,6 +418,7 @@ def __call__( callback_steps: Optional[int] = 1, t0: int = 44, t1: int = 47, + frame_ids: Optional[List[int]] = None, ): """ Function invoked when calling the pipeline for generation. @@ -399,6 +480,9 @@ def __call__( t1 (`int`, *optional*, defaults to 47): Timestep t0. Should be in the range [t0 + 1, num_inference_steps - 1]. See the [paper](https://arxiv.org/abs/2303.13439), Sect. 3.3.1. + frame_ids (`List[int]`, *optional*): + Indexes of the frames that are being generated. This is used when generating longer videos + chunk-by-chunk. Returns: [`~pipelines.text_to_video_synthesis.TextToVideoPipelineOutput`]: @@ -407,7 +491,9 @@ def __call__( likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ assert video_length > 0 - frame_ids = list(range(video_length)) + if frame_ids is None: + frame_ids = list(range(video_length)) + assert len(frame_ids) == video_length assert num_videos_per_prompt == 1 From ef9590712a19619f187441dddcb077bcf0b17369 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 12 Jun 2023 18:28:30 +0200 Subject: [PATCH 118/199] [Tests] Relax tolerance of flaky failing test (#3755) relax tolerance slightly --- tests/models/test_models_unet_3d_condition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py index 762c4975da51..4193b6e17bd3 100644 --- a/tests/models/test_models_unet_3d_condition.py +++ b/tests/models/test_models_unet_3d_condition.py @@ -261,7 +261,7 @@ def test_lora_save_load(self): with torch.no_grad(): new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - assert (sample - new_sample).abs().max() < 5e-4 + assert (sample - new_sample).abs().max() < 1e-3 # LoRA and no LoRA should NOT be the same assert (sample - old_sample).abs().max() > 1e-4 From 34d14d78489f82b4abebdb62d90545312171c033 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 12 Jun 2023 18:29:58 +0200 Subject: [PATCH 119/199] [MultiControlNet] Allow save and load (#3747) * [MultiControlNet] Allow save and load * Correct more * [MultiControlNet] Allow save and load * make style * Apply suggestions from code review --- .../pipelines/controlnet/multicontrolnet.py | 121 +++++++++++++++++- .../controlnet/pipeline_controlnet.py | 15 +-- .../controlnet/pipeline_controlnet_img2img.py | 15 +-- .../controlnet/pipeline_controlnet_inpaint.py | 15 +-- tests/pipelines/controlnet/test_controlnet.py | 15 --- .../controlnet/test_controlnet_img2img.py | 15 --- .../controlnet/test_controlnet_inpaint.py | 15 --- 7 files changed, 123 insertions(+), 88 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/multicontrolnet.py b/src/diffusers/pipelines/controlnet/multicontrolnet.py index 91d40b20124c..921895b8fd92 100644 --- a/src/diffusers/pipelines/controlnet/multicontrolnet.py +++ b/src/diffusers/pipelines/controlnet/multicontrolnet.py @@ -1,10 +1,15 @@ -from typing import Any, Dict, List, Optional, Tuple, Union +import os +from typing 
import Any, Callable, Dict, List, Optional, Tuple, Union import torch from torch import nn from ...models.controlnet import ControlNetModel, ControlNetOutput from ...models.modeling_utils import ModelMixin +from ...utils import logging + + +logger = logging.get_logger(__name__) class MultiControlNetModel(ModelMixin): @@ -64,3 +69,117 @@ def forward( mid_block_res_sample += mid_sample return down_block_res_samples, mid_block_res_sample + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + save_function: Callable = None, + safe_serialization: bool = False, + variant: Optional[str] = None, + ): + """ + Save a model and its configuration file to a directory, so that it can be re-loaded using the + `[`~pipelines.controlnet.MultiControlNetModel.from_pretrained`]` class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful when in distributed training like + TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on + the main process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful on distributed training like TPUs when one + need to replace `torch.save` by another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `False`): + Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + variant (`str`, *optional*): + If specified, weights are saved in the format pytorch_model..bin. + """ + idx = 0 + model_path_to_save = save_directory + for controlnet in self.nets: + controlnet.save_pretrained( + model_path_to_save, + is_main_process=is_main_process, + save_function=save_function, + safe_serialization=safe_serialization, + variant=variant, + ) + + idx += 1 + model_path_to_save = model_path_to_save + f"_{idx}" + + @classmethod + def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs): + r""" + Instantiate a pretrained MultiControlNet model from multiple pre-trained controlnet models. + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train + the model, you should first set it back in training mode with `model.train()`. + + The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come + pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning + task. + + The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those + weights are discarded. + + Parameters: + pretrained_model_path (`os.PathLike`): + A path to a *directory* containing model weights saved using + [`~diffusers.pipelines.controlnet.MultiControlNetModel.save_pretrained`], e.g., + `./my_model_directory/controlnet`. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype + will be automatically derived from the model's weights. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. 
+ device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn't need to be refined to each + parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the + same device. + + To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier to maximum memory. Will default to the maximum memory available for each + GPU and the available CPU RAM if unset. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading by not initializing the weights and only loading the pre-trained weights. This + also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the + model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch, + setting this argument to `True` will raise an error. + variant (`str`, *optional*): + If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. `variant` is + ignored when using `from_flax`. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the + `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from + `safetensors` weights. If set to `False`, loading will *not* use `safetensors`. + """ + idx = 0 + controlnets = [] + + # load controlnet and append to list until no controlnet directory exists anymore + # first controlnet has to be saved under `./mydirectory/controlnet` to be compliant with `DiffusionPipeline.from_prertained` + # second, third, ... controlnets have to be saved under `./mydirectory/controlnet_1`, `./mydirectory/controlnet_2`, ... + model_path_to_load = pretrained_model_path + while os.path.isdir(model_path_to_load): + controlnet = ControlNetModel.from_pretrained(model_path_to_load, **kwargs) + controlnets.append(controlnet) + + idx += 1 + model_path_to_load = pretrained_model_path + f"_{idx}" + + logger.info(f"{len(controlnets)} controlnets loaded from {pretrained_model_path}.") + + if len(controlnets) == 0: + raise ValueError( + f"No ControlNets found under {os.path.dirname(pretrained_model_path)}. Expected at least {pretrained_model_path + '_0'}." + ) + + return cls(controlnets) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 2a86ee0dfe1e..974d28fd5b05 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -14,7 +14,6 @@ import inspect -import os import warnings from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -560,7 +559,7 @@ def check_inputs( raise ValueError("A single batch of multiple conditionings are supported at the moment.") elif len(image) != len(self.controlnet.nets): raise ValueError( - "For multiple controlnets: `image` must have the same length as the number of controlnets." + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." 
) for image_ in image: @@ -679,18 +678,6 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype latents = latents * self.scheduler.init_noise_sigma return latents - # override DiffusionPipeline - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool = False, - variant: Optional[str] = None, - ): - if isinstance(self.controlnet, ControlNetModel): - super().save_pretrained(save_directory, safe_serialization, variant) - else: - raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.") - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index e42b27958446..febe4c8a5734 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -14,7 +14,6 @@ import inspect -import os import warnings from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -586,7 +585,7 @@ def check_inputs( raise ValueError("A single batch of multiple conditionings are supported at the moment.") elif len(image) != len(self.controlnet.nets): raise ValueError( - "For multiple controlnets: `image` must have the same length as the number of controlnets." + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." ) for image_ in image: @@ -757,18 +756,6 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt return latents - # override DiffusionPipeline - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool = False, - variant: Optional[str] = None, - ): - if isinstance(self.controlnet, ControlNetModel): - super().save_pretrained(save_directory, safe_serialization, variant) - else: - raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.") - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index 165e2d88dca6..cab689d1d4e2 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -15,7 +15,6 @@ # This model implementation is heavily inspired by https://github.com/haofanwang/ControlNet-for-Diffusers/ import inspect -import os import warnings from typing import Any, Callable, Dict, List, Optional, Tuple, Union @@ -718,7 +717,7 @@ def check_inputs( raise ValueError("A single batch of multiple conditionings are supported at the moment.") elif len(image) != len(self.controlnet.nets): raise ValueError( - "For multiple controlnets: `image` must have the same length as the number of controlnets." + f"For multiple controlnets: `image` must have the same length as the number of controlnets, but got {len(image)} images and {len(self.controlnet.nets)} ControlNets." 
) for image_ in image: @@ -957,18 +956,6 @@ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): return image_latents - # override DiffusionPipeline - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - safe_serialization: bool = False, - variant: Optional[str] = None, - ): - if isinstance(self.controlnet, ControlNetModel): - super().save_pretrained(save_directory, safe_serialization, variant) - else: - raise NotImplementedError("Currently, the `save_pretrained()` is not implemented for Multi-ControlNet.") - @torch.no_grad() @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 9915998be24e..37d0f722fa70 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -346,21 +346,6 @@ def test_save_pretrained_raise_not_implemented_exception(self): except NotImplementedError: pass - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_float16(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_local(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_optional_components(self): - ... - @slow @require_torch_gpu diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index de8f578a3cce..18262149bb49 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -304,21 +304,6 @@ def test_save_pretrained_raise_not_implemented_exception(self): except NotImplementedError: pass - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_float16(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_local(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_optional_components(self): - ... - @slow @require_torch_gpu diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index 0f8808bcb728..b351ccfbf8f9 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -382,21 +382,6 @@ def test_save_pretrained_raise_not_implemented_exception(self): except NotImplementedError: pass - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_float16(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_local(self): - ... - - # override PipelineTesterMixin - @unittest.skip("save pretrained not implemented") - def test_save_load_optional_components(self): - ... - @slow @require_torch_gpu From ce5504934ac484fca39a1a5434ecfae09eabdf41 Mon Sep 17 00:00:00 2001 From: jfozard Date: Tue, 13 Jun 2023 01:25:46 +0100 Subject: [PATCH 120/199] Update pipeline_flax_stable_diffusion_controlnet.py (#3306) Update pipeline_flax_controlnet.py Change type of images array from jax.numpy.array to numpy.ndarray to permit in-place modification of the array when the safety checker detects a NSFW image. 
--- src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py index 6003fc96b0ad..872297605683 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_flax_controlnet.py @@ -464,7 +464,7 @@ def __call__( images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3) images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit) - images = np.asarray(images) + images = np.array(images) # block images if any(has_nsfw_concept): From 7761b89d7bf18bad00aa989b1e1bb0369f4e4293 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 14 Jun 2023 06:57:53 -1000 Subject: [PATCH 121/199] update conversion script for Kandinsky unet (#3766) * update kandinsky conversion script * style --------- Co-authored-by: yiyixuxu --- scripts/convert_kandinsky_to_diffusers.py | 167 ++++++++++++---------- 1 file changed, 89 insertions(+), 78 deletions(-) diff --git a/scripts/convert_kandinsky_to_diffusers.py b/scripts/convert_kandinsky_to_diffusers.py index de9879f7f03b..1b5722f5d5f3 100644 --- a/scripts/convert_kandinsky_to_diffusers.py +++ b/scripts/convert_kandinsky_to_diffusers.py @@ -8,7 +8,6 @@ from diffusers import UNet2DConditionModel from diffusers.models.prior_transformer import PriorTransformer from diffusers.models.vq_model import VQModel -from diffusers.pipelines.kandinsky.text_proj import KandinskyTextProjModel """ @@ -225,37 +224,55 @@ def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix UNET_CONFIG = { "act_fn": "silu", + "addition_embed_type": "text_image", + "addition_embed_type_num_heads": 64, "attention_head_dim": 64, - "block_out_channels": (384, 768, 1152, 1536), + "block_out_channels": [384, 768, 1152, 1536], "center_input_sample": False, - "class_embed_type": "identity", + "class_embed_type": None, + "class_embeddings_concat": False, + "conv_in_kernel": 3, + "conv_out_kernel": 3, "cross_attention_dim": 768, - "down_block_types": ( + "cross_attention_norm": None, + "down_block_types": [ "ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D", "SimpleCrossAttnDownBlock2D", "SimpleCrossAttnDownBlock2D", - ), + ], "downsample_padding": 1, "dual_cross_attention": False, + "encoder_hid_dim": 1024, + "encoder_hid_dim_type": "text_image_proj", "flip_sin_to_cos": True, "freq_shift": 0, "in_channels": 4, "layers_per_block": 3, + "mid_block_only_cross_attention": None, "mid_block_scale_factor": 1, "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", "norm_eps": 1e-05, "norm_num_groups": 32, + "num_class_embeds": None, "only_cross_attention": False, "out_channels": 8, + "projection_class_embeddings_input_dim": None, + "resnet_out_scale_factor": 1.0, + "resnet_skip_time_act": False, "resnet_time_scale_shift": "scale_shift", "sample_size": 64, - "up_block_types": ( + "time_cond_proj_dim": None, + "time_embedding_act_fn": None, + "time_embedding_dim": None, + "time_embedding_type": "positional", + "timestep_post_act": None, + "up_block_types": [ "SimpleCrossAttnUpBlock2D", "SimpleCrossAttnUpBlock2D", "SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D", - ), + ], "upcast_attention": False, "use_linear_projection": False, } @@ -274,6 +291,8 @@ def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): 
diffusers_checkpoint.update(unet_time_embeddings(checkpoint)) diffusers_checkpoint.update(unet_conv_in(checkpoint)) + diffusers_checkpoint.update(unet_add_embedding(checkpoint)) + diffusers_checkpoint.update(unet_encoder_hid_proj(checkpoint)) # .input_blocks -> .down_blocks @@ -336,37 +355,55 @@ def unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): INPAINT_UNET_CONFIG = { "act_fn": "silu", + "addition_embed_type": "text_image", + "addition_embed_type_num_heads": 64, "attention_head_dim": 64, - "block_out_channels": (384, 768, 1152, 1536), + "block_out_channels": [384, 768, 1152, 1536], "center_input_sample": False, - "class_embed_type": "identity", + "class_embed_type": None, + "class_embeddings_concat": None, + "conv_in_kernel": 3, + "conv_out_kernel": 3, "cross_attention_dim": 768, - "down_block_types": ( + "cross_attention_norm": None, + "down_block_types": [ "ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D", "SimpleCrossAttnDownBlock2D", "SimpleCrossAttnDownBlock2D", - ), + ], "downsample_padding": 1, "dual_cross_attention": False, + "encoder_hid_dim": 1024, + "encoder_hid_dim_type": "text_image_proj", "flip_sin_to_cos": True, "freq_shift": 0, "in_channels": 9, "layers_per_block": 3, + "mid_block_only_cross_attention": None, "mid_block_scale_factor": 1, "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", "norm_eps": 1e-05, "norm_num_groups": 32, + "num_class_embeds": None, "only_cross_attention": False, "out_channels": 8, + "projection_class_embeddings_input_dim": None, + "resnet_out_scale_factor": 1.0, + "resnet_skip_time_act": False, "resnet_time_scale_shift": "scale_shift", "sample_size": 64, - "up_block_types": ( + "time_cond_proj_dim": None, + "time_embedding_act_fn": None, + "time_embedding_dim": None, + "time_embedding_type": "positional", + "timestep_post_act": None, + "up_block_types": [ "SimpleCrossAttnUpBlock2D", "SimpleCrossAttnUpBlock2D", "SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D", - ), + ], "upcast_attention": False, "use_linear_projection": False, } @@ -381,10 +418,12 @@ def inpaint_unet_model_from_original_config(): def inpaint_unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): diffusers_checkpoint = {} - num_head_channels = UNET_CONFIG["attention_head_dim"] + num_head_channels = INPAINT_UNET_CONFIG["attention_head_dim"] diffusers_checkpoint.update(unet_time_embeddings(checkpoint)) diffusers_checkpoint.update(unet_conv_in(checkpoint)) + diffusers_checkpoint.update(unet_add_embedding(checkpoint)) + diffusers_checkpoint.update(unet_encoder_hid_proj(checkpoint)) # .input_blocks -> .down_blocks @@ -440,38 +479,6 @@ def inpaint_unet_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): # done inpaint unet -# text proj - -TEXT_PROJ_CONFIG = {} - - -def text_proj_from_original_config(): - model = KandinskyTextProjModel(**TEXT_PROJ_CONFIG) - return model - - -# Note that the input checkpoint is the original text2img model checkpoint -def text_proj_original_checkpoint_to_diffusers_checkpoint(checkpoint): - diffusers_checkpoint = { - # .text_seq_proj.0 -> .encoder_hidden_states_proj - "encoder_hidden_states_proj.weight": checkpoint["to_model_dim_n.weight"], - "encoder_hidden_states_proj.bias": checkpoint["to_model_dim_n.bias"], - # .clip_tok_proj -> .clip_extra_context_tokens_proj - "clip_extra_context_tokens_proj.weight": checkpoint["clip_to_seq.weight"], - "clip_extra_context_tokens_proj.bias": checkpoint["clip_to_seq.bias"], - # .proj_n -> .embedding_proj - "embedding_proj.weight": checkpoint["proj_n.weight"], - 
"embedding_proj.bias": checkpoint["proj_n.bias"], - # .ln_model_n -> .embedding_norm - "embedding_norm.weight": checkpoint["ln_model_n.weight"], - "embedding_norm.bias": checkpoint["ln_model_n.bias"], - # .clip_emb -> .clip_image_embeddings_project_to_time_embeddings - "clip_image_embeddings_project_to_time_embeddings.weight": checkpoint["img_layer.weight"], - "clip_image_embeddings_project_to_time_embeddings.bias": checkpoint["img_layer.bias"], - } - - return diffusers_checkpoint - # unet utils @@ -506,6 +513,38 @@ def unet_conv_in(checkpoint): return diffusers_checkpoint +def unet_add_embedding(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "add_embedding.text_norm.weight": checkpoint["ln_model_n.weight"], + "add_embedding.text_norm.bias": checkpoint["ln_model_n.bias"], + "add_embedding.text_proj.weight": checkpoint["proj_n.weight"], + "add_embedding.text_proj.bias": checkpoint["proj_n.bias"], + "add_embedding.image_proj.weight": checkpoint["img_layer.weight"], + "add_embedding.image_proj.bias": checkpoint["img_layer.bias"], + } + ) + + return diffusers_checkpoint + + +def unet_encoder_hid_proj(checkpoint): + diffusers_checkpoint = {} + + diffusers_checkpoint.update( + { + "encoder_hid_proj.image_embeds.weight": checkpoint["clip_to_seq.weight"], + "encoder_hid_proj.image_embeds.bias": checkpoint["clip_to_seq.bias"], + "encoder_hid_proj.text_proj.weight": checkpoint["to_model_dim_n.weight"], + "encoder_hid_proj.text_proj.bias": checkpoint["to_model_dim_n.bias"], + } + ) + + return diffusers_checkpoint + + # .out.0 -> .conv_norm_out def unet_conv_norm_out(checkpoint): diffusers_checkpoint = {} @@ -857,25 +896,13 @@ def text2img(*, args, checkpoint_map_location): unet_diffusers_checkpoint = unet_original_checkpoint_to_diffusers_checkpoint(unet_model, text2img_checkpoint) - # text proj interlude - - # The original decoder implementation includes a set of parameters that are used - # for creating the `encoder_hidden_states` which are what the U-net is conditioned - # on. The diffusers conditional unet directly takes the encoder_hidden_states. We pull - # the parameters into the KandinskyTextProjModel class - text_proj_model = text_proj_from_original_config() - - text_proj_checkpoint = text_proj_original_checkpoint_to_diffusers_checkpoint(text2img_checkpoint) - - load_checkpoint_to_model(text_proj_checkpoint, text_proj_model, strict=True) - del text2img_checkpoint load_checkpoint_to_model(unet_diffusers_checkpoint, unet_model, strict=True) print("done loading text2img") - return unet_model, text_proj_model + return unet_model def inpaint_text2img(*, args, checkpoint_map_location): @@ -891,25 +918,13 @@ def inpaint_text2img(*, args, checkpoint_map_location): inpaint_unet_model, inpaint_text2img_checkpoint ) - # text proj interlude - - # The original decoder implementation includes a set of parameters that are used - # for creating the `encoder_hidden_states` which are what the U-net is conditioned - # on. The diffusers conditional unet directly takes the encoder_hidden_states. 
We pull - # the parameters into the KandinskyTextProjModel class - text_proj_model = text_proj_from_original_config() - - text_proj_checkpoint = text_proj_original_checkpoint_to_diffusers_checkpoint(inpaint_text2img_checkpoint) - - load_checkpoint_to_model(text_proj_checkpoint, text_proj_model, strict=True) - del inpaint_text2img_checkpoint load_checkpoint_to_model(inpaint_unet_diffusers_checkpoint, inpaint_unet_model, strict=True) print("done loading inpaint text2img") - return inpaint_unet_model, text_proj_model + return inpaint_unet_model # movq @@ -1384,15 +1399,11 @@ def load_checkpoint_to_model(checkpoint, model, strict=False): prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location) prior_model.save_pretrained(args.dump_path) elif args.debug == "text2img": - unet_model, text_proj_model = text2img(args=args, checkpoint_map_location=checkpoint_map_location) + unet_model = text2img(args=args, checkpoint_map_location=checkpoint_map_location) unet_model.save_pretrained(f"{args.dump_path}/unet") - text_proj_model.save_pretrained(f"{args.dump_path}/text_proj") elif args.debug == "inpaint_text2img": - inpaint_unet_model, inpaint_text_proj_model = inpaint_text2img( - args=args, checkpoint_map_location=checkpoint_map_location - ) + inpaint_unet_model = inpaint_text2img(args=args, checkpoint_map_location=checkpoint_map_location) inpaint_unet_model.save_pretrained(f"{args.dump_path}/inpaint_unet") - inpaint_text_proj_model.save_pretrained(f"{args.dump_path}/inpaint_text_proj") elif args.debug == "decoder": decoder = movq(args=args, checkpoint_map_location=checkpoint_map_location) decoder.save_pretrained(f"{args.dump_path}/decoder") From f96b7606582bd6ca1a779d7c346083f578352ac7 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Wed, 14 Jun 2023 10:21:39 -0700 Subject: [PATCH 122/199] [docs] Fix Colab notebook cells (#3777) fix colab notebook cells --- docs/source/en/quicktour.mdx | 5 +- docs/source/en/training/dreambooth.mdx | 2 - docs/source/en/training/lora.mdx | 2 - docs/source/en/training/text_inversion.mdx | 2 - docs/source/en/tutorials/basic_training.mdx | 5 +- .../custom_pipeline_examples.mdx | 2 + .../custom_pipeline_overview.mdx | 2 + docs/source/en/using-diffusers/img2img.mdx | 5 +- docs/source/en/using-diffusers/loading.mdx | 2 + .../en/using-diffusers/other-formats.mdx | 7 +- .../en/using-diffusers/reproducibility.mdx | 2 + .../en/using-diffusers/reusing_seeds.mdx | 2 + docs/source/en/using-diffusers/schedulers.mdx | 2 + .../stable_diffusion_jax_how_to.mdx | 7 +- .../en/using-diffusers/using_safetensors.mdx | 7 +- .../en/using-diffusers/weighted_prompts.mdx | 2 + .../en/using-diffusers/write_own_pipeline.mdx | 78 +++++++++---------- 17 files changed, 76 insertions(+), 58 deletions(-) diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx index 2a2a5a3ad903..b3ac68ca9611 100644 --- a/docs/source/en/quicktour.mdx +++ b/docs/source/en/quicktour.mdx @@ -32,8 +32,9 @@ The quicktour is a simplified version of the introductory 🧨 Diffusers [notebo Before you begin, make sure you have all the necessary libraries installed: -```bash -!pip install --upgrade diffusers accelerate transformers +```py +# uncomment to install the necessary libraries in Colab +#!pip install --upgrade diffusers accelerate transformers ``` - [🤗 Accelerate](https://huggingface.co/docs/accelerate/index) speeds up model loading for inference and training. 
diff --git a/docs/source/en/training/dreambooth.mdx b/docs/source/en/training/dreambooth.mdx index c26762d4a75d..6ca9c4531b82 100644 --- a/docs/source/en/training/dreambooth.mdx +++ b/docs/source/en/training/dreambooth.mdx @@ -12,8 +12,6 @@ specific language governing permissions and limitations under the License. # DreamBooth -[[open-in-colab]] - [DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text-to-image models like Stable Diffusion given just a few (3-5) images of a subject. It allows the model to generate contextualized images of the subject in different scenes, poses, and views. ![Dreambooth examples from the project's blog](https://dreambooth.github.io/DreamBooth_files/teaser_static.jpg) diff --git a/docs/source/en/training/lora.mdx b/docs/source/en/training/lora.mdx index 1208178810a5..dfb31c7ef87a 100644 --- a/docs/source/en/training/lora.mdx +++ b/docs/source/en/training/lora.mdx @@ -12,8 +12,6 @@ specific language governing permissions and limitations under the License. # Low-Rank Adaptation of Large Language Models (LoRA) -[[open-in-colab]] - Currently, LoRA is only supported for the attention layers of the [`UNet2DConditionalModel`]. We also diff --git a/docs/source/en/training/text_inversion.mdx b/docs/source/en/training/text_inversion.mdx index a4fe4c2c4e5b..050b0ca3d403 100644 --- a/docs/source/en/training/text_inversion.mdx +++ b/docs/source/en/training/text_inversion.mdx @@ -14,8 +14,6 @@ specific language governing permissions and limitations under the License. # Textual Inversion -[[open-in-colab]] - [Textual Inversion](https://arxiv.org/abs/2208.01618) is a technique for capturing novel concepts from a small number of example images. While the technique was originally demonstrated with a [latent diffusion model](https://github.com/CompVis/latent-diffusion), it has since been applied to other model variants like [Stable Diffusion](https://huggingface.co/docs/diffusers/main/en/conceptual/stable_diffusion). The learned concepts can be used to better control the images generated from text-to-image pipelines. It learns new "words" in the text encoder's embedding space, which are used within text prompts for personalized image generation. ![Textual Inversion example](https://textual-inversion.github.io/static/images/editing/colorful_teapot.JPG) diff --git a/docs/source/en/tutorials/basic_training.mdx b/docs/source/en/tutorials/basic_training.mdx index 99221274f745..c8f5c7fac780 100644 --- a/docs/source/en/tutorials/basic_training.mdx +++ b/docs/source/en/tutorials/basic_training.mdx @@ -26,8 +26,9 @@ This tutorial will teach you how to train a [`UNet2DModel`] from scratch on a su Before you begin, make sure you have 🤗 Datasets installed to load and preprocess image datasets, and 🤗 Accelerate, to simplify training on any number of GPUs. The following command will also install [TensorBoard](https://www.tensorflow.org/tensorboard) to visualize training metrics (you can also use [Weights & Biases](https://docs.wandb.ai/) to track your training). -```bash -!pip install diffusers[training] +```py +# uncomment to install the necessary libraries in Colab +#!pip install diffusers[training] ``` We encourage you to share your model with the community, and in order to do that, you'll need to login to your Hugging Face account (create one [here](https://hf.co/join) if you don't already have one!). 
You can login from a notebook and enter your token when prompted: diff --git a/docs/source/en/using-diffusers/custom_pipeline_examples.mdx b/docs/source/en/using-diffusers/custom_pipeline_examples.mdx index 93ac6d1f782c..f97a9ad09ac5 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_examples.mdx +++ b/docs/source/en/using-diffusers/custom_pipeline_examples.mdx @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Community pipelines +[[open-in-colab]] + > **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).** **Community** examples consist of both inference and training examples that have been added by the community. diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.mdx b/docs/source/en/using-diffusers/custom_pipeline_overview.mdx index 3c5df7c0dd6e..78a64b6bcb96 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_overview.mdx +++ b/docs/source/en/using-diffusers/custom_pipeline_overview.mdx @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Load community pipelines +[[open-in-colab]] + Community pipelines are any [`DiffusionPipeline`] class that are different from the original implementation as specified in their paper (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://arxiv.org/abs/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline. There are many cool community pipelines like [Speech to Image](https://github.com/huggingface/diffusers/tree/main/examples/community#speech-to-image) or [Composable Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#composable-stable-diffusion), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community). diff --git a/docs/source/en/using-diffusers/img2img.mdx b/docs/source/en/using-diffusers/img2img.mdx index 71540fbf5dd9..5b881b311a6a 100644 --- a/docs/source/en/using-diffusers/img2img.mdx +++ b/docs/source/en/using-diffusers/img2img.mdx @@ -18,8 +18,9 @@ The [`StableDiffusionImg2ImgPipeline`] lets you pass a text prompt and an initia Before you begin, make sure you have all the necessary libraries installed: -```bash -!pip install diffusers transformers ftfy accelerate +```py +# uncomment to install the necessary libraries in Colab +#!pip install diffusers transformers ftfy accelerate ``` Get started by creating a [`StableDiffusionImg2ImgPipeline`] with a pretrained Stable Diffusion model like [`nitrosocke/Ghibli-Diffusion`](https://huggingface.co/nitrosocke/Ghibli-Diffusion). diff --git a/docs/source/en/using-diffusers/loading.mdx b/docs/source/en/using-diffusers/loading.mdx index 24dd1dd04cd1..8ebd3569e4b0 100644 --- a/docs/source/en/using-diffusers/loading.mdx +++ b/docs/source/en/using-diffusers/loading.mdx @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Load pipelines, models, and schedulers +[[open-in-colab]] + Having an easy way to use a diffusion system for inference is essential to 🧨 Diffusers. Diffusion systems often consist of multiple components like parameterized models, tokenizers, and schedulers that interact in complex ways. 
That is why we designed the [`DiffusionPipeline`] to wrap the complexity of the entire diffusion system into an easy-to-use API, while remaining flexible enough to be adapted for other use cases, such as loading each component individually as building blocks to assemble your own diffusion system. Everything you need for inference or training is accessible with the `from_pretrained()` method. diff --git a/docs/source/en/using-diffusers/other-formats.mdx b/docs/source/en/using-diffusers/other-formats.mdx index 8e606f13469d..2aeb9f3ae204 100644 --- a/docs/source/en/using-diffusers/other-formats.mdx +++ b/docs/source/en/using-diffusers/other-formats.mdx @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Load different Stable Diffusion formats +[[open-in-colab]] + Stable Diffusion models are available in different formats depending on the framework they're trained and saved with, and where you download them from. Converting these formats for use in 🤗 Diffusers allows you to use all the features supported by the library, such as [using different schedulers](schedulers) for inference, [building your custom pipeline](write_own_pipeline), and a variety of techniques and methods for [optimizing inference speed](./optimization/opt_overview). @@ -141,8 +143,9 @@ pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.conf Download a LoRA checkpoint from Civitai; this example uses the [Howls Moving Castle,Interior/Scenery LoRA (Ghibli Stlye)](https://civitai.com/models/14605?modelVersionId=19998) checkpoint, but feel free to try out any LoRA checkpoint! -```bash -!wget https://civitai.com/api/download/models/19998 -O howls_moving_castle.safetensors +```py +# uncomment to download the safetensor weights +#!wget https://civitai.com/api/download/models/19998 -O howls_moving_castle.safetensors ``` Load the LoRA checkpoint into the pipeline with the [`~loaders.LoraLoaderMixin.load_lora_weights`] method: diff --git a/docs/source/en/using-diffusers/reproducibility.mdx b/docs/source/en/using-diffusers/reproducibility.mdx index b666dac72cbf..1594e967c847 100644 --- a/docs/source/en/using-diffusers/reproducibility.mdx +++ b/docs/source/en/using-diffusers/reproducibility.mdx @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Create reproducible pipelines +[[open-in-colab]] + Reproducibility is important for testing, replicating results, and can even be used to [improve image quality](reusing_seeds). However, the randomness in diffusion models is a desired property because it allows the pipeline to generate different images every time it is run. While you can't expect to get the exact same results across platforms, you can expect results to be reproducible across releases and platforms within a certain tolerance range. Even then, tolerance varies depending on the diffusion pipeline and checkpoint. This is why it's important to understand how to control sources of randomness in diffusion models or use deterministic algorithms. diff --git a/docs/source/en/using-diffusers/reusing_seeds.mdx b/docs/source/en/using-diffusers/reusing_seeds.mdx index eea0fd7e3e9d..1ff84f02596e 100644 --- a/docs/source/en/using-diffusers/reusing_seeds.mdx +++ b/docs/source/en/using-diffusers/reusing_seeds.mdx @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. 
# Improve image quality with deterministic generation +[[open-in-colab]] + A common way to improve the quality of generated images is with *deterministic batch generation*, generate a batch of images and select one image to improve with a more detailed prompt in a second round of inference. The key is to pass a list of [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html#generator)'s to the pipeline for batched image generation, and tie each `Generator` to a seed so you can reuse it for an image. Let's use [`runwayml/stable-diffusion-v1-5`](runwayml/stable-diffusion-v1-5) for example, and generate several versions of the following prompt: diff --git a/docs/source/en/using-diffusers/schedulers.mdx b/docs/source/en/using-diffusers/schedulers.mdx index 741d92bdd90d..c2395c106c15 100644 --- a/docs/source/en/using-diffusers/schedulers.mdx +++ b/docs/source/en/using-diffusers/schedulers.mdx @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Schedulers +[[open-in-colab]] + Diffusion pipelines are inherently a collection of diffusion models and schedulers that are partly independent from each other. This means that one is able to switch out parts of the pipeline to better customize a pipeline to one's use case. The best example of this is the [Schedulers](../api/schedulers/overview.mdx). diff --git a/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.mdx b/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.mdx index e0332fdc6496..2150f2f769fd 100644 --- a/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.mdx +++ b/docs/source/en/using-diffusers/stable_diffusion_jax_how_to.mdx @@ -14,9 +14,10 @@ Note that JAX is not exclusive to TPUs, but it shines on that hardware because e First make sure diffusers is installed. -```bash -!pip install jax==0.3.25 jaxlib==0.3.25 flax transformers ftfy -!pip install diffusers +```py +# uncomment to install the necessary libraries in Colab +#!pip install jax==0.3.25 jaxlib==0.3.25 flax transformers ftfy +#!pip install diffusers ``` ```python diff --git a/docs/source/en/using-diffusers/using_safetensors.mdx b/docs/source/en/using-diffusers/using_safetensors.mdx index 2015f2faf85a..c312ab597075 100644 --- a/docs/source/en/using-diffusers/using_safetensors.mdx +++ b/docs/source/en/using-diffusers/using_safetensors.mdx @@ -1,11 +1,14 @@ # Load safetensors +[[open-in-colab]] + [safetensors](https://github.com/huggingface/safetensors) is a safe and fast file format for storing and loading tensors. Typically, PyTorch model weights are saved or *pickled* into a `.bin` file with Python's [`pickle`](https://docs.python.org/3/library/pickle.html) utility. However, `pickle` is not secure and pickled files may contain malicious code that can be executed. safetensors is a secure alternative to `pickle`, making it ideal for sharing model weights. This guide will show you how you load `.safetensor` files, and how to convert Stable Diffusion model weights stored in other formats to `.safetensor`. Before you start, make sure you have safetensors installed: -```bash -!pip install safetensors +```py +# uncomment to install the necessary libraries in Colab +#!pip install safetensors ``` If you look at the [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5/tree/main) repository, you'll see weights inside the `text_encoder`, `unet` and `vae` subfolders are stored in the `.safetensors` format. 
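The deterministic batch generation idea from `reusing_seeds.mdx` above comes down to seeding one `torch.Generator` per image, so any single image in the batch can be regenerated later from its seed. A minimal sketch, assuming the checkpoint and prompt from that guide:

```py
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
prompt = "Labrador in the style of Vermeer"

# One seeded generator per image: reuse generator i (same seed) to reproduce image i.
generators = [torch.Generator(device="cpu").manual_seed(seed) for seed in range(4)]
images = pipe(prompt, generator=generators, num_images_per_prompt=4).images
```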
By default, 🤗 Diffusers automatically loads these `.safetensors` files from their subfolders if they're available in the model repository. diff --git a/docs/source/en/using-diffusers/weighted_prompts.mdx b/docs/source/en/using-diffusers/weighted_prompts.mdx index 58e670fbafe9..5e6371d0116a 100644 --- a/docs/source/en/using-diffusers/weighted_prompts.mdx +++ b/docs/source/en/using-diffusers/weighted_prompts.mdx @@ -12,6 +12,8 @@ specific language governing permissions and limitations under the License. # Weighting prompts +[[open-in-colab]] + Text-guided diffusion models generate images based on a given text prompt. The text prompt can include multiple concepts that the model should generate and it's often desirable to weight certain parts of the prompt more or less. diff --git a/docs/source/en/using-diffusers/write_own_pipeline.mdx b/docs/source/en/using-diffusers/write_own_pipeline.mdx index be92980118b1..c7e257f4fa36 100644 --- a/docs/source/en/using-diffusers/write_own_pipeline.mdx +++ b/docs/source/en/using-diffusers/write_own_pipeline.mdx @@ -42,63 +42,63 @@ To recreate the pipeline with the model and scheduler separately, let's write ou 1. Load the model and scheduler: - ```py - >>> from diffusers import DDPMScheduler, UNet2DModel +```py +>>> from diffusers import DDPMScheduler, UNet2DModel - >>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256") - >>> model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda") - ``` +>>> scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256") +>>> model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda") +``` 2. Set the number of timesteps to run the denoising process for: - ```py - >>> scheduler.set_timesteps(50) - ``` +```py +>>> scheduler.set_timesteps(50) +``` 3. Setting the scheduler timesteps creates a tensor with evenly spaced elements in it, 50 in this example. Each element corresponds to a timestep at which the model denoises an image. When you create the denoising loop later, you'll iterate over this tensor to denoise an image: - ```py - >>> scheduler.timesteps - tensor([980, 960, 940, 920, 900, 880, 860, 840, 820, 800, 780, 760, 740, 720, - 700, 680, 660, 640, 620, 600, 580, 560, 540, 520, 500, 480, 460, 440, - 420, 400, 380, 360, 340, 320, 300, 280, 260, 240, 220, 200, 180, 160, - 140, 120, 100, 80, 60, 40, 20, 0]) - ``` +```py +>>> scheduler.timesteps +tensor([980, 960, 940, 920, 900, 880, 860, 840, 820, 800, 780, 760, 740, 720, + 700, 680, 660, 640, 620, 600, 580, 560, 540, 520, 500, 480, 460, 440, + 420, 400, 380, 360, 340, 320, 300, 280, 260, 240, 220, 200, 180, 160, + 140, 120, 100, 80, 60, 40, 20, 0]) +``` 4. Create some random noise with the same shape as the desired output: - ```py - >>> import torch +```py +>>> import torch - >>> sample_size = model.config.sample_size - >>> noise = torch.randn((1, 3, sample_size, sample_size)).to("cuda") - ``` +>>> sample_size = model.config.sample_size +>>> noise = torch.randn((1, 3, sample_size, sample_size)).to("cuda") +``` -4. Now write a loop to iterate over the timesteps. At each timestep, the model does a [`UNet2DModel.forward`] pass and returns the noisy residual. The scheduler's [`~DDPMScheduler.step`] method takes the noisy residual, timestep, and input and it predicts the image at the previous timestep. This output becomes the next input to the model in the denoising loop, and it'll repeat until it reaches the end of the `timesteps` array. +5. Now write a loop to iterate over the timesteps. 
At each timestep, the model does a [`UNet2DModel.forward`] pass and returns the noisy residual. The scheduler's [`~DDPMScheduler.step`] method takes the noisy residual, timestep, and input and it predicts the image at the previous timestep. This output becomes the next input to the model in the denoising loop, and it'll repeat until it reaches the end of the `timesteps` array. - ```py - >>> input = noise +```py +>>> input = noise - >>> for t in scheduler.timesteps: - ... with torch.no_grad(): - ... noisy_residual = model(input, t).sample - ... previous_noisy_sample = scheduler.step(noisy_residual, t, input).prev_sample - ... input = previous_noisy_sample - ``` +>>> for t in scheduler.timesteps: +... with torch.no_grad(): +... noisy_residual = model(input, t).sample +... previous_noisy_sample = scheduler.step(noisy_residual, t, input).prev_sample +... input = previous_noisy_sample +``` - This is the entire denoising process, and you can use this same pattern to write any diffusion system. +This is the entire denoising process, and you can use this same pattern to write any diffusion system. -5. The last step is to convert the denoised output into an image: +6. The last step is to convert the denoised output into an image: - ```py - >>> from PIL import Image - >>> import numpy as np +```py +>>> from PIL import Image +>>> import numpy as np - >>> image = (input / 2 + 0.5).clamp(0, 1) - >>> image = image.cpu().permute(0, 2, 3, 1).numpy()[0] - >>> image = Image.fromarray((image * 255).round().astype("uint8")) - >>> image - ``` +>>> image = (input / 2 + 0.5).clamp(0, 1) +>>> image = image.cpu().permute(0, 2, 3, 1).numpy()[0] +>>> image = Image.fromarray((image * 255).round().astype("uint8")) +>>> image +``` In the next section, you'll put your skills to the test and breakdown the more complex Stable Diffusion pipeline. The steps are more or less the same. You'll initialize the necessary components, and set the number of timesteps to create a `timestep` array. The `timestep` array is used in the denoising loop, and for each element in this array, the model predicts a less noisy image. The denoising loop iterates over the `timestep`'s, and at each timestep, it outputs a noisy residual and the scheduler uses it to predict a less noisy image at the previous timestep. This process is repeated until you reach the end of the `timestep` array. From 027a365a62ef971bb956b10a0ae3538ca3826b4d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 15 Jun 2023 07:43:07 +0530 Subject: [PATCH 123/199] [Bug Report template] modify the issue template to include core maintainers. (#3785) * modify the issue template to include core maintainers. * add: entry for audio. * Update .github/ISSUE_TEMPLATE/bug-report.yml Co-authored-by: Patrick von Platen --------- Co-authored-by: Patrick von Platen --- .github/ISSUE_TEMPLATE/bug-report.yml | 29 +++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index a7a8ede4dd20..815036358953 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -49,3 +49,32 @@ body: placeholder: diffusers version, platform, python version, ... validations: required: true + - type: textarea + id: who-can-help + attributes: + label: Who can help? 
+ description: | + Your issue will be replied to more quickly if you can figure out the right person to tag with @ + If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**. + + All issues are read by one of the core maintainers, so if you don't know who to tag, just leave this blank and + a core maintainer will ping the right person. + + Please tag fewer than 3 people. + + General library related questions: @patrickvonplaten and @sayakpaul + + Questions on the training examples: @williamberman, @sayakpaul, @yiyixuxu + + Questions on memory optimizations, LoRA, float16, etc.: @williamberman, @patrickvonplaten, and @sayakpaul + + Questions on schedulers: @patrickvonplaten and @williamberman + + Questions on models and pipelines: @patrickvonplaten, @sayakpaul, and @williamberman + + Questions on JAX- and MPS-related things: @pcuenca + + Questions on audio pipelines: @patrickvonplaten, @kashif, and @sanchit-gandhi + + Documentation: @stevhliu and @yiyixuxu + placeholder: "@Username ..." From 98e13816d91ed7b78b2e7386248ad3c4494166a2 Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:27:36 +0530 Subject: [PATCH 124/199] Renaming ema model to target --- .../train_consistency_distillation.py | 73 +++++++------------ 1 file changed, 28 insertions(+), 45 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index b664cbe8bb39..375ac4ab0466 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -185,11 +185,6 @@ def parse_args(): "--adam_weight_decay", type=float, default=1e-6, help="Weight decay magnitude for the Adam optimizer." 
) parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer.") - parser.add_argument( - "--use_ema", - action="store_true", - help="Whether to use Exponential Moving Average for the final model weights.", - ) parser.add_argument("--ema_inv_gamma", type=float, default=1.0, help="The inverse gamma value for the EMA decay.") parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.") parser.add_argument("--ema_max_decay", type=float, default=0.9999, help="The maximum decay magnitude for EMA.") @@ -314,8 +309,7 @@ def main(args): if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format def save_model_hook(models, weights, output_dir): - if args.use_ema: - ema_model.save_pretrained(os.path.join(output_dir, "unet_ema")) + target_model_ema.save_pretrained(os.path.join(output_dir, "unet_ema")) for i, model in enumerate(models): model.save_pretrained(os.path.join(output_dir, "unet")) @@ -324,11 +318,10 @@ def save_model_hook(models, weights, output_dir): weights.pop() def load_model_hook(models, input_dir): - if args.use_ema: - load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel) - ema_model.load_state_dict(load_model.state_dict()) - ema_model.to(accelerator.device) - del load_model + load_model = EMAModel.from_pretrained(os.path.join(input_dir, "unet_ema"), UNet2DModel) + target_model_ema.load_state_dict(load_model.state_dict()) + target_model_ema.to(accelerator.device) + del load_model for i in range(len(models)): # pop models so that they are not loaded again @@ -429,20 +422,18 @@ def load_model_hook(models, input_dir): num_scales = 40 noise_scheduler.set_timesteps(num_scales) timesteps = noise_scheduler.timesteps - # print(teacher_model) # Create EMA for the model, this is the target model in the paper - if args.use_ema: - ema_model = EMAModel( - model.parameters(), - decay=args.ema_max_decay, - use_ema_warmup=True, - inv_gamma=args.ema_inv_gamma, - power=args.ema_power, - model_cls=UNet2DModel, - model_config=model.config, - ) + target_model_ema = EMAModel( + model.parameters(), + decay=args.ema_max_decay, + use_ema_warmup=True, + inv_gamma=args.ema_inv_gamma, + power=args.ema_power, + model_cls=UNet2DModel, + model_config=model.config, + ) if args.enable_xformers_memory_efficient_attention: if is_xformers_available(): @@ -514,12 +505,11 @@ def transform_images(examples): ) # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, lr_scheduler, teacher_model, target_model, ema_model = accelerator.prepare( - model, optimizer, train_dataloader, lr_scheduler, teacher_model, target_model, ema_model + model, optimizer, train_dataloader, lr_scheduler, teacher_model, target_model, target_model_ema = accelerator.prepare( + model, optimizer, train_dataloader, lr_scheduler, teacher_model, target_model, target_model_ema ) - if args.use_ema: - ema_model.to(accelerator.device) + target_model_ema.to(accelerator.device) # We need to initialize the trackers we use, and also store our configuration. # The trackers initializes automatically on the main process. 
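The `target_model_ema` introduced above is a running exponential moving average of the online model's weights: `step()` nudges the averaged copy toward the current weights after each optimizer update, and `copy_to()` writes the average into the target network used for the distillation loss. Conceptually, ignoring `EMAModel`'s warmup and decay scheduling, the update is just a parameter-wise interpolation, as in this sketch:

```py
import torch


@torch.no_grad()
def ema_update(target_params, online_params, decay=0.9999):
    # target <- decay * target + (1 - decay) * online, applied parameter-wise.
    for target, online in zip(target_params, online_params):
        target.mul_(decay).add_(online, alpha=1.0 - decay)


# After each optimizer step on `model`, refresh the distillation target:
# ema_update(target_model.parameters(), model.parameters())
```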
@@ -583,7 +573,6 @@ def transform_images(examples): labels = batch["labels"] # Sample noise that we'll add to the images noise = torch.randn(clean_images.shape).to(clean_images.device) - bsz = clean_images.shape[0] # Sample a random timestep for each image, TODO - allow different timesteps in a batch index = torch.randint( 0, noise_scheduler.config.num_train_timesteps-1, (1,), device=clean_images.device @@ -594,7 +583,7 @@ def transform_images(examples): noised_image = clean_images + noise*append_dims(timestep, clean_images.ndim) scaled_timesteps = noise_scheduler.scale_timestep(timestep) scaled_timesteps_prev = noise_scheduler.scale_timestep(timestep_prev) - ema_model.copy_to(target_model.parameters()) + target_model_ema.copy_to(target_model.parameters()) with accelerator.accumulate(model): # Predict the noise residual @@ -603,7 +592,7 @@ def transform_images(examples): model_output, timestep, noised_image, use_noise=False ).prev_sample - # Heun Solver to get previous timestep image + # Heun Solver to get previous timestep image using teacher model samples = noised_image x = samples model_output = teacher_model(x, scaled_timesteps, class_labels=labels).sample @@ -626,7 +615,7 @@ def transform_images(examples): model_output, timestep_prev, denoised_image, use_noise=False ).prev_sample - loss = F.mse_loss(distiller, distiller_target) # this could have different weights! + loss = F.mse_loss(distiller, distiller_target) loss = loss.mean() accelerator.backward(loss) @@ -639,8 +628,7 @@ def transform_images(examples): # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: - if args.use_ema: - ema_model.step(model.parameters()) + target_model_ema.step(model.parameters()) progress_bar.update(1) global_step += 1 @@ -651,8 +639,7 @@ def transform_images(examples): logger.info(f"Saved state to {save_path}") logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step} - if args.use_ema: - logs["ema_decay"] = ema_model.cur_decay_value + logs["ema_decay"] = target_model_ema.cur_decay_value progress_bar.set_postfix(**logs) accelerator.log(logs, step=global_step) progress_bar.close() @@ -664,9 +651,8 @@ def transform_images(examples): if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1: unet = accelerator.unwrap_model(model) - if args.use_ema: - ema_model.store(unet.parameters()) - ema_model.copy_to(unet.parameters()) + target_model_ema.store(unet.parameters()) + target_model_ema.copy_to(unet.parameters()) pipeline = ConsistencyModelPipeline( unet=unet, @@ -682,8 +668,7 @@ def transform_images(examples): output_type="numpy", ).images - if args.use_ema: - ema_model.restore(unet.parameters()) + target_model_ema.restore(unet.parameters()) # denormalize the images and save to tensorboard images_processed = (images * 255).round().astype("uint8") @@ -705,9 +690,8 @@ def transform_images(examples): # save the model unet = accelerator.unwrap_model(model) - if args.use_ema: - ema_model.store(unet.parameters()) - ema_model.copy_to(unet.parameters()) + target_model_ema.store(unet.parameters()) + target_model_ema.copy_to(unet.parameters()) pipeline = ConsistencyModelPipeline( unet=unet, @@ -716,8 +700,7 @@ def transform_images(examples): pipeline.save_pretrained(args.output_dir) - if args.use_ema: - ema_model.restore(unet.parameters()) + target_model_ema.restore(unet.parameters()) if args.push_to_hub: repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False) From 
baefc87c8a305ace0e26ae9a8a51bbb14efd7bf8 Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:39:59 +0530 Subject: [PATCH 125/199] Add some comments --- .../train_consistency_distillation.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index 375ac4ab0466..a6d131bf7290 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -235,7 +235,7 @@ def parse_args(): type=int, default=500, help=( - "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" + "Save a checkpoint of the training state every X updates. These checkpoints are only suitable for resuming" " training using `--resume_from_checkpoint`." ), ) @@ -369,7 +369,7 @@ def load_model_hook(models, input_dir): elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) - # Initialize the model + # Initialize the model, using a smaller model than the one defined in the original paper by default if args.model_config_name_or_path is None: model = UNet2DModel( sample_size= args.resolution, @@ -414,9 +414,10 @@ def load_model_hook(models, input_dir): model = UNet2DModel.from_config(config) target_model = UNet2DModel.from_config(config) + # load the model to distill into a consistency model teacher_model = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32").unet model = model.double() - target_model = target_model.double() + target_model = target_model.double() # TODO : support half precision training teacher_model = teacher_model.double() noise_scheduler = CMStochasticIterativeScheduler() num_scales = 40 @@ -579,7 +580,7 @@ def transform_images(examples): ).long() timestep = timesteps[index] timestep_prev = timestep + 1 - # TO-DO, we should have an add noise in the scheduler maybe? + # TODO, we should have an add noise in the scheduler maybe? 
noised_image = clean_images + noise*append_dims(timestep, clean_images.ndim) scaled_timesteps = noise_scheduler.scale_timestep(timestep) scaled_timesteps_prev = noise_scheduler.scale_timestep(timestep_prev) @@ -593,6 +594,7 @@ def transform_images(examples): ).prev_sample # Heun Solver to get previous timestep image using teacher model + # TODO - make this cleaner samples = noised_image x = samples model_output = teacher_model(x, scaled_timesteps, class_labels=labels).sample From 1ae15fa64c040673fea6688a405e2b2fc872d61e Mon Sep 17 00:00:00 2001 From: takuoko Date: Thu, 15 Jun 2023 21:34:12 +0900 Subject: [PATCH 126/199] [Enhance] Update reference (#3723) * update reference pipeline * update reference pipeline --------- Co-authored-by: Patrick von Platen --- .../stable_diffusion_controlnet_reference.py | 32 +++++++++++++------ .../community/stable_diffusion_reference.py | 15 +++++++++ 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/examples/community/stable_diffusion_controlnet_reference.py b/examples/community/stable_diffusion_controlnet_reference.py index ca06136d7829..f52da6f5a193 100644 --- a/examples/community/stable_diffusion_controlnet_reference.py +++ b/examples/community/stable_diffusion_controlnet_reference.py @@ -1,6 +1,7 @@ # Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 and https://github.com/Mikubill/sd-webui-controlnet/discussions/1280 from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import numpy as np import PIL.Image import torch @@ -97,7 +98,14 @@ def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do def __call__( self, prompt: Union[str, List[str]] = None, - image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None, height: Optional[int] = None, width: Optional[int] = None, @@ -130,8 +138,8 @@ def __call__( prompt (`str` or `List[str]`, *optional*): The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. instead. - image (`torch.FloatTensor`, `PIL.Image.Image`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, - `List[List[torch.FloatTensor]]`, or `List[List[PIL.Image.Image]]`): + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): The ControlNet input condition. ControlNet uses this input condition to generate guidance to Unet. If the type is specified as `Torch.FloatTensor`, it is passed to ControlNet as is. `PIL.Image.Image` can also be accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If @@ -223,15 +231,12 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ - # 0. Default height and width to unet - height, width = self._default_height_width(height, width, image) + assert reference_attn or reference_adain, "`reference_attn` or `reference_adain` must be True." # 1. Check inputs. 
Raise error if not correct self.check_inputs( prompt, image, - height, - width, callback_steps, negative_prompt, prompt_embeds, @@ -266,6 +271,9 @@ def __call__( guess_mode = guess_mode or global_pool_conditions # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -274,6 +282,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Prepare image @@ -289,6 +298,7 @@ def __call__( do_classifier_free_guidance=do_classifier_free_guidance, guess_mode=guess_mode, ) + height, width = image.shape[-2:] elif isinstance(controlnet, MultiControlNetModel): images = [] @@ -308,6 +318,7 @@ def __call__( images.append(image_) image = images + height, width = image[0].shape[-2:] else: assert False @@ -720,14 +731,15 @@ def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb= # controlnet(s) inference if guess_mode and do_classifier_free_guidance: # Infer ControlNet only for the conditional batch. - controlnet_latent_model_input = latents + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) controlnet_prompt_embeds = prompt_embeds.chunk(2)[1] else: - controlnet_latent_model_input = latent_model_input + control_model_input = latent_model_input controlnet_prompt_embeds = prompt_embeds down_block_res_samples, mid_block_res_sample = self.controlnet( - controlnet_latent_model_input, + control_model_input, t, encoder_hidden_states=controlnet_prompt_embeds, controlnet_cond=image, diff --git a/examples/community/stable_diffusion_reference.py b/examples/community/stable_diffusion_reference.py index dbfb768f8b4f..364d5d80d721 100644 --- a/examples/community/stable_diffusion_reference.py +++ b/examples/community/stable_diffusion_reference.py @@ -9,6 +9,7 @@ from diffusers.models.attention import BasicTransformerBlock from diffusers.models.unet_2d_blocks import CrossAttnDownBlock2D, CrossAttnUpBlock2D, DownBlock2D, UpBlock2D from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import rescale_noise_cfg from diffusers.utils import PIL_INTERPOLATION, logging, randn_tensor @@ -179,6 +180,7 @@ def __call__( callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, attention_auto_machine_weight: float = 1.0, gn_auto_machine_weight: float = 1.0, style_fidelity: float = 0.5, @@ -248,6 +250,11 @@ def __call__( A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under `self.processor` in [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + guidance_rescale (`float`, *optional*, defaults to 0.7): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. 
attention_auto_machine_weight (`float`): Weight of using reference query for self attention's context. If attention_auto_machine_weight=1.0, use reference query for all self attention's context. @@ -295,6 +302,9 @@ def __call__( do_classifier_free_guidance = guidance_scale > 1.0 # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) prompt_embeds = self._encode_prompt( prompt, device, @@ -303,6 +313,7 @@ def __call__( negative_prompt, prompt_embeds=prompt_embeds, negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, ) # 4. Preprocess reference image @@ -748,6 +759,10 @@ def hacked_UpBlock2D_forward(self, hidden_states, res_hidden_states_tuple, temb= noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] From 2715079344b725bdb045f601551dae02509e393e Mon Sep 17 00:00:00 2001 From: cmdr2 Date: Thu, 15 Jun 2023 18:26:40 +0530 Subject: [PATCH 127/199] Fix broken cpu-offloading in legacy inpainting SD pipeline (#3773) --- .../pipeline_stable_diffusion_inpaint_legacy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 147d914fe6c1..55d571ab0998 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -548,7 +548,7 @@ def get_timesteps(self, num_inference_steps, strength, device): return timesteps, num_inference_steps - t_start def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, device, generator): - image = image.to(device=self.device, dtype=dtype) + image = image.to(device=device, dtype=dtype) init_latent_dist = self.vae.encode(image).latent_dist init_latents = init_latent_dist.sample(generator=generator) init_latents = self.vae.config.scaling_factor * init_latents @@ -558,7 +558,7 @@ def prepare_latents(self, image, timestep, num_images_per_prompt, dtype, device, init_latents_orig = init_latents # add noise to latents using the timesteps - noise = randn_tensor(init_latents.shape, generator=generator, device=self.device, dtype=dtype) + noise = randn_tensor(init_latents.shape, generator=generator, device=device, dtype=dtype) init_latents = self.scheduler.add_noise(init_latents, noise, timestep) latents = init_latents return latents, init_latents_orig, noise @@ -710,7 +710,7 @@ def __call__( ) # 7. Prepare mask latent - mask = mask_image.to(device=self.device, dtype=latents.dtype) + mask = mask_image.to(device=device, dtype=latents.dtype) mask = torch.cat([mask] * num_images_per_prompt) # 8. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline From 908e5e9cc6d9b02016050010a33ec56bd1151c6f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Jun 2023 15:07:51 +0200 Subject: [PATCH 128/199] Fix some bad comment in training scripts (#3798) * relax tolerance slightly * correct incorrect naming --- examples/dreambooth/train_dreambooth.py | 4 ++-- examples/dreambooth/train_dreambooth_lora.py | 4 ++-- examples/text_to_image/train_text_to_image.py | 4 ++-- examples/text_to_image/train_text_to_image_lora.py | 4 ++-- examples/textual_inversion/textual_inversion.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 7f6c27dec54c..695b0a0423a6 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -1092,8 +1092,8 @@ def compute_text_embeddings(prompt): unet, optimizer, train_dataloader, lr_scheduler ) - # For mixed precision training we cast the text_encoder and vae weights to half-precision - # as these models are only used for inference, keeping weights in full precision is not required. + # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index 1c64523caf45..b4f099fc2f58 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -790,8 +790,8 @@ def main(args): text_encoder.requires_grad_(False) unet.requires_grad_(False) - # For mixed precision training we cast the text_encoder and vae weights to half-precision - # as these models are only used for inference, keeping weights in full precision is not required. + # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index f4ce1d96a82c..3fe72b90b24a 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -747,8 +747,8 @@ def collate_fn(examples): if args.use_ema: ema_unet.to(accelerator.device) - # For mixed precision training we cast the text_encoder and vae weights to half-precision - # as these models are only used for inference, keeping weights in full precision is not required. + # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. 
weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 990695fa0ece..4a39f37a2896 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -430,8 +430,8 @@ def main(): text_encoder.requires_grad_(False) - # For mixed precision training we cast the text_encoder and vae weights to half-precision - # as these models are only used for inference, keeping weights in full precision is not required. + # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index d5988a9b1707..8c44247a75b5 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -752,8 +752,8 @@ def main(): text_encoder, optimizer, train_dataloader, lr_scheduler ) - # For mixed precision training we cast the unet and vae weights to half-precision - # as these models are only used for inference, keeping weights in full precision is not required. + # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 From 75124fc91ebb6007b364e37ab3589e66d3566355 Mon Sep 17 00:00:00 2001 From: Arpan Tripathi <42506819+tripathiarpan20@users.noreply.github.com> Date: Thu, 15 Jun 2023 14:09:44 +0100 Subject: [PATCH 129/199] Added LoRA loading to `StableDiffusionKDiffusionPipeline` (#3751) Added `LoraLoaderMixin` to `StableDiffusionKDiffusionPipeline` --- .../stable_diffusion/pipeline_stable_diffusion_k_diffusion.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py index ab613dd4dfe4..e2800342e578 100755 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_k_diffusion.py @@ -45,7 +45,7 @@ def apply_model(self, *args, **kwargs): return self.model(*args, encoder_hidden_states=encoder_hidden_states, **kwargs).sample -class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin): +class StableDiffusionKDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. 
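With `LoraLoaderMixin` mixed into `StableDiffusionKDiffusionPipeline` above, the k-diffusion pipeline can now load LoRA weights the same way the standard pipeline does. A rough usage sketch, where the base checkpoint, LoRA path, and weight file name are placeholders:

```py
import torch
from diffusers import StableDiffusionKDiffusionPipeline

pipe = StableDiffusionKDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")
pipe.set_scheduler("sample_euler")  # pick a k-diffusion sampler

# Hypothetical LoRA checkpoint; any repo or file supported by load_lora_weights works.
pipe.load_lora_weights("path/to/lora", weight_name="pytorch_lora_weights.bin")
image = pipe("a fantasy landscape", num_inference_steps=25).images[0]
```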
From 231bdf2e560a57976749b55a941bcf3f311df6fc Mon Sep 17 00:00:00 2001 From: Naga Sai Abhinay Date: Thu, 15 Jun 2023 18:45:40 +0530 Subject: [PATCH 130/199] UnCLIP Image Interpolation -> Keep same initial noise across interpolation steps (#3782) * Maintain same decoder start noise for all interp steps * Correct comment * use batch_size for consistency --- examples/community/unclip_image_interpolation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/community/unclip_image_interpolation.py b/examples/community/unclip_image_interpolation.py index 453ac07af7c6..618ac25bdc95 100644 --- a/examples/community/unclip_image_interpolation.py +++ b/examples/community/unclip_image_interpolation.py @@ -376,14 +376,16 @@ def __call__( height = self.decoder.config.sample_size width = self.decoder.config.sample_size + # Get the decoder latents for 1 step and then repeat the same tensor for the entire batch to keep same noise across all interpolation steps. decoder_latents = self.prepare_latents( - (batch_size, num_channels_latents, height, width), + (1, num_channels_latents, height, width), text_encoder_hidden_states.dtype, device, generator, decoder_latents, self.decoder_scheduler, ) + decoder_latents = decoder_latents.repeat((batch_size, 1, 1, 1)) for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)): # expand the latents if we are doing classifier free guidance From 77f9137f103b04f137b6c487814ea58599ebc200 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 15 Jun 2023 19:41:54 +0530 Subject: [PATCH 131/199] feat: add PR template. (#3786) * feat: add PR template. * address pr comments. * Update .github/PULL_REQUEST_TEMPLATE.md Co-authored-by: Patrick von Platen --------- Co-authored-by: Patrick von Platen --- .github/PULL_REQUEST_TEMPLATE.md | 60 ++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 .github/PULL_REQUEST_TEMPLATE.md diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000000..05c211645330 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,60 @@ +# What does this PR do? + + + + + +Fixes # (issue) + + +## Before submitting +- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case). +- [ ] Did you read the [contributor guideline](https://github.com/huggingface/diffusers/blob/main/CONTRIBUTING.md)? +- [ ] Did you read our [philosophy doc](https://github.com/huggingface/diffusers/blob/main/PHILOSOPHY.md) (important for complex PRs)? +- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link to it if that's the case. +- [ ] Did you make sure to update the documentation with your changes? Here are the + [documentation guidelines](https://github.com/huggingface/diffusers/tree/main/docs), and + [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/main/docs#writing-source-documentation). +- [ ] Did you write any new necessary tests? + + +## Who can review? + +Anyone in the community is free to review the PR once the tests have passed. Feel free to tag +members/contributors who may be interested in your PR. 
+ + From 958d9ec72310b67eb3c4d3fed55219af857ae5d1 Mon Sep 17 00:00:00 2001 From: estelleafl Date: Thu, 15 Jun 2023 18:36:52 +0300 Subject: [PATCH 132/199] Ldm3d first PR (#3668) * added ldm3d pipeline and updated image processor to support depth * added description * added paper reference * added docs * fixed bug * added test * Update tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * added reference in indexmdx * reverted changes tto image processor' * added LDM3DOutput * Fixes with make style * fix failing tests for make fix-copies * aligned with our version * Update pipeline_stable_diffusion_ldm3d.py updated the guidance scale * Fix for failing check_code_quality test * Code review feedback * Fix typo in ldm3d_diffusion.mdx * updated the doc accordnlgy * copyrights * fixed test failure * make style * added image processor of LDM3D in the documentation: * added ldm3d doc to toctree * run make style && make quality * run make fix-copies * Update docs/source/en/api/image_processor.mdx Co-authored-by: Sayak Paul * Update docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx Co-authored-by: Sayak Paul * Update docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx Co-authored-by: Sayak Paul * updated the safety checker to accept tuple * make style and make quality * Update src/diffusers/pipelines/stable_diffusion/__init__.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py Co-authored-by: Patrick von Platen * LDM3D output * up --------- Co-authored-by: Patrick von Platen Co-authored-by: Aflalo Co-authored-by: Anahita Bhiwandiwalla Co-authored-by: Aflalo Co-authored-by: Aflalo Co-authored-by: Sayak Paul Co-authored-by: Aflalo Co-authored-by: Aflalo --- docs/source/en/_toctree.yml | 2 + 
docs/source/en/api/image_processor.mdx | 13 +- .../stable_diffusion/ldm3d_diffusion.mdx | 55 ++ docs/source/en/index.mdx | 1 + src/diffusers/__init__.py | 1 + src/diffusers/image_processor.py | 106 +++ src/diffusers/pipelines/__init__.py | 1 + .../pipelines/stable_diffusion/__init__.py | 1 + .../pipeline_stable_diffusion_ldm3d.py | 717 ++++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 15 + .../test_stable_diffusion_ldm3d.py | 290 +++++++ 11 files changed, 1201 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d6c753056044..d85f715203bc 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -221,6 +221,8 @@ title: Stable-Diffusion-Latent-Upscaler - local: api/pipelines/stable_diffusion/upscale title: Super-Resolution + - local: api/pipelines/stable_diffusion/ldm3d_diffusion + title: LDM3D Text-to-(RGB, Depth) title: Stable Diffusion - local: api/pipelines/stable_unclip title: Stable unCLIP diff --git a/docs/source/en/api/image_processor.mdx b/docs/source/en/api/image_processor.mdx index 1964df214f94..e2ed4ad48c19 100644 --- a/docs/source/en/api/image_processor.mdx +++ b/docs/source/en/api/image_processor.mdx @@ -17,6 +17,17 @@ Image processor provides a unified API for Stable Diffusion pipelines to prepare All pipelines with VAE image processor will accept image inputs in the format of PIL Image, PyTorch tensor, or Numpy array, and will able to return outputs in the format of PIL Image, Pytorch tensor, and Numpy array based on the `output_type` argument from the user. Additionally, the User can pass encoded image latents directly to the pipeline, or ask the pipeline to return latents as output with `output_type = 'pt'` argument. This allows you to take the generated latents from one pipeline and pass it to another pipeline as input, without ever having to leave the latent space. It also makes it much easier to use multiple pipelines together, by passing PyTorch tensors directly between different pipelines. +# Image Processor for VAE adapted to LDM3D + +LDM3D Image processor does the same as the Image processor for VAE but accepts both RGB and depth inputs and will return RGB and depth outputs. 
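Because the LDM3D processor returns depth maps alongside RGB images, the depth arrays are typically stored as 16-bit grayscale PNGs to preserve precision. A small illustrative helper, not the processor's actual API, for turning a normalized depth array into a savable PIL image:

```py
import numpy as np
from PIL import Image


def depth_to_pil(depth: np.ndarray) -> Image.Image:
    # Assumes a 2D float array scaled to [0, 1]; stretch it over the 16-bit range.
    depth_16bit = (depth * (2**16 - 1)).round().astype(np.uint16)
    return Image.fromarray(depth_16bit, mode="I;16")


# depth_to_pil(depth_array).save("depth.png")
```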
+ + + ## VaeImageProcessor -[[autodoc]] image_processor.VaeImageProcessor \ No newline at end of file +[[autodoc]] image_processor.VaeImageProcessor + + +## VaeImageProcessorLDM3D + +[[autodoc]] image_processor.VaeImageProcessorLDM3D \ No newline at end of file diff --git a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx new file mode 100644 index 000000000000..ca5798d93a8e --- /dev/null +++ b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx @@ -0,0 +1,55 @@ + + +# LDM3D + +LDM3D was proposed in [LDM3D: Latent Diffusion Model for 3D](https://arxiv.org/abs/2305.10853) by Gabriela Ben Melech Stan, Diana Wofk, Scottie Fox, Alex Redden, Will Saxton, Jean Yu, Estelle Aflalo, Shao-Yen Tseng, Fabio Nonato, Matthias Muller, Vasudev Lal +The abstract of the paper is the following: + +*This research paper proposes a Latent Diffusion Model for 3D (LDM3D) that generates both image and depth map data from a given text prompt, allowing users to generate RGBD images from text prompts. The LDM3D model is fine-tuned on a dataset of tuples containing an RGB image, depth map and caption, and validated through extensive experiments. We also develop an application called DepthFusion, which uses the generated RGB images and depth maps to create immersive and interactive 360-degree-view experiences using TouchDesigner. This technology has the potential to transform a wide range of industries, from entertainment and gaming to architecture and design. Overall, this paper presents a significant contribution to the field of generative AI and computer vision, and showcases the potential of LDM3D and DepthFusion to revolutionize content creation and digital experiences. A short video summarizing the approach can be found at [this url](https://t.ly/tdi2).* + + +*Overview*: + +| Pipeline | Tasks | Colab | Demo +|---|---|:---:|:---:| +| [pipeline_stable_diffusion_ldm3d.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py) | *Text-to-Image Generation* | - | - + +## Tips + +- LDM3D generates both an image and a depth map from a given text prompt, compared to the existing txt-to-img diffusion models such as [Stable Diffusion](./stable_diffusion/overview) that generates only an image. +- With almost the same number of parameters, LDM3D achieves to create a latent space that can compress both the RGB images and the depth maps. 
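Editorial aside (not part of the patch): the tips above note that LDM3D returns both an RGB image and a depth map. As a complement to the PIL-based snippet that follows, this sketch shows how to request raw numpy arrays instead, assuming the `Intel/ldm3d` checkpoint referenced in this PR is available.

```python
from diffusers import StableDiffusionLDM3DPipeline

pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d")
# pipe = pipe.to("cuda")  # optional: move to GPU if one is available

# With output_type="np", `rgb` has shape (batch, height, width, 3)
# and `depth` has shape (batch, height, width).
output = pipe("A picture of some lemons on a table", output_type="np")
rgb, depth = output.rgb, output.depth
print(rgb.shape, depth.shape)
```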
+ + +Running LDM3D is straighforward with the [`StableDiffusionLDM3DPipeline`]: + +```python +>>> from diffusers import StableDiffusionLDM3DPipeline + +>>> pipe_ldm3d = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d") +prompt ="A picture of some lemons on a table" +output = pipe_ldm3d(prompt) +rgb_image, depth_image = output.rgb, output.depth +rgb_image[0].save("lemons_ldm3d_rgb.jpg") +depth_image[0].save("lemons_ldm3d_depth.png") +``` + + +## StableDiffusionPipelineOutput +[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput + - all + - __call__ + +## StableDiffusionLDM3DPipeline +[[autodoc]] StableDiffusionLDM3DPipeline + - all + - __call__ diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 66548663827a..4673a92d5e91 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -94,3 +94,4 @@ The library has three main components: | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | | [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | +| [stable_diffusion_ldm3d](./api/pipelines/stable_diffusion/ldm3d_diffusion) | [LDM3D: Latent Diffusion Model for 3D](https://arxiv.org/abs/2305.10853) | Text to Image and Depth Generation | diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index a322903ab5d1..07a60946b8c5 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -149,6 +149,7 @@ StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, StableDiffusionLatentUpscalePipeline, + StableDiffusionLDM3DPipeline, StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, StableDiffusionPipeline, diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 17c083914753..4f3c61208539 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -251,3 +251,109 @@ def postprocess( if output_type == "pil": return self.numpy_to_pil(image) + + +class VaeImageProcessorLDM3D(VaeImageProcessor): + """ + Image Processor for VAE LDM3D. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. + vae_scale_factor (`int`, *optional*, defaults to `8`): + VAE scale factor. If `do_resize` is True, the image will be automatically resized to multiples of this + factor. + resample (`str`, *optional*, defaults to `lanczos`): + Resampling filter to use when resizing the image. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image to [-1,1] + """ + + config_name = CONFIG_NAME + + @register_to_config + def __init__( + self, + do_resize: bool = True, + vae_scale_factor: int = 8, + resample: str = "lanczos", + do_normalize: bool = True, + ): + super().__init__() + + @staticmethod + def numpy_to_pil(images): + """ + Convert a numpy image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] 
+ images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] + else: + pil_images = [Image.fromarray(image[:, :, :3]) for image in images] + + return pil_images + + @staticmethod + def rgblike_to_depthmap(image): + """ + Args: + image: RGB-like depth image + + Returns: depth map + + """ + return image[:, :, 1] * 2**8 + image[:, :, 2] + + def numpy_to_depth(self, images): + """ + Convert a numpy depth image or a batch of images to a PIL image. + """ + if images.ndim == 3: + images = images[None, ...] + images = (images * 255).round().astype("uint8") + if images.shape[-1] == 1: + # special case for grayscale (single channel) images + raise Exception("Not supported") + else: + pil_images = [Image.fromarray(self.rgblike_to_depthmap(image[:, :, 3:]), mode="I;16") for image in images] + + return pil_images + + def postprocess( + self, + image: torch.FloatTensor, + output_type: str = "pil", + do_denormalize: Optional[List[bool]] = None, + ): + if not isinstance(image, torch.Tensor): + raise ValueError( + f"Input for postprocessing is in incorrect format: {type(image)}. We only support pytorch tensor" + ) + if output_type not in ["latent", "pt", "np", "pil"]: + deprecation_message = ( + f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " + "`pil`, `np`, `pt`, `latent`" + ) + deprecate("Unsupported output_type", "1.0.0", deprecation_message, standard_warn=False) + output_type = "np" + + if do_denormalize is None: + do_denormalize = [self.config.do_normalize] * image.shape[0] + + image = torch.stack( + [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])] + ) + + image = self.pt_to_numpy(image) + + if output_type == "np": + return image[:, :, :, :3], np.stack([self.rgblike_to_depthmap(im[:, :, 3:]) for im in image], axis=0) + + if output_type == "pil": + return self.numpy_to_pil(image), self.numpy_to_depth(image) + else: + raise Exception(f"This type {output_type} is not supported") diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 9e68538f233c..42c7dc33970d 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -77,6 +77,7 @@ StableDiffusionInpaintPipelineLegacy, StableDiffusionInstructPix2PixPipeline, StableDiffusionLatentUpscalePipeline, + StableDiffusionLDM3DPipeline, StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, StableDiffusionPipeline, diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index f39ae67a9aff..cff7a765a7ef 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -50,6 +50,7 @@ class StableDiffusionPipelineOutput(BaseOutput): from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy from .pipeline_stable_diffusion_instruct_pix2pix import StableDiffusionInstructPix2PixPipeline from .pipeline_stable_diffusion_latent_upscale import StableDiffusionLatentUpscalePipeline + from .pipeline_stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline from .pipeline_stable_diffusion_model_editing import StableDiffusionModelEditingPipeline from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline from .pipeline_stable_diffusion_sag import 
StableDiffusionSAGPipeline diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py new file mode 100644 index 000000000000..c804d2f1918b --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py @@ -0,0 +1,717 @@ +# Copyright 2023 The Intel Labs Team Authors and the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from dataclasses import dataclass +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...image_processor import VaeImageProcessorLDM3D +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + BaseOutput, + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionPipeline + + >>> pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d") + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> output = pipe_ldm3d(prompt) + >>> rgb_image, depth_image = output.rgb, output.depth + ``` +""" + + +@dataclass +class LDM3DPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + nsfw_content_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, or `None` if safety checking could not be performed. + """ + + rgb: Union[List[PIL.Image.Image], np.ndarray] + depth: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] + + +class StableDiffusionLDM3DPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): + r""" + Pipeline for text-to-image and 3d generation using LDM3D. LDM3D: Latent Diffusion Model for 3D: + https://arxiv.org/abs/2305.10853 + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode rgb and depth images to and from latent + representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded rgb and depth latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. + + When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in + several steps. This is useful to save a large amount of memory and to allow the processing of larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + rgb_feature_extractor_input = feature_extractor_input[0] + safety_checker_input = self.feature_extractor(rgb_feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 49, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 5.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. 
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + rgb, depth = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return ((rgb, depth), has_nsfw_concept) + + return LDM3DPipelineOutput(rgb=rgb, depth=depth, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 95d07c081ccd..c7a6ac79efbd 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -452,6 +452,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class StableDiffusionLDM3DPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusionModelEditingPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py 
b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py new file mode 100644 index 000000000000..933e4307a41b --- /dev/null +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py @@ -0,0 +1,290 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import gc +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + PNDMScheduler, + StableDiffusionLDM3DPipeline, + UNet2DConditionModel, +) +from diffusers.utils import nightly, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS + + +enable_full_determinism() + + +class StableDiffusionLDM3DPipelineFastTests(unittest.TestCase): + pipeline_class = StableDiffusionLDM3DPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=6, + out_channels=6, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_ddim(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + + components = self.get_dummy_components() + ldm3d_pipe = 
StableDiffusionLDM3DPipeline(**components) + ldm3d_pipe = ldm3d_pipe.to(torch_device) + ldm3d_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + output = ldm3d_pipe(**inputs) + rgb, depth = output.rgb, output.depth + + image_slice_rgb = rgb[0, -3:, -3:, -1] + image_slice_depth = depth[0, -3:, -1] + + assert rgb.shape == (1, 64, 64, 3) + assert depth.shape == (1, 64, 64) + + expected_slice_rgb = np.array( + [0.37301102, 0.7023895, 0.7418312, 0.5163375, 0.5825485, 0.60929704, 0.4188174, 0.48407027, 0.46555096] + ) + expected_slice_depth = np.array([103.4673, 85.81202, 87.84926]) + + assert np.abs(image_slice_rgb.flatten() - expected_slice_rgb).max() < 1e-2 + assert np.abs(image_slice_depth.flatten() - expected_slice_depth).max() < 1e-2 + + def test_stable_diffusion_prompt_embeds(self): + components = self.get_dummy_components() + ldm3d_pipe = StableDiffusionLDM3DPipeline(**components) + ldm3d_pipe = ldm3d_pipe.to(torch_device) + ldm3d_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(torch_device) + inputs["prompt"] = 3 * [inputs["prompt"]] + + # forward + output = ldm3d_pipe(**inputs) + rgb_slice_1, depth_slice_1 = output.rgb, output.depth + rgb_slice_1 = rgb_slice_1[0, -3:, -3:, -1] + depth_slice_1 = depth_slice_1[0, -3:, -1] + + inputs = self.get_dummy_inputs(torch_device) + prompt = 3 * [inputs.pop("prompt")] + + text_inputs = ldm3d_pipe.tokenizer( + prompt, + padding="max_length", + max_length=ldm3d_pipe.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_inputs = text_inputs["input_ids"].to(torch_device) + + prompt_embeds = ldm3d_pipe.text_encoder(text_inputs)[0] + + inputs["prompt_embeds"] = prompt_embeds + + # forward + output = ldm3d_pipe(**inputs) + rgb_slice_2, depth_slice_2 = output.rgb, output.depth + rgb_slice_2 = rgb_slice_2[0, -3:, -3:, -1] + depth_slice_2 = depth_slice_2[0, -3:, -1] + + assert np.abs(rgb_slice_1.flatten() - rgb_slice_2.flatten()).max() < 1e-4 + assert np.abs(depth_slice_1.flatten() - depth_slice_2.flatten()).max() < 1e-4 + + def test_stable_diffusion_negative_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + components["scheduler"] = PNDMScheduler(skip_prk_steps=True) + ldm3d_pipe = StableDiffusionLDM3DPipeline(**components) + ldm3d_pipe = ldm3d_pipe.to(device) + ldm3d_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + negative_prompt = "french fries" + output = ldm3d_pipe(**inputs, negative_prompt=negative_prompt) + + rgb, depth = output.rgb, output.depth + rgb_slice = rgb[0, -3:, -3:, -1] + depth_slice = depth[0, -3:, -1] + + assert rgb.shape == (1, 64, 64, 3) + assert depth.shape == (1, 64, 64) + + expected_slice_rgb = np.array( + [0.37044, 0.71811503, 0.7223251, 0.48603675, 0.5638391, 0.6364948, 0.42833704, 0.4901315, 0.47926217] + ) + expected_slice_depth = np.array([107.84738, 84.62802, 89.962135]) + assert np.abs(rgb_slice.flatten() - expected_slice_rgb).max() < 1e-2 + assert np.abs(depth_slice.flatten() - expected_slice_depth).max() < 1e-2 + + +@slow +@require_torch_gpu +class StableDiffusionLDM3DPipelineSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + latents = np.random.RandomState(seed).standard_normal((1, 4, 
64, 64)) + latents = torch.from_numpy(latents).to(device=device, dtype=dtype) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "latents": latents, + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "numpy", + } + return inputs + + def test_ldm3d_stable_diffusion(self): + ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d") + ldm3d_pipe = ldm3d_pipe.to(torch_device) + ldm3d_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + output = ldm3d_pipe(**inputs) + rgb, depth = output.rgb, output.depth + rgb_slice = rgb[0, -3:, -3:, -1].flatten() + depth_slice = rgb[0, -3:, -1].flatten() + + assert rgb.shape == (1, 512, 512, 3) + assert depth.shape == (1, 512, 512) + + expected_slice_rgb = np.array( + [0.53805465, 0.56707305, 0.5486515, 0.57012236, 0.5814511, 0.56253487, 0.54843014, 0.55092263, 0.6459706] + ) + expected_slice_depth = np.array( + [0.9263781, 0.6678672, 0.5486515, 0.92202145, 0.67831135, 0.56253487, 0.9241694, 0.7551478, 0.6459706] + ) + assert np.abs(rgb_slice - expected_slice_rgb).max() < 3e-3 + assert np.abs(depth_slice - expected_slice_depth).max() < 3e-3 + + +@nightly +@require_torch_gpu +class StableDiffusionPipelineNightlyTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) + latents = torch.from_numpy(latents).to(device=device, dtype=dtype) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "latents": latents, + "generator": generator, + "num_inference_steps": 50, + "guidance_scale": 7.5, + "output_type": "numpy", + } + return inputs + + def test_ldm3d(self): + ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d").to(torch_device) + ldm3d_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + output = ldm3d_pipe(**inputs) + rgb, depth = output.rgb, output.depth + + expected_rgb_mean = 0.54461557 + expected_rgb_std = 0.2806707 + expected_depth_mean = 143.64595 + expected_depth_std = 83.491776 + assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3 + assert np.abs(expected_rgb_std - rgb.std()) < 1e-3 + assert np.abs(expected_depth_mean - depth.mean()) < 1e-3 + assert np.abs(expected_depth_std - depth.std()) < 1e-3 From ea8ae8c6397d8333760471e573e4d8ca4646efd0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 15 Jun 2023 17:42:49 +0200 Subject: [PATCH 133/199] Complete set_attn_processor for prior and vae (#3796) * relax tolerance slightly * Add more tests * upload readme * upload readme * Apply suggestions from code review * Improve API Autoencoder KL * finalize * finalize tests * finalize tests * Apply suggestions from code review Co-authored-by: Sayak Paul * up --------- Co-authored-by: Sayak Paul --- src/diffusers/models/autoencoder_kl.py | 66 ++++++- src/diffusers/models/prior_transformer.py | 66 ++++++- tests/models/test_modeling_common.py | 151 +++++++++----- tests/models/test_models_prior.py | 185 ++++++++++++++++++ tests/models/test_models_unet_1d.py | 8 +- tests/models/test_models_unet_2d.py | 11 +- tests/models/test_models_unet_2d_condition.py | 5 +- tests/models/test_models_unet_3d_condition.py | 5 +- tests/models/test_models_vae.py | 6 +- tests/models/test_models_vq.py | 5 +- 
10 files changed, 446 insertions(+), 62 deletions(-) create mode 100644 tests/models/test_models_prior.py diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index a4894e78c43f..71785431320f 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union import torch import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput, apply_forward_hook +from .attention_processor import AttentionProcessor, AttnProcessor from .modeling_utils import ModelMixin from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder @@ -156,6 +157,69 @@ def disable_slicing(self): """ self.use_slicing = False + @property + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Parameters: + `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + of **all** `Attention` layers. + In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. 
+ """ + self.set_attn_processor(AttnProcessor()) + @apply_forward_hook def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput: if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size): diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index b245612e6fc1..58804f2672b9 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import Optional, Union +from typing import Dict, Optional, Union import torch import torch.nn.functional as F @@ -8,6 +8,7 @@ from ..configuration_utils import ConfigMixin, register_to_config from ..utils import BaseOutput from .attention import BasicTransformerBlock +from .attention_processor import AttentionProcessor, AttnProcessor from .embeddings import TimestepEmbedding, Timesteps from .modeling_utils import ModelMixin @@ -104,6 +105,69 @@ def __init__( self.clip_mean = nn.Parameter(torch.zeros(1, embedding_dim)) self.clip_std = nn.Parameter(torch.zeros(1, embedding_dim)) + @property + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "set_processor"): + processors[f"{name}.processor"] = module.processor + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Parameters: + `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + of **all** `Attention` layers. + In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + self.set_attn_processor(AttnProcessor()) + def forward( self, hidden_states, diff --git a/tests/models/test_modeling_common.py b/tests/models/test_modeling_common.py index adc18e003a56..ee8e55842f8d 100644 --- a/tests/models/test_modeling_common.py +++ b/tests/models/test_modeling_common.py @@ -26,9 +26,10 @@ from requests.exceptions import HTTPError from diffusers.models import UNet2DConditionModel +from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0, XFormersAttnProcessor from diffusers.training_utils import EMAModel from diffusers.utils import logging, torch_device -from diffusers.utils.testing_utils import CaptureLogger, require_torch_2, run_test_in_subprocess +from diffusers.utils.testing_utils import CaptureLogger, require_torch_2, require_torch_gpu, run_test_in_subprocess # Will be run via run_test_in_subprocess @@ -150,7 +151,43 @@ def test_weight_overwrite(self): assert model.config.in_channels == 9 +class UNetTesterMixin: + def test_forward_signature(self): + init_dict, _ = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["sample", "timestep"] + self.assertListEqual(arg_names[:2], expected_arg_names) + + def test_forward_with_norm_groups(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["norm_num_groups"] = 16 + init_dict["block_out_channels"] = (16, 32) + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output.to_tuple()[0] + + self.assertIsNotNone(output) + expected_shape = inputs_dict["sample"].shape + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + + class ModelTesterMixin: + main_input_name = None # overwrite in model specific tester class + base_precision = 1e-3 + def test_from_save_pretrained(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -170,12 +207,12 @@ def test_from_save_pretrained(self): with torch.no_grad(): image = model(**inputs_dict) if isinstance(image, dict): - image = image.sample + image = image.to_tuple()[0] new_image = new_model(**inputs_dict) if isinstance(new_image, dict): - new_image = new_image.sample + new_image = new_image.to_tuple()[0] max_diff = (image - new_image).abs().sum().item() self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") @@ -223,12 +260,62 @@ def test_getattr_is_correct(self): assert str(error.exception) == f"'{type(model).__name__}' object has no attribute 'does_not_exist'" + 
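For reference, the `attn_processors` property and the `set_attn_processor`/`set_default_attn_processor` methods added to `AutoencoderKL` and `PriorTransformer` above mirror the existing `UNet2DConditionModel` API (the hunks are marked "Copied from" it): the property returns a dict keyed by each attention module's path plus `.processor`, and the setter accepts either a single processor for all layers or a dict with exactly those keys. A minimal usage sketch follows; the tiny config values are illustrative and not taken from this patch.

    from diffusers import AutoencoderKL
    from diffusers.models.attention_processor import AttnProcessor

    # Small illustrative config; any AutoencoderKL instance exposes the same API.
    vae = AutoencoderKL(block_out_channels=(32,), norm_num_groups=4)

    # Keys look like "encoder.mid_block.attentions.0.processor".
    print(list(vae.attn_processors.keys()))

    # Set one processor type for every attention layer ...
    vae.set_default_attn_processor()

    # ... or pass a dict mapping each key above to its own processor instance.
    vae.set_attn_processor({name: AttnProcessor() for name in vae.attn_processors})

The per-key dict form is what the following determinism test relies on when it swaps processor implementations and compares outputs.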
@require_torch_gpu + def test_set_attn_processor_for_determinism(self): + torch.use_deterministic_algorithms(False) + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + + if not hasattr(model, "set_attn_processor"): + # If not has `set_attn_processor`, skip test + return + + assert all(type(proc) == AttnProcessor2_0 for proc in model.attn_processors.values()) + with torch.no_grad(): + output_1 = model(**inputs_dict)[0] + + model.set_default_attn_processor() + assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values()) + with torch.no_grad(): + output_2 = model(**inputs_dict)[0] + + model.enable_xformers_memory_efficient_attention() + assert all(type(proc) == XFormersAttnProcessor for proc in model.attn_processors.values()) + with torch.no_grad(): + output_3 = model(**inputs_dict)[0] + + model.set_attn_processor(AttnProcessor2_0()) + assert all(type(proc) == AttnProcessor2_0 for proc in model.attn_processors.values()) + with torch.no_grad(): + output_4 = model(**inputs_dict)[0] + + model.set_attn_processor(AttnProcessor()) + assert all(type(proc) == AttnProcessor for proc in model.attn_processors.values()) + with torch.no_grad(): + output_5 = model(**inputs_dict)[0] + + model.set_attn_processor(XFormersAttnProcessor()) + assert all(type(proc) == XFormersAttnProcessor for proc in model.attn_processors.values()) + with torch.no_grad(): + output_6 = model(**inputs_dict)[0] + + torch.use_deterministic_algorithms(True) + + # make sure that outputs match + assert torch.allclose(output_2, output_1, atol=self.base_precision) + assert torch.allclose(output_2, output_3, atol=self.base_precision) + assert torch.allclose(output_2, output_4, atol=self.base_precision) + assert torch.allclose(output_2, output_5, atol=self.base_precision) + assert torch.allclose(output_2, output_6, atol=self.base_precision) + def test_from_save_pretrained_variant(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() model = self.model_class(**init_dict) if hasattr(model, "set_default_attn_processor"): model.set_default_attn_processor() + model.to(torch_device) model.eval() @@ -250,12 +337,12 @@ def test_from_save_pretrained_variant(self): with torch.no_grad(): image = model(**inputs_dict) if isinstance(image, dict): - image = image.sample + image = image.to_tuple()[0] new_image = new_model(**inputs_dict) if isinstance(new_image, dict): - new_image = new_image.sample + new_image = new_image.to_tuple()[0] max_diff = (image - new_image).abs().sum().item() self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") @@ -293,11 +380,11 @@ def test_determinism(self, expected_max_diff=1e-5): with torch.no_grad(): first = model(**inputs_dict) if isinstance(first, dict): - first = first.sample + first = first.to_tuple()[0] second = model(**inputs_dict) if isinstance(second, dict): - second = second.sample + second = second.to_tuple()[0] out_1 = first.cpu().numpy() out_2 = second.cpu().numpy() @@ -316,43 +403,15 @@ def test_output(self): output = model(**inputs_dict) if isinstance(output, dict): - output = output.sample + output = output.to_tuple()[0] self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_with_norm_groups(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - init_dict["norm_num_groups"] = 16 - 
init_dict["block_out_channels"] = (16, 32) - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output.sample - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape + # input & output have to have the same shape + input_tensor = inputs_dict[self.main_input_name] + expected_shape = input_tensor.shape self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - def test_forward_signature(self): - init_dict, _ = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["sample", "timestep"] - self.assertListEqual(arg_names[:2], expected_arg_names) - def test_model_from_pretrained(self): init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() @@ -378,12 +437,12 @@ def test_model_from_pretrained(self): output_1 = model(**inputs_dict) if isinstance(output_1, dict): - output_1 = output_1.sample + output_1 = output_1.to_tuple()[0] output_2 = new_model(**inputs_dict) if isinstance(output_2, dict): - output_2 = output_2.sample + output_2 = output_2.to_tuple()[0] self.assertEqual(output_1.shape, output_2.shape) @@ -397,9 +456,10 @@ def test_training(self): output = model(**inputs_dict) if isinstance(output, dict): - output = output.sample + output = output.to_tuple()[0] - noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) + input_tensor = inputs_dict[self.main_input_name] + noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device) loss = torch.nn.functional.mse_loss(output, noise) loss.backward() @@ -415,9 +475,10 @@ def test_ema_training(self): output = model(**inputs_dict) if isinstance(output, dict): - output = output.sample + output = output.to_tuple()[0] - noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) + input_tensor = inputs_dict[self.main_input_name] + noise = torch.randn((input_tensor.shape[0],) + self.output_shape).to(torch_device) loss = torch.nn.functional.mse_loss(output, noise) loss.backward() ema_model.step(model.parameters()) diff --git a/tests/models/test_models_prior.py b/tests/models/test_models_prior.py new file mode 100644 index 000000000000..25b9768ee34f --- /dev/null +++ b/tests/models/test_models_prior.py @@ -0,0 +1,185 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import inspect +import unittest + +import torch +from parameterized import parameterized + +from diffusers import PriorTransformer +from diffusers.utils import floats_tensor, slow, torch_all_close, torch_device +from diffusers.utils.testing_utils import enable_full_determinism + +from .test_modeling_common import ModelTesterMixin + + +enable_full_determinism() + + +class PriorTransformerTests(ModelTesterMixin, unittest.TestCase): + model_class = PriorTransformer + main_input_name = "hidden_states" + + @property + def dummy_input(self): + batch_size = 4 + embedding_dim = 8 + num_embeddings = 7 + + hidden_states = floats_tensor((batch_size, embedding_dim)).to(torch_device) + + proj_embedding = floats_tensor((batch_size, embedding_dim)).to(torch_device) + encoder_hidden_states = floats_tensor((batch_size, num_embeddings, embedding_dim)).to(torch_device) + + return { + "hidden_states": hidden_states, + "timestep": 2, + "proj_embedding": proj_embedding, + "encoder_hidden_states": encoder_hidden_states, + } + + def get_dummy_seed_input(self, seed=0): + torch.manual_seed(seed) + batch_size = 4 + embedding_dim = 8 + num_embeddings = 7 + + hidden_states = torch.randn((batch_size, embedding_dim)).to(torch_device) + + proj_embedding = torch.randn((batch_size, embedding_dim)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, num_embeddings, embedding_dim)).to(torch_device) + + return { + "hidden_states": hidden_states, + "timestep": 2, + "proj_embedding": proj_embedding, + "encoder_hidden_states": encoder_hidden_states, + } + + @property + def input_shape(self): + return (4, 8) + + @property + def output_shape(self): + return (4, 8) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "num_attention_heads": 2, + "attention_head_dim": 4, + "num_layers": 2, + "embedding_dim": 8, + "num_embeddings": 7, + "additional_embeddings": 4, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_from_pretrained_hub(self): + model, loading_info = PriorTransformer.from_pretrained( + "hf-internal-testing/prior-dummy", output_loading_info=True + ) + self.assertIsNotNone(model) + self.assertEqual(len(loading_info["missing_keys"]), 0) + + model.to(torch_device) + hidden_states = model(**self.dummy_input)[0] + + assert hidden_states is not None, "Make sure output is not None" + + def test_forward_signature(self): + init_dict, _ = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["hidden_states", "timestep"] + self.assertListEqual(arg_names[:2], expected_arg_names) + + def test_output_pretrained(self): + model = PriorTransformer.from_pretrained("hf-internal-testing/prior-dummy") + model = model.to(torch_device) + + if hasattr(model, "set_default_attn_processor"): + model.set_default_attn_processor() + + input = self.get_dummy_seed_input() + + with torch.no_grad(): + output = model(**input)[0] + + output_slice = output[0, :5].flatten().cpu() + print(output_slice) + + # Since the VAE Gaussian prior's generator is seeded on the appropriate device, + # the expected output slices are not the same for CPU and GPU. 
+ expected_output_slice = torch.tensor([-1.3436, -0.2870, 0.7538, 0.4368, -0.0239]) + self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-2)) + + +@slow +class PriorTransformerIntegrationTests(unittest.TestCase): + def get_dummy_seed_input(self, batch_size=1, embedding_dim=768, num_embeddings=77, seed=0): + torch.manual_seed(seed) + batch_size = batch_size + embedding_dim = embedding_dim + num_embeddings = num_embeddings + + hidden_states = torch.randn((batch_size, embedding_dim)).to(torch_device) + + proj_embedding = torch.randn((batch_size, embedding_dim)).to(torch_device) + encoder_hidden_states = torch.randn((batch_size, num_embeddings, embedding_dim)).to(torch_device) + + return { + "hidden_states": hidden_states, + "timestep": 2, + "proj_embedding": proj_embedding, + "encoder_hidden_states": encoder_hidden_states, + } + + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + @parameterized.expand( + [ + # fmt: off + [13, [-0.5861, 0.1283, -0.0931, 0.0882, 0.4476, 0.1329, -0.0498, 0.0640]], + [37, [-0.4913, 0.0110, -0.0483, 0.0541, 0.4954, -0.0170, 0.0354, 0.1651]], + # fmt: on + ] + ) + def test_kandinsky_prior(self, seed, expected_slice): + model = PriorTransformer.from_pretrained("kandinsky-community/kandinsky-2-1-prior", subfolder="prior") + model.to(torch_device) + input = self.get_dummy_seed_input(seed=seed) + + with torch.no_grad(): + sample = model(**input)[0] + + assert list(sample.shape) == [1, 768] + + output_slice = sample[0, :8].flatten().cpu() + print(output_slice) + expected_output_slice = torch.tensor(expected_slice) + + assert torch_all_close(output_slice, expected_output_slice, atol=1e-3) diff --git a/tests/models/test_models_unet_1d.py b/tests/models/test_models_unet_1d.py index 9fb1a61011e3..99a243e911c9 100644 --- a/tests/models/test_models_unet_1d.py +++ b/tests/models/test_models_unet_1d.py @@ -20,11 +20,12 @@ from diffusers import UNet1DModel from diffusers.utils import floats_tensor, slow, torch_device -from .test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin, UNetTesterMixin -class UNet1DModelTests(ModelTesterMixin, unittest.TestCase): +class UNet1DModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = UNet1DModel + main_input_name = "sample" @property def dummy_input(self): @@ -153,8 +154,9 @@ def test_unet_1d_maestro(self): assert (output_max - 0.0607).abs() < 4e-4 -class UNetRLModelTests(ModelTesterMixin, unittest.TestCase): +class UNetRLModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = UNet1DModel + main_input_name = "sample" @property def dummy_input(self): diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py index 92a5664daa2b..4857afb85257 100644 --- a/tests/models/test_models_unet_2d.py +++ b/tests/models/test_models_unet_2d.py @@ -23,7 +23,7 @@ from diffusers.utils import floats_tensor, logging, slow, torch_all_close, torch_device from diffusers.utils.testing_utils import enable_full_determinism -from .test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin, UNetTesterMixin logger = logging.get_logger(__name__) @@ -31,8 +31,9 @@ enable_full_determinism() -class Unet2DModelTests(ModelTesterMixin, unittest.TestCase): +class Unet2DModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = UNet2DModel + main_input_name = "sample" @property def dummy_input(self): @@ 
-68,8 +69,9 @@ def prepare_init_args_and_inputs_for_common(self): return init_dict, inputs_dict -class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): +class UNetLDMModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = UNet2DModel + main_input_name = "sample" @property def dummy_input(self): @@ -182,8 +184,9 @@ def test_output_pretrained(self): self.assertTrue(torch_all_close(output_slice, expected_output_slice, rtol=1e-3)) -class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): +class NCSNppModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = UNet2DModel + main_input_name = "sample" @property def dummy_input(self, sizes=(32, 32)): diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 8a3d9dd16fd5..24da508227d2 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -36,7 +36,7 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import enable_full_determinism -from .test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin, UNetTesterMixin logger = logging.get_logger(__name__) @@ -120,8 +120,9 @@ def create_custom_diffusion_layers(model, mock_weights: bool = True): return custom_diffusion_attn_procs -class UNet2DConditionModelTests(ModelTesterMixin, unittest.TestCase): +class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = UNet2DConditionModel + main_input_name = "sample" @property def dummy_input(self): diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py index 4193b6e17bd3..2d3edfffd39c 100644 --- a/tests/models/test_models_unet_3d_condition.py +++ b/tests/models/test_models_unet_3d_condition.py @@ -31,7 +31,7 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import enable_full_determinism -from .test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin, UNetTesterMixin enable_full_determinism() @@ -73,8 +73,9 @@ def create_lora_layers(model, mock_weights: bool = True): @skip_mps -class UNet3DConditionModelTests(ModelTesterMixin, unittest.TestCase): +class UNet3DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = UNet3DConditionModel + main_input_name = "sample" @property def dummy_input(self): diff --git a/tests/models/test_models_vae.py b/tests/models/test_models_vae.py index fe27e138f5fa..08b030bbf944 100644 --- a/tests/models/test_models_vae.py +++ b/tests/models/test_models_vae.py @@ -24,14 +24,16 @@ from diffusers.utils.import_utils import is_xformers_available from diffusers.utils.testing_utils import enable_full_determinism -from .test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin, UNetTesterMixin enable_full_determinism() -class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase): +class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = AutoencoderKL + main_input_name = "sample" + base_precision = 1e-2 @property def dummy_input(self): diff --git a/tests/models/test_models_vq.py b/tests/models/test_models_vq.py index 8ea6ef77ce63..5706c13a0c45 100644 --- a/tests/models/test_models_vq.py +++ b/tests/models/test_models_vq.py @@ -21,14 +21,15 @@ from diffusers.utils import floats_tensor, torch_device from 
diffusers.utils.testing_utils import enable_full_determinism -from .test_modeling_common import ModelTesterMixin +from .test_modeling_common import ModelTesterMixin, UNetTesterMixin enable_full_determinism() -class VQModelTests(ModelTesterMixin, unittest.TestCase): +class VQModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase): model_class = VQModel + main_input_name = "sample" @property def dummy_input(self, sizes=(32, 32)): From 7bfd2375c7a769ec685d6feee701ea2abc1bce2f Mon Sep 17 00:00:00 2001 From: Isotr0py <41363108+Isotr0py@users.noreply.github.com> Date: Fri, 16 Jun 2023 00:30:47 +0800 Subject: [PATCH 134/199] fix typo (#3800) --- .../stable_diffusion/pipeline_stable_diffusion_panorama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py index 3826447576d4..e03687e89eb1 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_panorama.py @@ -463,7 +463,7 @@ def get_views(self, panorama_height, panorama_width, window_size=64, stride=8): panorama_height /= 8 panorama_width /= 8 num_blocks_height = (panorama_height - window_size) // stride + 1 if panorama_height > window_size else 1 - num_blocks_width = (panorama_width - window_size) // stride + 1 if panorama_height > window_size else 1 + num_blocks_width = (panorama_width - window_size) // stride + 1 if panorama_width > window_size else 1 total_num_blocks = int(num_blocks_height * num_blocks_width) views = [] for i in range(total_num_blocks): From d49e2dd54caebcc3b781724b751c66e1d2a5556e Mon Sep 17 00:00:00 2001 From: Will Berman Date: Thu, 15 Jun 2023 15:38:54 -0700 Subject: [PATCH 135/199] manual check for checkpoints_total_limit instead of using accelerate (#3681) * manual check for checkpoints_total_limit instead of using accelerate * remove controlnet_conditioning_embedding_out_channels --- examples/controlnet/train_controlnet.py | 31 +- .../train_custom_diffusion.py | 31 +- examples/dreambooth/train_dreambooth.py | 28 +- examples/dreambooth/train_dreambooth_lora.py | 31 +- .../train_instruct_pix2pix.py | 31 +- examples/test_examples.py | 801 +++++++++++++++++- examples/text_to_image/train_text_to_image.py | 31 +- .../text_to_image/train_text_to_image_lora.py | 56 +- .../textual_inversion/textual_inversion.py | 31 +- .../train_unconditional.py | 31 +- 10 files changed, 1007 insertions(+), 95 deletions(-) diff --git a/examples/controlnet/train_controlnet.py b/examples/controlnet/train_controlnet.py index fc358783b5f9..21ab38a3d7a9 100644 --- a/examples/controlnet/train_controlnet.py +++ b/examples/controlnet/train_controlnet.py @@ -18,6 +18,7 @@ import math import os import random +import shutil from pathlib import Path import accelerate @@ -307,11 +308,7 @@ def parse_args(input_args=None): "--checkpoints_total_limit", type=int, default=None, - help=( - "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." 
- " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more details" - ), + help=("Max number of checkpoints to store."), ) parser.add_argument( "--resume_from_checkpoint", @@ -716,9 +713,7 @@ def collate_fn(examples): def main(args): logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration( - total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir - ) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -1060,6 +1055,26 @@ def load_model_hook(models, input_dir): if accelerator.is_main_process: if global_step % args.checkpointing_steps == 0: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") diff --git a/examples/custom_diffusion/train_custom_diffusion.py b/examples/custom_diffusion/train_custom_diffusion.py index 421532602137..e0ec56eca1f3 100644 --- a/examples/custom_diffusion/train_custom_diffusion.py +++ b/examples/custom_diffusion/train_custom_diffusion.py @@ -21,6 +21,7 @@ import math import os import random +import shutil import warnings from pathlib import Path @@ -446,11 +447,7 @@ def parse_args(input_args=None): "--checkpoints_total_limit", type=int, default=None, - help=( - "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." 
- " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs" - ), + help=("Max number of checkpoints to store."), ) parser.add_argument( "--resume_from_checkpoint", @@ -637,9 +634,7 @@ def parse_args(input_args=None): def main(args): logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration( - total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir - ) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -1171,6 +1166,26 @@ def main(args): if global_step % args.checkpointing_steps == 0: if accelerator.is_main_process: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 695b0a0423a6..797cfbd0e5d7 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -20,6 +20,7 @@ import logging import math import os +import shutil import warnings from pathlib import Path @@ -771,9 +772,7 @@ def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_atte def main(args): logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration( - total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir - ) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -1270,12 +1269,33 @@ def compute_text_embeddings(prompt): global_step += 1 if accelerator.is_main_process: - images = [] if global_step % args.checkpointing_steps == 0: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - 
args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") + images = [] + if args.validation_prompt is not None and global_step % args.validation_steps == 0: images = log_validation( text_encoder, diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py index b4f099fc2f58..72fcfa648b48 100644 --- a/examples/dreambooth/train_dreambooth_lora.py +++ b/examples/dreambooth/train_dreambooth_lora.py @@ -20,6 +20,7 @@ import logging import math import os +import shutil import warnings from pathlib import Path @@ -276,11 +277,7 @@ def parse_args(input_args=None): "--checkpoints_total_limit", type=int, default=None, - help=( - "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." - " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs" - ), + help=("Max number of checkpoints to store."), ) parser.add_argument( "--resume_from_checkpoint", @@ -653,9 +650,7 @@ def encode_prompt(text_encoder, input_ids, attention_mask, text_encoder_use_atte def main(args): logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration( - total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir - ) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -1221,6 +1216,26 @@ def compute_text_embeddings(prompt): if accelerator.is_main_process: if global_step % args.checkpointing_steps == 0: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") diff --git a/examples/instruct_pix2pix/train_instruct_pix2pix.py b/examples/instruct_pix2pix/train_instruct_pix2pix.py index 08dd5cd42701..e84698a8f215 100644 --- 
a/examples/instruct_pix2pix/train_instruct_pix2pix.py +++ b/examples/instruct_pix2pix/train_instruct_pix2pix.py @@ -20,6 +20,7 @@ import logging import math import os +import shutil from pathlib import Path import accelerate @@ -327,11 +328,7 @@ def parse_args(): "--checkpoints_total_limit", type=int, default=None, - help=( - "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." - " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs" - ), + help=("Max number of checkpoints to store."), ) parser.add_argument( "--resume_from_checkpoint", @@ -387,9 +384,7 @@ def main(): ), ) logging_dir = os.path.join(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration( - total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir - ) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, @@ -867,6 +862,26 @@ def collate_fn(examples): if global_step % args.checkpointing_steps == 0: if accelerator.is_main_process: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") diff --git a/examples/test_examples.py b/examples/test_examples.py index 59c96f44fe93..d11841350064 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -435,8 +435,10 @@ def test_text_to_image_checkpointing(self): pipe(prompt, num_inference_steps=2) # check checkpoint directories exist - self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-2"))) - self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4"))) + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4"}, + ) # check can run an intermediate checkpoint unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet") @@ -474,12 +476,15 @@ def test_text_to_image_checkpointing(self): pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) pipe(prompt, num_inference_steps=2) - # check old checkpoints do not exist - self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-2"))) - - # check new checkpoints exist - self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4"))) - 
self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6"))) + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + { + # no checkpoint-2 -> check old checkpoints do not exist + # check new checkpoints exist + "checkpoint-4", + "checkpoint-6", + }, + ) def test_text_to_image_checkpointing_use_ema(self): pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" @@ -516,8 +521,10 @@ def test_text_to_image_checkpointing_use_ema(self): pipe(prompt, num_inference_steps=2) # check checkpoint directories exist - self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-2"))) - self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4"))) + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4"}, + ) # check can run an intermediate checkpoint unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet") @@ -556,9 +563,773 @@ def test_text_to_image_checkpointing_use_ema(self): pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) pipe(prompt, num_inference_steps=2) - # check old checkpoints do not exist - self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-2"))) + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + { + # no checkpoint-2 -> check old checkpoints do not exist + # check new checkpoints exist + "checkpoint-4", + "checkpoint-6", + }, + ) + + def test_text_to_image_checkpointing_checkpoints_total_limit(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" - # check new checkpoints exist - self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4"))) - self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6"))) + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2 + # Should create checkpoints at steps 2, 4, 6 + # with checkpoint at step 2 deleted + + initial_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 7 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --checkpoints_total_limit=2 + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=2) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + # checkpoint-2 should have been deleted + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_text_to_image_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 9, checkpointing_steps == 2 + # Should create checkpoints at steps 2, 4, 6, 8 + + initial_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + 
--resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 9 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --seed=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=2) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"}, + ) + + # resume and we should try to checkpoint at 10, where we'll have to remove + # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint + + resume_run_args = f""" + examples/text_to_image/train_text_to_image.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 11 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --resume_from_checkpoint=checkpoint-8 + --checkpoints_total_limit=3 + --seed=0 + """.split() + + run_command(self._launch_args + resume_run_args) + + pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None) + pipe(prompt, num_inference_steps=2) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-6", "checkpoint-8", "checkpoint-10"}, + ) + + def test_text_to_image_lora_checkpointing_checkpoints_total_limit(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2 + # Should create checkpoints at steps 2, 4, 6 + # with checkpoint at step 2 deleted + + initial_run_args = f""" + examples/text_to_image/train_text_to_image_lora.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 7 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --checkpoints_total_limit=2 + --seed=0 + --num_validation_images=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None + ) + pipe.load_lora_weights(tmpdir) + pipe(prompt, num_inference_steps=2) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + # checkpoint-2 should have been deleted + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_text_to_image_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe" + prompt = "a prompt" + + with tempfile.TemporaryDirectory() as tmpdir: + # Run training script with checkpointing + # max_train_steps == 9, checkpointing_steps == 2 + # Should create checkpoints at steps 2, 4, 6, 8 + + 
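The resumed runs in these tests exercise the manual pruning loop added to each training script above: before a save, existing `checkpoint-*` directories are sorted by step and the oldest are removed so that at most `checkpoints_total_limit - 1` remain, leaving exactly the limit after the new checkpoint is written. A standalone sketch of that rule, with illustrative names not taken from the scripts, shows why resuming from checkpoint-8 with a limit of 3 ends up with checkpoints 6, 8 and 10, as the assertions below expect:

    def surviving_checkpoints(existing_steps, total_limit, new_step):
        # Mirrors the pruning hunks: sort by step and drop the oldest
        # `len - limit + 1` entries whenever the limit would be exceeded.
        existing_steps = sorted(existing_steps)
        if len(existing_steps) >= total_limit:
            num_to_remove = len(existing_steps) - total_limit + 1
            existing_steps = existing_steps[num_to_remove:]
        return existing_steps + [new_step]

    # e.g. --resume_from_checkpoint=checkpoint-8 --checkpoints_total_limit=3, saving at step 10
    assert surviving_checkpoints([2, 4, 6, 8], total_limit=3, new_step=10) == [6, 8, 10]
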
initial_run_args = f""" + examples/text_to_image/train_text_to_image_lora.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 9 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --seed=0 + --num_validation_images=0 + """.split() + + run_command(self._launch_args + initial_run_args) + + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None + ) + pipe.load_lora_weights(tmpdir) + pipe(prompt, num_inference_steps=2) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"}, + ) + + # resume and we should try to checkpoint at 10, where we'll have to remove + # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint + + resume_run_args = f""" + examples/text_to_image/train_text_to_image_lora.py + --pretrained_model_name_or_path {pretrained_model_name_or_path} + --dataset_name hf-internal-testing/dummy_image_text_data + --resolution 64 + --center_crop + --random_flip + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 11 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=2 + --resume_from_checkpoint=checkpoint-8 + --checkpoints_total_limit=3 + --seed=0 + --num_validation_images=0 + """.split() + + run_command(self._launch_args + resume_run_args) + + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None + ) + pipe.load_lora_weights(tmpdir) + pipe(prompt, num_inference_steps=2) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-6", "checkpoint-8", "checkpoint-10"}, + ) + + def test_unconditional_checkpointing_checkpoints_total_limit(self): + with tempfile.TemporaryDirectory() as tmpdir: + initial_run_args = f""" + examples/unconditional_image_generation/train_unconditional.py + --dataset_name hf-internal-testing/dummy_image_class_data + --model_config_name_or_path diffusers/ddpm_dummy + --resolution 64 + --output_dir {tmpdir} + --train_batch_size 1 + --num_epochs 1 + --gradient_accumulation_steps 1 + --ddpm_num_inference_steps 2 + --learning_rate 1e-3 + --lr_warmup_steps 5 + --checkpointing_steps=2 + --checkpoints_total_limit=2 + """.split() + + run_command(self._launch_args + initial_run_args) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + # checkpoint-2 should have been deleted + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_unconditional_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + with tempfile.TemporaryDirectory() as tmpdir: + initial_run_args = f""" + examples/unconditional_image_generation/train_unconditional.py + --dataset_name hf-internal-testing/dummy_image_class_data + --model_config_name_or_path diffusers/ddpm_dummy + --resolution 64 + --output_dir {tmpdir} + --train_batch_size 1 + --num_epochs 1 + --gradient_accumulation_steps 1 + --ddpm_num_inference_steps 2 + --learning_rate 1e-3 + --lr_warmup_steps 5 + --checkpointing_steps=1 + """.split() + + 
run_command(self._launch_args + initial_run_args) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-1", "checkpoint-2", "checkpoint-3", "checkpoint-4", "checkpoint-5", "checkpoint-6"}, + ) + + resume_run_args = f""" + examples/unconditional_image_generation/train_unconditional.py + --dataset_name hf-internal-testing/dummy_image_class_data + --model_config_name_or_path diffusers/ddpm_dummy + --resolution 64 + --output_dir {tmpdir} + --train_batch_size 1 + --num_epochs 2 + --gradient_accumulation_steps 1 + --ddpm_num_inference_steps 2 + --learning_rate 1e-3 + --lr_warmup_steps 5 + --resume_from_checkpoint=checkpoint-6 + --checkpointing_steps=2 + --checkpoints_total_limit=3 + """.split() + + run_command(self._launch_args + resume_run_args) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-8", "checkpoint-10", "checkpoint-12"}, + ) + + def test_textual_inversion_checkpointing(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/textual_inversion/textual_inversion.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --train_data_dir docs/source/en/imgs + --learnable_property object + --placeholder_token + --initializer_token a + --validation_prompt + --validation_steps 1 + --save_steps 1 + --num_vectors 2 + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 3 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=1 + --checkpoints_total_limit=2 + """.split() + + run_command(self._launch_args + test_args) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-3"}, + ) + + def test_textual_inversion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/textual_inversion/textual_inversion.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --train_data_dir docs/source/en/imgs + --learnable_property object + --placeholder_token + --initializer_token a + --validation_prompt + --validation_steps 1 + --save_steps 1 + --num_vectors 2 + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 3 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=1 + """.split() + + run_command(self._launch_args + test_args) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-1", "checkpoint-2", "checkpoint-3"}, + ) + + resume_run_args = f""" + examples/textual_inversion/textual_inversion.py + --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe + --train_data_dir docs/source/en/imgs + --learnable_property object + --placeholder_token + --initializer_token a + --validation_prompt + --validation_steps 1 + --save_steps 1 + --num_vectors 2 + --resolution 64 + --train_batch_size 1 + --gradient_accumulation_steps 1 + --max_train_steps 4 + --learning_rate 5.0e-04 + --scale_lr + --lr_scheduler constant + --lr_warmup_steps 0 + --output_dir {tmpdir} + --checkpointing_steps=1 + --resume_from_checkpoint=checkpoint-3 + --checkpoints_total_limit=2 
+ """.split() + + run_command(self._launch_args + resume_run_args) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-3", "checkpoint-4"}, + ) + + def test_instruct_pix2pix_checkpointing_checkpoints_total_limit(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/instruct_pix2pix/train_instruct_pix2pix.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --dataset_name=hf-internal-testing/instructpix2pix-10-samples + --resolution=64 + --random_flip + --train_batch_size=1 + --max_train_steps=7 + --checkpointing_steps=2 + --checkpoints_total_limit=2 + --output_dir {tmpdir} + --seed=0 + """.split() + + run_command(self._launch_args + test_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_instruct_pix2pix_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/instruct_pix2pix/train_instruct_pix2pix.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --dataset_name=hf-internal-testing/instructpix2pix-10-samples + --resolution=64 + --random_flip + --train_batch_size=1 + --max_train_steps=9 + --checkpointing_steps=2 + --output_dir {tmpdir} + --seed=0 + """.split() + + run_command(self._launch_args + test_args) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"}, + ) + + resume_run_args = f""" + examples/instruct_pix2pix/train_instruct_pix2pix.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --dataset_name=hf-internal-testing/instructpix2pix-10-samples + --resolution=64 + --random_flip + --train_batch_size=1 + --max_train_steps=11 + --checkpointing_steps=2 + --output_dir {tmpdir} + --seed=0 + --resume_from_checkpoint=checkpoint-8 + --checkpoints_total_limit=3 + """.split() + + run_command(self._launch_args + resume_run_args) + + # check checkpoint directories exist + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-6", "checkpoint-8", "checkpoint-10"}, + ) + + def test_dreambooth_checkpointing_checkpoints_total_limit(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir=docs/source/en/imgs + --output_dir={tmpdir} + --instance_prompt=prompt + --resolution=64 + --train_batch_size=1 + --gradient_accumulation_steps=1 + --max_train_steps=6 + --checkpoints_total_limit=2 + --checkpointing_steps=2 + """.split() + + run_command(self._launch_args + test_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_dreambooth_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir=docs/source/en/imgs + --output_dir={tmpdir} + --instance_prompt=prompt + --resolution=64 + --train_batch_size=1 + --gradient_accumulation_steps=1 + --max_train_steps=9 + --checkpointing_steps=2 + """.split() 
+ + run_command(self._launch_args + test_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"}, + ) + + resume_run_args = f""" + examples/dreambooth/train_dreambooth.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir=docs/source/en/imgs + --output_dir={tmpdir} + --instance_prompt=prompt + --resolution=64 + --train_batch_size=1 + --gradient_accumulation_steps=1 + --max_train_steps=11 + --checkpointing_steps=2 + --resume_from_checkpoint=checkpoint-8 + --checkpoints_total_limit=3 + """.split() + + run_command(self._launch_args + resume_run_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-6", "checkpoint-8", "checkpoint-10"}, + ) + + def test_dreambooth_lora_checkpointing_checkpoints_total_limit(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth_lora.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir=docs/source/en/imgs + --output_dir={tmpdir} + --instance_prompt=prompt + --resolution=64 + --train_batch_size=1 + --gradient_accumulation_steps=1 + --max_train_steps=6 + --checkpoints_total_limit=2 + --checkpointing_steps=2 + """.split() + + run_command(self._launch_args + test_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_dreambooth_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/dreambooth/train_dreambooth_lora.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir=docs/source/en/imgs + --output_dir={tmpdir} + --instance_prompt=prompt + --resolution=64 + --train_batch_size=1 + --gradient_accumulation_steps=1 + --max_train_steps=9 + --checkpointing_steps=2 + """.split() + + run_command(self._launch_args + test_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"}, + ) + + resume_run_args = f""" + examples/dreambooth/train_dreambooth_lora.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir=docs/source/en/imgs + --output_dir={tmpdir} + --instance_prompt=prompt + --resolution=64 + --train_batch_size=1 + --gradient_accumulation_steps=1 + --max_train_steps=11 + --checkpointing_steps=2 + --resume_from_checkpoint=checkpoint-8 + --checkpoints_total_limit=3 + """.split() + + run_command(self._launch_args + resume_run_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-6", "checkpoint-8", "checkpoint-10"}, + ) + + def test_controlnet_checkpointing_checkpoints_total_limit(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/controlnet/train_controlnet.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --dataset_name=hf-internal-testing/fill10 + --output_dir={tmpdir} + --resolution=64 + --train_batch_size=1 + --gradient_accumulation_steps=1 + --max_train_steps=6 + --checkpoints_total_limit=2 + --checkpointing_steps=2 + --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet + """.split() + + run_command(self._launch_args + test_args) + + self.assertEqual( + {x for x in 
os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_controlnet_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/controlnet/train_controlnet.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --dataset_name=hf-internal-testing/fill10 + --output_dir={tmpdir} + --resolution=64 + --train_batch_size=1 + --gradient_accumulation_steps=1 + --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet + --max_train_steps=9 + --checkpointing_steps=2 + """.split() + + run_command(self._launch_args + test_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"}, + ) + + resume_run_args = f""" + examples/controlnet/train_controlnet.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --dataset_name=hf-internal-testing/fill10 + --output_dir={tmpdir} + --resolution=64 + --train_batch_size=1 + --gradient_accumulation_steps=1 + --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet + --max_train_steps=11 + --checkpointing_steps=2 + --resume_from_checkpoint=checkpoint-8 + --checkpoints_total_limit=3 + """.split() + + run_command(self._launch_args + resume_run_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-8", "checkpoint-10", "checkpoint-12"}, + ) + + def test_custom_diffusion_checkpointing_checkpoints_total_limit(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/custom_diffusion/train_custom_diffusion.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir=docs/source/en/imgs + --output_dir={tmpdir} + --instance_prompt= + --resolution=64 + --train_batch_size=1 + --modifier_token= + --dataloader_num_workers=0 + --max_train_steps=6 + --checkpoints_total_limit=2 + --checkpointing_steps=2 + """.split() + + run_command(self._launch_args + test_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-4", "checkpoint-6"}, + ) + + def test_custom_diffusion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/custom_diffusion/train_custom_diffusion.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir=docs/source/en/imgs + --output_dir={tmpdir} + --instance_prompt= + --resolution=64 + --train_batch_size=1 + --modifier_token= + --dataloader_num_workers=0 + --max_train_steps=9 + --checkpointing_steps=2 + """.split() + + run_command(self._launch_args + test_args) + + self.assertEqual( + {x for x in os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"}, + ) + + resume_run_args = f""" + examples/custom_diffusion/train_custom_diffusion.py + --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe + --instance_data_dir=docs/source/en/imgs + --output_dir={tmpdir} + --instance_prompt= + --resolution=64 + --train_batch_size=1 + --modifier_token= + --dataloader_num_workers=0 + --max_train_steps=11 + --checkpointing_steps=2 + --resume_from_checkpoint=checkpoint-8 + --checkpoints_total_limit=3 + """.split() + + run_command(self._launch_args + resume_run_args) + + self.assertEqual( + {x for x in 
os.listdir(tmpdir) if "checkpoint" in x}, + {"checkpoint-6", "checkpoint-8", "checkpoint-10"}, + ) diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 3fe72b90b24a..2ec2702e439a 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -18,6 +18,7 @@ import math import os import random +import shutil from pathlib import Path import accelerate @@ -362,11 +363,7 @@ def parse_args(): "--checkpoints_total_limit", type=int, default=None, - help=( - "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." - " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs" - ), + help=("Max number of checkpoints to store."), ) parser.add_argument( "--resume_from_checkpoint", @@ -427,9 +424,7 @@ def main(): ) logging_dir = os.path.join(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration( - total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir - ) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -909,6 +904,26 @@ def collate_fn(examples): if global_step % args.checkpointing_steps == 0: if accelerator.is_main_process: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 4a39f37a2896..7c2601d8e9b5 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -19,6 +19,7 @@ import math import os import random +import shutil from pathlib import Path import datasets @@ -327,11 +328,7 @@ def parse_args(): "--checkpoints_total_limit", type=int, default=None, - help=( - "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." 
- " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs" - ), + help=("Max number of checkpoints to store."), ) parser.add_argument( "--resume_from_checkpoint", @@ -368,9 +365,7 @@ def main(): args = parse_args() logging_dir = Path(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration( - total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir - ) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -809,6 +804,26 @@ def collate_fn(examples): if global_step % args.checkpointing_steps == 0: if accelerator.is_main_process: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") @@ -903,18 +918,19 @@ def collate_fn(examples): if accelerator.is_main_process: for tracker in accelerator.trackers: - if tracker.name == "tensorboard": - np_images = np.stack([np.asarray(img) for img in images]) - tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") - if tracker.name == "wandb": - tracker.log( - { - "test": [ - wandb.Image(image, caption=f"{i}: {args.validation_prompt}") - for i, image in enumerate(images) - ] - } - ) + if len(images) != 0: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("test", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "test": [ + wandb.Image(image, caption=f"{i}: {args.validation_prompt}") + for i, image in enumerate(images) + ] + } + ) accelerator.end_training() diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py index 8c44247a75b5..14b0997862d2 100644 --- a/examples/textual_inversion/textual_inversion.py +++ b/examples/textual_inversion/textual_inversion.py @@ -18,6 +18,7 @@ import math import os import random +import shutil import warnings from pathlib import Path @@ -394,11 +395,7 @@ def parse_args(): "--checkpoints_total_limit", type=int, default=None, - help=( - "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." 
- " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs" - ), + help=("Max number of checkpoints to store."), ) parser.add_argument( "--resume_from_checkpoint", @@ -566,9 +563,7 @@ def __getitem__(self, i): def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration( - total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir - ) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, @@ -887,6 +882,26 @@ def main(): if accelerator.is_main_process: if global_step % args.checkpointing_steps == 0: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index b07143f8b267..d6e4b17ba889 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -3,6 +3,7 @@ import logging import math import os +import shutil from pathlib import Path from typing import Optional @@ -245,11 +246,7 @@ def parse_args(): "--checkpoints_total_limit", type=int, default=None, - help=( - "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." 
- " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs" - ), + help=("Max number of checkpoints to store."), ) parser.add_argument( "--resume_from_checkpoint", @@ -287,9 +284,7 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def main(args): logging_dir = os.path.join(args.output_dir, args.logging_dir) - accelerator_project_config = ProjectConfiguration( - total_limit=args.checkpoints_total_limit, project_dir=args.output_dir, logging_dir=logging_dir - ) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -607,6 +602,26 @@ def transform_images(examples): global_step += 1 if global_step % args.checkpointing_steps == 0: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + if accelerator.is_main_process: save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) From 3ddc2b73957a0d2172664ce9551c4fd5c9b8380c Mon Sep 17 00:00:00 2001 From: Will Berman Date: Thu, 15 Jun 2023 23:24:49 -0700 Subject: [PATCH 136/199] [train text to image] add note to loading from checkpoint (#3806) add note to loading from checkpoint --- examples/text_to_image/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index 160e73fa02bb..a7deec1f8fe8 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -111,6 +111,21 @@ image = pipe(prompt="yoda").images[0] image.save("yoda-pokemon.png") ``` +Checkpoints only save the unet, so to run inference from a checkpoint, just load the unet +```python +from diffusers import StableDiffusionPipeline, UNet2DConditionModel + +model_path = "path_to_saved_model" + +unet = UNet2DConditionModel.from_pretrained(model_path + "/checkpoint-/unet") + +pipe = StableDiffusionPipeline.from_pretrained("", unet=unet, torch_dtype=torch.float16) +pipe.to("cuda") + +image = pipe(prompt="yoda").images[0] +image.save("yoda-pokemon.png") +``` + #### Training with multiple GPUs `accelerate` allows for seamless multi-GPU training. 
Follow the instructions [here](https://huggingface.co/docs/accelerate/basic_tutorials/launch) From 59aefe9ea6d56245d257439f757384fb1ac5d477 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Fri, 16 Jun 2023 10:39:20 -0700 Subject: [PATCH 137/199] device map legacy attention block weight conversion (#3804) --- src/diffusers/models/attention_processor.py | 1 + src/diffusers/models/modeling_utils.py | 102 ++++++++++++++++++-- tests/models/test_attention_processor.py | 44 +++++++++ 3 files changed, 137 insertions(+), 10 deletions(-) diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index e0404a83cc9a..0bc7886c2653 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -78,6 +78,7 @@ def __init__( self.upcast_softmax = upcast_softmax self.rescale_output_factor = rescale_output_factor self.residual_connection = residual_connection + self.dropout = dropout # we make use of this private variable to know whether this class is loaded # with an deprecated state dict so that we can convert it on the fly diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index f6d6bc5711cd..9e9b5cde4675 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -22,7 +22,7 @@ from typing import Any, Callable, List, Optional, Tuple, Union import torch -from torch import Tensor, device +from torch import Tensor, device, nn from .. import __version__ from ..utils import ( @@ -646,15 +646,47 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P else: # else let accelerate handle loading and dispatching. # Load weights and dispatch according to the device_map # by default the device_map is None and the weights are loaded on the CPU - accelerate.load_checkpoint_and_dispatch( - model, - model_file, - device_map, - max_memory=max_memory, - offload_folder=offload_folder, - offload_state_dict=offload_state_dict, - dtype=torch_dtype, - ) + try: + accelerate.load_checkpoint_and_dispatch( + model, + model_file, + device_map, + max_memory=max_memory, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + ) + except AttributeError as e: + # When using accelerate loading, we do not have the ability to load the state + # dict and rename the weight names manually. Additionally, accelerate skips + # torch loading conventions and directly writes into `module.{_buffers, _parameters}` + # (which look like they should be private variables?), so we can't use the standard hooks + # to rename parameters on load. We need to mimic the original weight names so the correct + # attributes are available. After we have loaded the weights, we convert the deprecated + # names to the new non-deprecated names. Then we _greatly encourage_ the user to convert + # the weights so we don't have to do this again. + + if "'Attention' object has no attribute" in str(e): + logger.warn( + f"Taking `{str(e)}` while using `accelerate.load_checkpoint_and_dispatch` to mean {pretrained_model_name_or_path}" + " was saved with deprecated attention block weight names. We will load it with the deprecated attention block" + " names and convert them on the fly to the new attention block format. Please re-save the model after this conversion," + " so we don't have to do the on the fly renaming in the future. If the model is from a hub checkpoint," + " please also re-upload it or open a PR on the original repository." 
+ ) + model._temp_convert_self_to_deprecated_attention_blocks() + accelerate.load_checkpoint_and_dispatch( + model, + model_file, + device_map, + max_memory=max_memory, + offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + ) + model._undo_temp_convert_self_to_deprecated_attention_blocks() + else: + raise e loading_info = { "missing_keys": [], @@ -889,3 +921,53 @@ def recursive_find_attn_block(name, module): state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight") if f"{path}.proj_attn.bias" in state_dict: state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias") + + def _temp_convert_self_to_deprecated_attention_blocks(self): + deprecated_attention_block_modules = [] + + def recursive_find_attn_block(module): + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_modules.append(module) + + for sub_module in module.children(): + recursive_find_attn_block(sub_module) + + recursive_find_attn_block(self) + + for module in deprecated_attention_block_modules: + module.query = module.to_q + module.key = module.to_k + module.value = module.to_v + module.proj_attn = module.to_out[0] + + # We don't _have_ to delete the old attributes, but it's helpful to ensure + # that _all_ the weights are loaded into the new attributes and we're not + # making an incorrect assumption that this model should be converted when + # it really shouldn't be. + del module.to_q + del module.to_k + del module.to_v + del module.to_out + + def _undo_temp_convert_self_to_deprecated_attention_blocks(self): + deprecated_attention_block_modules = [] + + def recursive_find_attn_block(module): + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_modules.append(module) + + for sub_module in module.children(): + recursive_find_attn_block(sub_module) + + recursive_find_attn_block(self) + + for module in deprecated_attention_block_modules: + module.to_q = module.query + module.to_k = module.key + module.to_v = module.value + module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)]) + + del module.query + del module.key + del module.value + del module.proj_attn diff --git a/tests/models/test_attention_processor.py b/tests/models/test_attention_processor.py index 172d6d4d91fc..f9b5924ca5e0 100644 --- a/tests/models/test_attention_processor.py +++ b/tests/models/test_attention_processor.py @@ -1,7 +1,10 @@ +import tempfile import unittest +import numpy as np import torch +from diffusers import DiffusionPipeline from diffusers.models.attention_processor import Attention, AttnAddedKVProcessor @@ -73,3 +76,44 @@ def test_only_cross_attention(self): only_cross_attn_out = attn(**forward_args) self.assertTrue((only_cross_attn_out != self_and_cross_attn_out).all()) + + +class DeprecatedAttentionBlockTests(unittest.TestCase): + def test_conversion_when_using_device_map(self): + pipe = DiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None) + + pre_conversion = pipe( + "foo", + num_inference_steps=2, + generator=torch.Generator("cpu").manual_seed(0), + output_type="np", + ).images + + # the initial conversion succeeds + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-pipe", device_map="sequential", safety_checker=None + ) + + conversion = pipe( + "foo", + num_inference_steps=2, + 
generator=torch.Generator("cpu").manual_seed(0), + output_type="np", + ).images + + with tempfile.TemporaryDirectory() as tmpdir: + # save the converted model + pipe.save_pretrained(tmpdir) + + # can also load the converted weights + pipe = DiffusionPipeline.from_pretrained(tmpdir, device_map="sequential", safety_checker=None) + + after_conversion = pipe( + "foo", + num_inference_steps=2, + generator=torch.Generator("cpu").manual_seed(0), + output_type="np", + ).images + + self.assertTrue(np.allclose(pre_conversion, conversion)) + self.assertTrue(np.allclose(conversion, after_conversion)) From f7cc9adc05c36985c8b3f3e95a54940ddac5b4ca Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Fri, 16 Jun 2023 13:19:37 -0700 Subject: [PATCH 138/199] [docs] Zero SNR (#3776) * add zero snr doc * fix image link * apply feedback * separate page --- docs/source/en/_toctree.yml | 2 + .../stable_diffusion/stable_diffusion_2.mdx | 4 +- docs/source/en/api/schedulers/ddim.mdx | 4 +- .../en/using-diffusers/control_brightness.mdx | 45 +++++++++++++++++++ .../test_stable_diffusion_v_pred.py | 2 +- 5 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 docs/source/en/using-diffusers/control_brightness.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index d85f715203bc..be85ce592f38 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -50,6 +50,8 @@ title: Distributed inference with multiple GPUs - local: using-diffusers/reusing_seeds title: Improve image quality with deterministic generation + - local: using-diffusers/control_brightness + title: Control image brightness - local: using-diffusers/reproducibility title: Create reproducible pipelines - local: using-diffusers/custom_pipeline_examples diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx index 7162626ebbde..6ba805cf445d 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_2.mdx @@ -101,7 +101,7 @@ Continue fine-tuning a checkpoint with [`train_text_to_image.py`](https://github and `--prediction_type="v_prediction"`. - (3) change the sampler to always start from the last timestep; ```py -pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_scaling="trailing") +pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") ``` - (4) rescale classifier-free guidance to prevent over-exposure. ```py @@ -118,7 +118,7 @@ from diffusers import DiffusionPipeline, DDIMScheduler pipe = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", torch_dtype=torch.float16) pipe.scheduler = DDIMScheduler.from_config( - pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_scaling="trailing" + pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing" ) pipe.to("cuda") diff --git a/docs/source/en/api/schedulers/ddim.mdx b/docs/source/en/api/schedulers/ddim.mdx index 0db5e4f4e2b5..2e69fd672cfa 100644 --- a/docs/source/en/api/schedulers/ddim.mdx +++ b/docs/source/en/api/schedulers/ddim.mdx @@ -59,7 +59,7 @@ Continue fine-tuning a checkpoint with [`train_text_to_image.py`](https://github and `--prediction_type="v_prediction"`. 
- (3) change the sampler to always start from the last timestep; ```py -pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_scaling="trailing") +pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") ``` - (4) rescale classifier-free guidance to prevent over-exposure. ```py @@ -76,7 +76,7 @@ from diffusers import DiffusionPipeline, DDIMScheduler pipe = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", torch_dtype=torch.float16) pipe.scheduler = DDIMScheduler.from_config( - pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_scaling="trailing" + pipe.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing" ) pipe.to("cuda") diff --git a/docs/source/en/using-diffusers/control_brightness.mdx b/docs/source/en/using-diffusers/control_brightness.mdx new file mode 100644 index 000000000000..91ba8692028f --- /dev/null +++ b/docs/source/en/using-diffusers/control_brightness.mdx @@ -0,0 +1,45 @@ +# Control image brightness + +The Stable Diffusion pipeline is mediocre at generating images that are either very bright or dark as explained in the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper. The solutions proposed in the paper are currently implemented in the [`DDIMScheduler`] which you can use to improve the lighting in your images. + + + +💡 Take a look at the paper linked above for more details about the proposed solutions! + + + +One of the solutions is to train a model with *v prediction* and *v loss*. Add the following flag to the [`train_text_to_image.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [`train_text_to_image_lora.py`](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts to enable `v_prediction`: + +```bash +--prediction_type="v_prediction" +``` + +For example, let's use the [`ptx0/pseudo-journey-v2`](https://huggingface.co/ptx0/pseudo-journey-v2) checkpoint which has been finetuned with `v_prediction`. + +Next, configure the following parameters in the [`DDIMScheduler`]: + +1. `rescale_betas_zero_snr=True`, rescales the noise schedule to zero terminal signal-to-noise ratio (SNR) +2. `timestep_spacing="trailing"`, starts sampling from the last timestep + +```py +>>> from diffusers import DiffusionPipeline, DDIMScheduler + +>>> pipeline = DiffusioPipeline.from_pretrained("ptx0/pseudo-journey-v2") +# switch the scheduler in the pipeline to use the DDIMScheduler + +>>> pipeline.scheduler = DDIMScheduler.from_config( +... pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing" +... ) +>>> pipeline.to("cuda") +``` + +Finally, in your call to the pipeline, set `guidance_rescale` to prevent overexposure: + +```py +prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" +image = pipeline(prompt, guidance_rescale=0.7).images[0] +``` + +
+<!-- example output image -->
+
\ No newline at end of file diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index 21862ba6a216..67486e61dbef 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -392,7 +392,7 @@ def test_stable_diffusion_text2img_pipeline_unflawed(self): pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1") pipe.scheduler = DDIMScheduler.from_config( - pipe.scheduler.config, timestep_scaling="trailing", rescale_betas_zero_snr=True + pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True ) pipe.to(torch_device) pipe.enable_attention_slicing() From cd460ca2f5107ca6053e27c6e92f0aee2248627c Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Sat, 17 Jun 2023 09:10:42 +0530 Subject: [PATCH 139/199] Remove xformers, refactor ckpt resuming --- examples/consistency_models/script.sh | 2 +- .../train_consistency_distillation.py | 40 ++++++------------- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/examples/consistency_models/script.sh b/examples/consistency_models/script.sh index 5b5f6b4710e9..dfe6bcc5b7a0 100644 --- a/examples/consistency_models/script.sh +++ b/examples/consistency_models/script.sh @@ -1,3 +1,3 @@ #!/bin/bash -accelerate launch train_consistency_distillation.py --dataset_name="huggan/flowers-102-categories" --resolution=64 --center_crop --random_flip --output_dir="ddpm-ema-flowers-64" --train_batch_size=16 --num_epochs=100 --gradient_accumulation_steps=1 --use_ema --learning_rate=1e-4 --lr_warmup_steps=500 --mixed_precision=no --push_to_hub \ No newline at end of file +accelerate launch train_consistency_distillation.py --dataset_name="cifar10" --resolution=32 --center_crop --random_flip --output_dir="cifar10-32" --train_batch_size=16 --num_epochs=100 --gradient_accumulation_steps=1 --learning_rate=1e-4 --lr_warmup_steps=500 --mixed_precision=no --push_to_hub \ No newline at end of file diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index a6d131bf7290..2ffcd729887e 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -24,7 +24,6 @@ from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available -from diffusers.utils.import_utils import is_xformers_available #Copied from examples/unconditional_image_generation/train_unconditional.py for now @@ -258,9 +257,6 @@ def parse_args(): ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.' ), ) - parser.add_argument( - "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
- ) args = parser.parse_args() env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) @@ -436,19 +432,6 @@ def load_model_hook(models, input_dir): model_config=model.config, ) - if args.enable_xformers_memory_efficient_attention: - if is_xformers_available(): - import xformers - - xformers_version = version.parse(xformers.__version__) - if xformers_version == version.parse("0.0.16"): - logger.warn( - "xFormers 0.0.16 cannot be used for training in some GPUs. If you observe problems during training, please update xFormers to at least 0.0.17. See https://huggingface.co/docs/diffusers/main/en/optimization/xformers for more details." - ) - model.enable_xformers_memory_efficient_attention() - else: - raise ValueError("xformers is not available. Make sure it is installed correctly") - # Initialize the optimizer optimizer = torch.optim.AdamW( model.parameters(), @@ -554,22 +537,25 @@ def transform_images(examples): accelerator.load_state(os.path.join(args.output_dir, path)) global_step = int(path.split("-")[1]) - resume_global_step = global_step * args.gradient_accumulation_steps + initial_global_step = global_step * args.gradient_accumulation_steps first_epoch = global_step // num_update_steps_per_epoch - resume_step = resume_global_step % (num_update_steps_per_epoch * args.gradient_accumulation_steps) + else: + initial_global_step = 0 + + progress_bar = tqdm( + range(0, max_train_steps), + initial=initial_global_step, + desc="Steps", + # Only show the progress bar once on each machine. + disable=not accelerator.is_local_main_process, + ) + + # Train! for epoch in range(first_epoch, args.num_epochs): model.train() - progress_bar = tqdm(total=num_update_steps_per_epoch, disable=not accelerator.is_local_main_process) - progress_bar.set_description(f"Epoch {epoch}") for step, batch in enumerate(train_dataloader): - # Skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step: - if step % args.gradient_accumulation_steps == 0: - progress_bar.update(1) - continue - clean_images = batch["input"] labels = batch["labels"] # Sample noise that we'll add to the images From 9a816425914dd6db6e7ab4baba9dbbcdcf9a75e4 Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Mon, 19 Jun 2023 12:11:56 +0530 Subject: [PATCH 140/199] Add input scaling, disable gradients --- .../train_consistency_distillation.py | 59 ++++++++++--------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index 2ffcd729887e..99f7d06fa097 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -412,9 +412,9 @@ def load_model_hook(models, input_dir): # load the model to distill into a consistency model teacher_model = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32").unet - model = model.double() - target_model = target_model.double() # TODO : support half precision training - teacher_model = teacher_model.double() + model = model.float() + target_model = target_model.float() # TODO : support half precision training + teacher_model = teacher_model.float() noise_scheduler = CMStochasticIterativeScheduler() num_scales = 40 noise_scheduler.set_timesteps(num_scales) @@ -566,42 +566,43 @@ def transform_images(examples): ).long() timestep = timesteps[index] timestep_prev = timestep + 1 - # TODO, we should have 
an add noise in the scheduler maybe? - noised_image = clean_images + noise*append_dims(timestep, clean_images.ndim) + noised_image = noise_scheduler.add_noise(clean_images, noise, timestep) scaled_timesteps = noise_scheduler.scale_timestep(timestep) scaled_timesteps_prev = noise_scheduler.scale_timestep(timestep_prev) target_model_ema.copy_to(target_model.parameters()) with accelerator.accumulate(model): # Predict the noise residual - model_output = model(noised_image, scaled_timesteps, class_labels=labels).sample + + model_output = model(noise_scheduler.scale_model_input(noised_image, timestep), scaled_timesteps, class_labels=labels).sample distiller = noise_scheduler.step( model_output, timestep, noised_image, use_noise=False ).prev_sample - # Heun Solver to get previous timestep image using teacher model - # TODO - make this cleaner - samples = noised_image - x = samples - model_output = teacher_model(x, scaled_timesteps, class_labels=labels).sample - teacher_denoiser = noise_scheduler.step( - model_output, timestep, x, use_noise=False - ).prev_sample - d = (x - teacher_denoiser) / append_dims(scaled_timesteps, x.ndim) - samples = x + d * append_dims(scaled_timesteps_prev - scaled_timesteps, x.ndim) - model_output = teacher_model(samples, scaled_timesteps_prev, class_labels=labels).sample - teacher_denoiser = noise_scheduler.step( - model_output, timestep_prev, samples, use_noise=False - ).prev_sample - - next_d = (samples - teacher_denoiser) / append_dims(scaled_timesteps_prev, x.ndim) - denoised_image = x + (d + next_d) * append_dims((scaled_timesteps_prev - scaled_timesteps) /2, x.ndim) - - # get output from target model - model_output = target_model(denoised_image, scaled_timesteps_prev, class_labels=labels).sample - distiller_target = noise_scheduler.step( - model_output, timestep_prev, denoised_image, use_noise=False - ).prev_sample + with torch.no_grad(): + # Heun Solver to get previous timestep image using teacher model + # TODO - make this cleaner + samples = noised_image + x = samples + model_output = teacher_model(noise_scheduler.scale_model_input(x, timestep), scaled_timesteps, class_labels=labels).sample + teacher_denoiser = noise_scheduler.step( + model_output, timestep, x, use_noise=False + ).prev_sample + d = (x - teacher_denoiser) / append_dims(scaled_timesteps, x.ndim) + samples = x + d * append_dims(scaled_timesteps_prev - scaled_timesteps, x.ndim) + model_output = teacher_model(noise_scheduler.scale_model_input(samples, timestep_prev), scaled_timesteps_prev, class_labels=labels).sample + teacher_denoiser = noise_scheduler.step( + model_output, timestep_prev, samples, use_noise=False + ).prev_sample + + next_d = (samples - teacher_denoiser) / append_dims(scaled_timesteps_prev, x.ndim) + denoised_image = x + (d + next_d) * append_dims((scaled_timesteps_prev - scaled_timesteps) /2, x.ndim) + + # get output from target model + model_output = target_model(denoised_image, scaled_timesteps_prev, class_labels=labels).sample + distiller_target = noise_scheduler.step( + model_output, timestep_prev, denoised_image, use_noise=False + ).prev_sample loss = F.mse_loss(distiller, distiller_target) loss = loss.mean() From 666743302ff5bd1e02c204b81a80e566648d60de Mon Sep 17 00:00:00 2001 From: estelleafl Date: Mon, 19 Jun 2023 18:38:02 +0300 Subject: [PATCH 141/199] [ldm3d] Fixed small typo (#3820) * fixed typo * updated doc to be consistent in naming * make style/quality --------- Co-authored-by: Aflalo --- .../en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx | 4 ++-- 
.../stable_diffusion/pipeline_stable_diffusion_ldm3d.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx index ca5798d93a8e..d311fdb5f4f6 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.mdx @@ -35,9 +35,9 @@ Running LDM3D is straighforward with the [`StableDiffusionLDM3DPipeline`]: ```python >>> from diffusers import StableDiffusionLDM3DPipeline ->>> pipe_ldm3d = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d") +>>> pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d") prompt ="A picture of some lemons on a table" -output = pipe_ldm3d(prompt) +output = pipe(prompt) rgb_image, depth_image = output.rgb, output.depth rgb_image[0].save("lemons_ldm3d_rgb.jpg") depth_image[0].save("lemons_ldm3d_depth.png") diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py index c804d2f1918b..2df9c46f0be3 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py @@ -49,7 +49,7 @@ >>> pipe = pipe.to("cuda") >>> prompt = "a photo of an astronaut riding a horse on mars" - >>> output = pipe_ldm3d(prompt) + >>> output = pipe(prompt) >>> rgb_image, depth_image = output.rgb, output.depth ``` """ From 4870626728ac6bad424a37d6f45c40082ca6a602 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 20 Jun 2023 08:59:41 +0530 Subject: [PATCH 142/199] [Examples] Improve the model card pushed from the `train_text_to_image.py` script (#3810) * refactor: readme serialized from the example when push_to_hub is True. * fix: batch size arg. * a bit better formatting * minor fixes. * add note on env. * Apply suggestions from code review Co-authored-by: Pedro Cuenca * condition wandb info better * make mixed_precision assignment in cli args explicit. * separate inference block for sample images. * Apply suggestions from code review Co-authored-by: Pedro Cuenca * address more comments. * autocast mode. * correct none image type problem. * ifx: list assignment. * minor fix. --------- Co-authored-by: Pedro Cuenca --- examples/text_to_image/README.md | 12 +- examples/text_to_image/train_text_to_image.py | 113 ++++++++++++++++++ 2 files changed, 119 insertions(+), 6 deletions(-) diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index a7deec1f8fe8..62dd77617053 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -55,11 +55,11 @@ With `gradient_checkpointing` and `mixed_precision` it should be possible to fin ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export dataset_name="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" accelerate launch --mixed_precision="fp16" train_text_to_image.py \ --pretrained_model_name_or_path=$MODEL_NAME \ - --dataset_name=$dataset_name \ + --dataset_name=$DATASET_NAME \ --use_ema \ --resolution=512 --center_crop --random_flip \ --train_batch_size=1 \ @@ -133,11 +133,11 @@ for running distributed training with `accelerate`. 
Here is an example command: ```bash export MODEL_NAME="CompVis/stable-diffusion-v1-4" -export dataset_name="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" accelerate launch --mixed_precision="fp16" --multi_gpu train_text_to_image.py \ --pretrained_model_name_or_path=$MODEL_NAME \ - --dataset_name=$dataset_name \ + --dataset_name=$DATASET_NAME \ --use_ema \ --resolution=512 --center_crop --random_flip \ --train_batch_size=1 \ @@ -274,11 +274,11 @@ pip install -U -r requirements_flax.txt ```bash export MODEL_NAME="duongna/stable-diffusion-v1-4-flax" -export dataset_name="lambdalabs/pokemon-blip-captions" +export DATASET_NAME="lambdalabs/pokemon-blip-captions" python train_text_to_image_flax.py \ --pretrained_model_name_or_path=$MODEL_NAME \ - --dataset_name=$dataset_name \ + --dataset_name=$DATASET_NAME \ --resolution=512 --center_crop --random_flip \ --train_batch_size=1 \ --mixed_precision="fp16" \ diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 2ec2702e439a..8f4cdba2bfb6 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -35,6 +35,7 @@ from datasets import load_dataset from huggingface_hub import create_repo, upload_folder from packaging import version +from PIL import Image from torchvision import transforms from tqdm.auto import tqdm from transformers import CLIPTextModel, CLIPTokenizer @@ -62,6 +63,92 @@ } +def make_image_grid(imgs, rows, cols): + assert len(imgs) == rows * cols + + w, h = imgs[0].size + grid = Image.new("RGB", size=(cols * w, rows * h)) + + for i, img in enumerate(imgs): + grid.paste(img, box=(i % cols * w, i // cols * h)) + return grid + + +def save_model_card( + args, + repo_id: str, + images=None, + repo_folder=None, +): + img_str = "" + if len(images) > 0: + image_grid = make_image_grid(images, 1, len(args.validation_prompts)) + image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png")) + img_str += "![val_imgs_grid](./val_imgs_grid.png)\n" + + yaml = f""" +--- +license: creativeml-openrail-m +base_model: {args.pretrained_model_name_or_path} +datasets: +- {args.dataset_name} +tags: +- stable-diffusion +- stable-diffusion-diffusers +- text-to-image +- diffusers +inference: true +--- + """ + model_card = f""" +# Text-to-image finetuning - {repo_id} + +This pipeline was finetuned from **{args.pretrained_model_name_or_path}** on the **{args.dataset_name}** dataset. 
Below are some example images generated with the finetuned pipeline using the following prompts: {args.validation_prompts}: \n +{img_str} + +## Pipeline usage + +You can use the pipeline like so: + +```python +from diffusers import DiffusionPipeline +import torch + +pipeline = DiffusionPipeline.from_pretrained("{repo_id}", torch_dtype=torch.float16) +prompt = "{args.validation_prompts[0]}" +image = pipeline(prompt).images[0] +image.save("my_image.png") +``` + +## Training info + +These are the key hyperparameters used during training: + +* Epochs: {args.num_train_epochs} +* Learning rate: {args.learning_rate} +* Batch size: {args.train_batch_size} +* Gradient accumulation steps: {args.gradient_accumulation_steps} +* Image resolution: {args.resolution} +* Mixed-precision: {args.mixed_precision} + +""" + wandb_info = "" + if is_wandb_available(): + wandb_run_url = None + if wandb.run is not None: + wandb_run_url = wandb.run.url + + if wandb_run_url is not None: + wandb_info = f""" +More information on all the CLI arguments and the environment are available on your [`wandb` run page]({wandb_run_url}). +""" + + model_card += wandb_info + + with open(os.path.join(repo_folder, "README.md"), "w") as f: + f.write(yaml + model_card) + + def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight_dtype, epoch): logger.info("Running validation... ") @@ -112,6 +199,8 @@ def log_validation(vae, text_encoder, tokenizer, unet, args, accelerator, weight del pipeline torch.cuda.empty_cache() + return images + def parse_args(): parser = argparse.ArgumentParser(description="Simple example of a training script.") @@ -747,8 +836,10 @@ def collate_fn(examples): weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": weight_dtype = torch.float16 + args.mixed_precision = accelerator.mixed_precision elif accelerator.mixed_precision == "bf16": weight_dtype = torch.bfloat16 + args.mixed_precision = accelerator.mixed_precision # Move text_encode and vae to gpu and cast to weight_dtype text_encoder.to(accelerator.device, dtype=weight_dtype) @@ -970,7 +1061,29 @@ def collate_fn(examples): ) pipeline.save_pretrained(args.output_dir) + # Run a final round of inference. + images = [] + if args.validation_prompts is not None: + logger.info("Running inference for collecting generated images...") + pipeline = pipeline.to(accelerator.device) + pipeline.torch_dtype = weight_dtype + pipeline.set_progress_bar_config(disable=True) + + if args.enable_xformers_memory_efficient_attention: + pipeline.enable_xformers_memory_efficient_attention() + + if args.seed is None: + generator = None + else: + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) + + for i in range(len(args.validation_prompts)): + with torch.autocast("cuda"): + image = pipeline(args.validation_prompts[i], num_inference_steps=20, generator=generator).images[0] + images.append(image) + if args.push_to_hub: + save_model_card(args, repo_id, images, repo_folder=args.output_dir) upload_folder( repo_id=repo_id, folder_path=args.output_dir, From 88eb04489d9083fee6b90f8619d2ae0a441e17d3 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 20 Jun 2023 11:15:21 +0530 Subject: [PATCH 143/199] [Docs] add missing pipelines from the overview pages and minor fixes (#3795) * add entry for safe stable diffusion to the sd overview page. * add missing pipelines o the broader overview section in the pipelines. 
* address PR feedback./ --- docs/source/en/api/pipelines/overview.mdx | 24 ++++++++++--------- .../pipelines/stable_diffusion/overview.mdx | 22 ++++++++--------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/docs/source/en/api/pipelines/overview.mdx b/docs/source/en/api/pipelines/overview.mdx index 0ae3d897a3b1..6c6a85d697b9 100644 --- a/docs/source/en/api/pipelines/overview.mdx +++ b/docs/source/en/api/pipelines/overview.mdx @@ -54,6 +54,9 @@ available a colab notebook to directly try them out. | [if](./if) | [**IF**](https://github.com/deep-floyd/IF) | Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb) | [if_img2img](./if) | [**IF**](https://github.com/deep-floyd/IF) | Image-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb) | [if_inpainting](./if) | [**IF**](https://github.com/deep-floyd/IF) | Image-to-Image Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/deepfloyd_if_free_tier_google_colab.ipynb) +| [kandinsky](./kandinsky) | **Kandinsky** | Text-to-Image Generation | +| [kandinsky_inpaint](./kandinsky) | **Kandinsky** | Image-to-Image Generation | +| [kandinsky_img2img](./kandinsky) | **Kandinsksy** | Image-to-Image Generation | | [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Text-to-Image Generation | | [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | | [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | @@ -72,21 +75,20 @@ available a colab notebook to directly try them out. 
| [stable_diffusion_self_attention_guidance](./stable_diffusion/self_attention_guidance) | [**Self-Attention Guidance**](https://arxiv.org/abs/2210.00939) | Text-to-Image Generation | | [stable_diffusion_image_variation](./stable_diffusion/image_variation) | [**Stable Diffusion Image Variations**](https://github.com/LambdaLabsML/lambda-diffusers#stable-diffusion-image-variations) | Image-to-Image Generation | | [stable_diffusion_latent_upscale](./stable_diffusion/latent_upscale) | [**Stable Diffusion Latent Upscaler**](https://twitter.com/StabilityAI/status/1590531958815064065) | Text-Guided Super Resolution Image-to-Image | -| [stable_diffusion_2](./stable_diffusion_2/) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-to-Image Generation | -| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | -| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Depth-to-Image Text-Guided Generation | -| [stable_diffusion_2](./stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image | +| [stable_diffusion_2](./stable_diffusion/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Image Inpainting | +| [stable_diffusion_2](./stable_diffusion/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Depth-to-Image Text-Guided Generation | +| [stable_diffusion_2](./stable_diffusion/stable_diffusion_2) | [**Stable Diffusion 2**](https://stability.ai/blog/stable-diffusion-v2-release) | Text-Guided Super Resolution Image-to-Image | | [stable_diffusion_safe](./stable_diffusion_safe) | [**Safe Stable Diffusion**](https://arxiv.org/abs/2211.05105) | Text-Guided Generation | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | [stable_unclip](./stable_unclip) | **Stable unCLIP** | Text-to-Image Generation | | [stable_unclip](./stable_unclip) | **Stable unCLIP** | Image-to-Image Text-Guided Generation | | [stochastic_karras_ve](./stochastic_karras_ve) | [**Elucidating the Design Space of Diffusion-Based Generative Models**](https://arxiv.org/abs/2206.00364) | Unconditional Image Generation | -| [text_to_video_sd](./api/pipelines/text_to_video) | [Modelscope's Text-to-video-synthesis Model in Open Domain](https://modelscope.cn/models/damo/text-to-video-synthesis/summary) | Text-to-Video Generation | -| [unclip](./unclip) | [Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation | -| [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | -| [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Image Variations Generation | -| [versatile_diffusion](./versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | -| 
[vq_diffusion](./vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | -| [text_to_video_zero](./text_to_video_zero) | [Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators](https://arxiv.org/abs/2303.13439) | Text-to-Video Generation | +| [text_to_video_sd](./api/pipelines/text_to_video) | [**Modelscope's Text-to-video-synthesis Model in Open Domain**](https://modelscope.cn/models/damo/text-to-video-synthesis/summary) | Text-to-Video Generation | +| [unclip](./unclip) | [**Hierarchical Text-Conditional Image Generation with CLIP Latents](https://arxiv.org/abs/2204.06125) | Text-to-Image Generation | +| [versatile_diffusion](./versatile_diffusion) | [**Versatile Diffusion: Text, Images and Variations All in One Diffusion Model**](https://arxiv.org/abs/2211.08332) | Text-to-Image Generation | +| [versatile_diffusion](./versatile_diffusion) | [**Versatile Diffusion: Text, Images and Variations All in One Diffusion Model**](https://arxiv.org/abs/2211.08332) | Image Variations Generation | +| [versatile_diffusion](./versatile_diffusion) | [**Versatile Diffusion: Text, Images and Variations All in One Diffusion Model**](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation | +| [vq_diffusion](./vq_diffusion) | [**Vector Quantized Diffusion Model for Text-to-Image Synthesis**](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation | +| [text_to_video_zero](./text_to_video_zero) | [**Text2Video-Zero: Text-to-Image Diffusion Models are Zero-Shot Video Generators**](https://arxiv.org/abs/2303.13439) | Text-to-Video Generation | **Note**: Pipelines are simple examples of how to play around with the diffusion systems as described in the corresponding papers. 
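As a brief editorial aside to the pipeline tables above (the checkpoint id and prompt below are illustrative, not taken from the patch): every entry in the catalogue can be loaded through the generic `DiffusionPipeline.from_pretrained` entry point, which reads a checkpoint's `model_index.json` and instantiates the matching task-specific class, so a reader can try any listed pipeline without knowing its concrete class name up front.

```python
import torch
from diffusers import DiffusionPipeline

# The generic entry point resolves the concrete pipeline class
# (e.g. StableDiffusionPipeline) from the checkpoint's model_index.json.
pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
pipe = pipe.to("cuda")

image = pipe("a photo of an astronaut riding a horse on mars").images[0]
image.save("astronaut.png")
```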
diff --git a/docs/source/en/api/pipelines/stable_diffusion/overview.mdx b/docs/source/en/api/pipelines/stable_diffusion/overview.mdx index a163b57f2a84..5f1a6a4aad5d 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/overview.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/overview.mdx @@ -26,19 +26,17 @@ For more details about how Stable Diffusion works and how it differs from the ba | Pipeline | Tasks | Colab | Demo |---|---|:---:|:---:| | [StableDiffusionPipeline](./text2img) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) | [🤗 Stable Diffusion](https://huggingface.co/spaces/stabilityai/stable-diffusion) +| [StableDiffusionPipelineSafe](./stable_diffusion_safe) | *Text-to-Image Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ml-research/safe-latent-diffusion/blob/main/examples/Safe%20Latent%20Diffusion.ipynb) | [![Huggingface Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/AIML-TUDA/unsafe-vs-safe-stable-diffusion) | [StableDiffusionImg2ImgPipeline](./img2img) | *Image-to-Image Text-Guided Generation* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/image_2_image_using_diffusers.ipynb) | [🤗 Diffuse the Rest](https://huggingface.co/spaces/huggingface/diffuse-the-rest) -| [StableDiffusionInpaintPipeline](./inpaint) | **Experimental** – *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) | Coming soon -| [StableDiffusionDepth2ImgPipeline](./depth2img) | **Experimental** – *Depth-to-Image Text-Guided Generation * | | Coming soon -| [StableDiffusionImageVariationPipeline](./image_variation) | **Experimental** – *Image Variation Generation * | | [🤗 Stable Diffusion Image Variations](https://huggingface.co/spaces/lambdalabs/stable-diffusion-image-variations) -| [StableDiffusionUpscalePipeline](./upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon -| [StableDiffusionLatentUpscalePipeline](./latent_upscale) | **Experimental** – *Text-Guided Image Super-Resolution * | | Coming soon -| [StableDiffusionInstructPix2PixPipeline](./pix2pix) | **Experimental** – *Text-Based Image Editing * | | [InstructPix2Pix: Learning to Follow Image Editing Instructions](https://huggingface.co/spaces/timbrooks/instruct-pix2pix) -| [StableDiffusionAttendAndExcitePipeline](./attend_and_excite) | **Experimental** – *Text-to-Image Generation * | | [Attend-and-Excite: Attention-Based Semantic Guidance for Text-to-Image Diffusion Models](https://huggingface.co/spaces/AttendAndExcite/Attend-and-Excite) -| [StableDiffusionPix2PixZeroPipeline](./pix2pix_zero) | **Experimental** – *Text-Based Image Editing * | | [Zero-shot Image-to-Image Translation](https://arxiv.org/abs/2302.03027) -| [StableDiffusionModelEditingPipeline](./model_editing) | **Experimental** – *Text-to-Image Model Editing * | | [Editing Implicit Assumptions in Text-to-Image Diffusion Models](https://arxiv.org/abs/2303.08084) -| [StableDiffusionDiffEditPipeline](./diffedit) | **Experimental** – 
*Text-Based Image Editing * | | [DiffEdit: Diffusion-based semantic image editing with mask guidance](https://arxiv.org/abs/2210.11427) - - +| [StableDiffusionInpaintPipeline](./inpaint) | **Experimental** – *Text-Guided Image Inpainting* | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/in_painting_with_stable_diffusion_using_diffusers.ipynb) | +| [StableDiffusionDepth2ImgPipeline](./depth2img) | **Experimental** – *Depth-to-Image Text-Guided Generation* | | +| [StableDiffusionImageVariationPipeline](./image_variation) | **Experimental** – *Image Variation Generation* | | [🤗 Stable Diffusion Image Variations](https://huggingface.co/spaces/lambdalabs/stable-diffusion-image-variations) +| [StableDiffusionUpscalePipeline](./upscale) | **Experimental** – *Text-Guided Image Super-Resolution* | | +| [StableDiffusionLatentUpscalePipeline](./latent_upscale) | **Experimental** – *Text-Guided Image Super-Resolution* | | +| [Stable Diffusion 2](./stable_diffusion_2) | *Text-Guided Image Inpainting* | +| [Stable Diffusion 2](./stable_diffusion_2) | *Depth-to-Image Text-Guided Generation* | +| [Stable Diffusion 2](./stable_diffusion_2) | *Text-Guided Super Resolution Image-to-Image* | +| [StableDiffusionLDM3DPipeline](./ldm3d) | *Text-to-(RGB, Depth)* | ## Tips From 73b125df68183279b90d21200c9a712b05e80083 Mon Sep 17 00:00:00 2001 From: Andy Shih Date: Tue, 20 Jun 2023 02:34:26 -0700 Subject: [PATCH 144/199] [Pipeline] Add new pipeline for ParaDiGMS -- parallel sampling of diffusion models (#3716) * add paradigms parallel sampling pipeline * linting * ran make fix-copies * add paradigms parallel sampling pipeline * linting * ran make fix-copies * Apply suggestions from code review Co-authored-by: Sayak Paul * changes based on review * add docs for paradigms * update docs with paradigms abstract * improve documentation, and add tests for ddim/ddpm batch_step_no_noise * fix docs and run make fix-copies * minor changes to docs. 
* Apply suggestions from code review Co-authored-by: Patrick von Platen * move parallel scheduler to new classes for DDPMParallelScheduler and DDIMParallelScheduler * remove changes for scheduling_ddim, adjust licenses, credits, and commented code * fix tensor type that is breaking tests --------- Co-authored-by: Sayak Paul Co-authored-by: Patrick von Platen --- docs/source/en/_toctree.yml | 2 + docs/source/en/api/pipelines/overview.mdx | 1 + docs/source/en/api/pipelines/paradigms.mdx | 81 ++ src/diffusers/__init__.py | 3 + src/diffusers/pipelines/__init__.py | 1 + .../pipelines/stable_diffusion/__init__.py | 1 + .../pipeline_stable_diffusion_paradigms.py | 830 ++++++++++++++++++ src/diffusers/schedulers/__init__.py | 2 + .../schedulers/scheduling_ddim_parallel.py | 620 +++++++++++++ .../schedulers/scheduling_ddpm_parallel.py | 558 ++++++++++++ src/diffusers/utils/dummy_pt_objects.py | 30 + .../dummy_torch_and_transformers_objects.py | 15 + .../test_stable_diffusion_paradigms.py | 227 +++++ .../test_scheduler_ddim_parallel.py | 188 ++++ .../test_scheduler_ddpm_parallel.py | 216 +++++ tests/schedulers/test_schedulers.py | 6 + 16 files changed, 2781 insertions(+) create mode 100644 docs/source/en/api/pipelines/paradigms.mdx create mode 100644 src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py create mode 100644 src/diffusers/schedulers/scheduling_ddim_parallel.py create mode 100644 src/diffusers/schedulers/scheduling_ddpm_parallel.py create mode 100644 tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py create mode 100644 tests/schedulers/test_scheduler_ddim_parallel.py create mode 100644 tests/schedulers/test_scheduler_ddpm_parallel.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index be85ce592f38..a8884ac7dbe9 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -188,6 +188,8 @@ title: MultiDiffusion Panorama - local: api/pipelines/paint_by_example title: PaintByExample + - local: api/pipelines/paradigms + title: Parallel Sampling of Diffusion Models - local: api/pipelines/pix2pix_zero title: Pix2Pix Zero - local: api/pipelines/pndm diff --git a/docs/source/en/api/pipelines/overview.mdx b/docs/source/en/api/pipelines/overview.mdx index 6c6a85d697b9..693c32565c46 100644 --- a/docs/source/en/api/pipelines/overview.mdx +++ b/docs/source/en/api/pipelines/overview.mdx @@ -61,6 +61,7 @@ available a colab notebook to directly try them out. 
| [latent_diffusion](./latent_diffusion) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752)| Super Resolution Image-to-Image | | [latent_diffusion_uncond](./latent_diffusion_uncond) | [**High-Resolution Image Synthesis with Latent Diffusion Models**](https://arxiv.org/abs/2112.10752) | Unconditional Image Generation | | [paint_by_example](./paint_by_example) | [**Paint by Example: Exemplar-based Image Editing with Diffusion Models**](https://arxiv.org/abs/2211.13227) | Image-Guided Image Inpainting | +| [paradigms](./paradigms) | [**Parallel Sampling of Diffusion Models**](https://arxiv.org/abs/2305.16317) | Text-to-Image Generation | | [pndm](./pndm) | [**Pseudo Numerical Methods for Diffusion Models on Manifolds**](https://arxiv.org/abs/2202.09778) | Unconditional Image Generation | | [score_sde_ve](./score_sde_ve) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | | [score_sde_vp](./score_sde_vp) | [**Score-Based Generative Modeling through Stochastic Differential Equations**](https://openreview.net/forum?id=PxTIG12RRHS) | Unconditional Image Generation | diff --git a/docs/source/en/api/pipelines/paradigms.mdx b/docs/source/en/api/pipelines/paradigms.mdx new file mode 100644 index 000000000000..ec399e6b96d4 --- /dev/null +++ b/docs/source/en/api/pipelines/paradigms.mdx @@ -0,0 +1,81 @@ + + +# Parallel Sampling of Diffusion Models (ParaDiGMS) + +## Overview + +[Parallel Sampling of Diffusion Models](https://arxiv.org/abs/2305.16317) by Andy Shih, Suneel Belkhale, Stefano Ermon, Dorsa Sadigh, Nima Anari. + +The abstract of the paper is the following: + +*Diffusion models are powerful generative models but suffer from slow sampling, often taking 1000 sequential denoising steps for one sample. As a result, considerable efforts have been directed toward reducing the number of denoising steps, but these methods hurt sample quality. Instead of reducing the number of denoising steps (trading quality for speed), in this paper we explore an orthogonal approach: can we run the denoising steps in parallel (trading compute for speed)? In spite of the sequential nature of the denoising steps, we show that surprisingly it is possible to parallelize sampling via Picard iterations, by guessing the solution of future denoising steps and iteratively refining until convergence. With this insight, we present ParaDiGMS, a novel method to accelerate the sampling of pretrained diffusion models by denoising multiple steps in parallel. ParaDiGMS is the first diffusion sampling method that enables trading compute for speed and is even compatible with existing fast sampling techniques such as DDIM and DPMSolver. Using ParaDiGMS, we improve sampling speed by 2-4x across a range of robotics and image generation models, giving state-of-the-art sampling speeds of 0.2s on 100-step DiffusionPolicy and 16s on 1000-step StableDiffusion-v2 with no measurable degradation of task reward, FID score, or CLIP score.* + +Resources: + +* [Paper](https://arxiv.org/abs/2305.16317). +* [Original Code](https://github.com/AndyShih12/paradigms). 
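To make the fixed-point view from the abstract concrete, here is a minimal, self-contained sketch of sampling via Picard iteration. It is only an illustration under stated assumptions (`denoise_step`, the constant initialization of the trajectory, and the simple convergence test are placeholders invented for this sketch); the actual pipeline in this PR instead uses a sliding window over timesteps, a single batched UNet call per sweep, pre-sampled noise for stochastic schedulers, and a tolerance scaled by the scheduler's noise magnitude.

```python
import torch


def sequential_solve(denoise_step, x_start, num_steps):
    # Standard sequential sampling: every step depends on the previous one.
    x = x_start
    for t in range(num_steps):
        x = x + denoise_step(x, t)
    return x


def picard_parallel_solve(denoise_step, x_start, num_steps, tol=1e-3, max_sweeps=50):
    # Guess the whole trajectory up front (here: every entry starts at x_start).
    xs = [x_start.clone() for _ in range(num_steps + 1)]
    for _ in range(max_sweeps):
        # Within one sweep, every drift only depends on the previous guess of the
        # trajectory, so the evaluations are independent and can be batched or
        # spread over multiple devices.
        drifts = [denoise_step(xs[t], t) for t in range(num_steps)]
        max_change = 0.0
        cumulative = torch.zeros_like(x_start)
        for t in range(num_steps):
            # Picard update: state at step t + 1 = start + cumulative drift so far.
            cumulative = cumulative + drifts[t]
            new_xt = xs[0] + cumulative
            max_change = max(max_change, (new_xt - xs[t + 1]).abs().max().item())
            xs[t + 1] = new_xt
        if max_change < tol:
            # Converged: another sweep would leave the trajectory unchanged.
            break
    return xs[num_steps]
```

Each sweep costs `num_steps` model evaluations, but they can all run concurrently, which is the compute-for-speed trade-off described in the abstract; after at most `num_steps` sweeps the result matches the sequential answer exactly.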
+ +## Available Pipelines: + +| Pipeline | Tasks | Demo +|---|---|:---:| +| [StableDiffusionParadigmsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py) | *Faster Text-to-Image Generation* | | + +## Usage example + +```python +import torch +from diffusers import DDPMParallelScheduler +from diffusers import StableDiffusionParadigmsPipeline + +scheduler = DDPMParallelScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler") + +pipe = StableDiffusionParadigmsPipeline.from_pretrained( + "runwayml/stable-diffusion-v1-5", scheduler=scheduler, torch_dtype=torch.float16 +) +pipe = pipe.to("cuda") + +ngpu, batch_per_device = torch.cuda.device_count(), 5 +pipe.wrapped_unet = torch.nn.DataParallel(pipe.unet, device_ids=[d for d in range(ngpu)]) + +prompt = "a photo of an astronaut riding a horse on mars" +image = pipe(prompt, parallel=ngpu * batch_per_device, num_inference_steps=1000).images[0] +``` + + +This pipeline improves sampling speed by running denoising steps in parallel, at the cost of increased total FLOPs. +Therefore, it is better to call this pipeline when running on multiple GPUs. Otherwise, without enough GPU bandwidth +sampling may be even slower than sequential sampling. + +The two parameters to play with are `parallel` (batch size) and `tolerance`. +- If it fits in memory, for 1000-step DDPM you can aim for a batch size of around 100 +(e.g. 8 GPUs and batch_per_device=12 to get parallel=96). Higher batch size +may not fit in memory, and lower batch size gives less parallelism. +- For tolerance, using a higher tolerance may get better speedups but can risk sample quality degradation. +If there is quality degradation with the default tolerance, then use a lower tolerance (e.g. 0.001). + +For 1000-step DDPM on 8 A100 GPUs, you can expect around a 3x speedup by StableDiffusionParadigmsPipeline instead of StableDiffusionPipeline +by setting parallel=80 and tolerance=0.1. + + + +Diffusers also offers distributed inference support for generating multiple prompts +in parallel on multiple GPUs. Check out the docs [here](https://huggingface.co/docs/diffusers/main/en/training/distributed_inference). + +In contrast, this pipeline is designed for speeding up sampling of a single prompt (by using multiple GPUs). 
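As a small illustration of the second knob, the snippet below continues the usage example above and simply passes an explicit `tolerance` to the call; the value 0.001 is an example of a stricter setting mentioned in the tip, not a recommended default.

```python
# Continuing the example above: if the default tolerance (0.1) causes visible
# quality degradation, re-run with a stricter tolerance such as 0.001.
# If an ODE-style sampler is preferred, the DDIMParallelScheduler added in this
# PR is intended to be swapped in the same way as DDPMParallelScheduler.
image = pipe(
    prompt,
    parallel=ngpu * batch_per_device,
    num_inference_steps=1000,
    tolerance=0.001,
).images[0]
```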
+ + +## StableDiffusionParadigmsPipeline +[[autodoc]] StableDiffusionParadigmsPipeline + - __call__ + - all diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 07a60946b8c5..02907075345e 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -73,7 +73,9 @@ ) from .schedulers import ( DDIMInverseScheduler, + DDIMParallelScheduler, DDIMScheduler, + DDPMParallelScheduler, DDPMScheduler, DEISMultistepScheduler, DPMSolverMultistepInverseScheduler, @@ -152,6 +154,7 @@ StableDiffusionLDM3DPipeline, StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, + StableDiffusionParadigmsPipeline, StableDiffusionPipeline, StableDiffusionPipelineSafe, StableDiffusionPix2PixZeroPipeline, diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 42c7dc33970d..b1650240848a 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -80,6 +80,7 @@ StableDiffusionLDM3DPipeline, StableDiffusionModelEditingPipeline, StableDiffusionPanoramaPipeline, + StableDiffusionParadigmsPipeline, StableDiffusionPipeline, StableDiffusionPix2PixZeroPipeline, StableDiffusionSAGPipeline, diff --git a/src/diffusers/pipelines/stable_diffusion/__init__.py b/src/diffusers/pipelines/stable_diffusion/__init__.py index cff7a765a7ef..33ab05a1dacb 100644 --- a/src/diffusers/pipelines/stable_diffusion/__init__.py +++ b/src/diffusers/pipelines/stable_diffusion/__init__.py @@ -53,6 +53,7 @@ class StableDiffusionPipelineOutput(BaseOutput): from .pipeline_stable_diffusion_ldm3d import StableDiffusionLDM3DPipeline from .pipeline_stable_diffusion_model_editing import StableDiffusionModelEditingPipeline from .pipeline_stable_diffusion_panorama import StableDiffusionPanoramaPipeline + from .pipeline_stable_diffusion_paradigms import StableDiffusionParadigmsPipeline from .pipeline_stable_diffusion_sag import StableDiffusionSAGPipeline from .pipeline_stable_diffusion_upscale import StableDiffusionUpscalePipeline from .pipeline_stable_unclip import StableUnCLIPPipeline diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py new file mode 100644 index 000000000000..33549ebb0edb --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py @@ -0,0 +1,830 @@ +# Copyright 2023 ParaDiGMS authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import torch +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionPipelineOutput +from .safety_checker import StableDiffusionSafetyChecker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import DDPMParallelScheduler + >>> from diffusers import StableDiffusionParadigmsPipeline + + >>> scheduler = DDPMParallelScheduler.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="scheduler") + + >>> pipe = StableDiffusionParadigmsPipeline.from_pretrained( + ... "runwayml/stable-diffusion-v1-5", scheduler=scheduler, torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> ngpu, batch_per_device = torch.cuda.device_count(), 5 + >>> pipe.wrapped_unet = torch.nn.DataParallel(pipe.unet, device_ids=[d for d in range(ngpu)]) + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt, parallel=ngpu * batch_per_device, num_inference_steps=1000).images[0] + ``` +""" + + +class StableDiffusionParadigmsPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): + r""" + Parallelized version of StableDiffusionPipeline, based on the paper https://arxiv.org/abs/2305.16317 This pipeline + parallelizes the denoising steps to generate a single image faster (more akin to model parallelism). + + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. 
+ safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. + feature_extractor ([`CLIPImageProcessor`]): + Model that extracts features from generated images to be used as inputs for the `safety_checker`. + """ + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # attribute to wrap the unet with torch.nn.DataParallel when running multiple denoising steps on multiple GPUs + self.wrapped_unet = self.unet + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. + + When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in + several steps. This is useful to save a large amount of memory and to allow the processing of larger images. 
+ """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def _cumsum(self, input, dim, debug=False): + if debug: + # cumsum_cuda_kernel does not have a deterministic implementation + # so perform cumsum on cpu for debugging purposes + return torch.cumsum(input.cpu().float(), dim=dim).to(input.device) + else: + return torch.cumsum(input, dim=dim) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + parallel: int = 10, + tolerance: float = 0.1, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + debug: bool = False, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + parallel (`int`, *optional*, defaults to 10): + The batch size to use when doing parallel sampling. More parallelism may lead to faster inference but + requires higher memory usage and also can require more total FLOPs. + tolerance (`float`, *optional*, defaults to 0.1): + The error tolerance for determining when to slide the batch window forward for parallel sampling. Lower + tolerance usually leads to less/no degradation. Higher tolerance is faster but can risk degradation of + sample quality. The tolerance is specified as a ratio of the scheduler's noise magnitude. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + debug (`bool`, *optional*, defaults to `False`): + Whether or not to run in debug mode. In debug mode, torch.cumsum is evaluated using the CPU. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. + When returning a tuple, the first element is a list with the generated images, and the second element is a + list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. 
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + extra_step_kwargs.pop("generator", None) + + # # 7. Denoising loop + scheduler = self.scheduler + parallel = min(parallel, len(scheduler.timesteps)) + + begin_idx = 0 + end_idx = parallel + latents_time_evolution_buffer = torch.stack([latents] * (len(scheduler.timesteps) + 1)) + + # We must make sure the noise of stochastic schedulers such as DDPM is sampled only once per timestep. + # Sampling inside the parallel denoising loop will mess this up, so we pre-sample the noise vectors outside the denoising loop. + noise_array = torch.zeros_like(latents_time_evolution_buffer) + for j in range(len(scheduler.timesteps)): + base_noise = randn_tensor( + shape=latents.shape, generator=generator, device=latents.device, dtype=prompt_embeds.dtype + ) + noise = (self.scheduler._get_variance(scheduler.timesteps[j]) ** 0.5) * base_noise + noise_array[j] = noise.clone() + + # We specify the error tolerance as a ratio of the scheduler's noise magnitude. We similarly compute the error tolerance + # outside of the denoising loop to avoid recomputing it at every step. + # We will be dividing the norm of the noise, so we store its inverse here to avoid a division at every step. + inverse_variance_norm = 1.0 / torch.tensor( + [scheduler._get_variance(scheduler.timesteps[j]) for j in range(len(scheduler.timesteps))] + [0] + ).to(noise_array.device) + latent_dim = noise_array[0, 0].numel() + inverse_variance_norm = inverse_variance_norm[:, None] / latent_dim + + scaled_tolerance = tolerance**2 + + with self.progress_bar(total=num_inference_steps) as progress_bar: + steps = 0 + while begin_idx < len(scheduler.timesteps): + # these have shape (parallel_dim, 2*batch_size, ...) 
+ # parallel_len is at most parallel, but could be less if we are at the end of the timesteps + # we are processing batch window of timesteps spanning [begin_idx, end_idx) + parallel_len = end_idx - begin_idx + + block_prompt_embeds = torch.stack([prompt_embeds] * parallel_len) + block_latents = latents_time_evolution_buffer[begin_idx:end_idx] + block_t = scheduler.timesteps[begin_idx:end_idx, None].repeat(1, batch_size * num_images_per_prompt) + t_vec = block_t + if do_classifier_free_guidance: + t_vec = t_vec.repeat(1, 2) + + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([block_latents] * 2, dim=1) if do_classifier_free_guidance else block_latents + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t_vec) + + # if parallel_len is small, no need to use multiple GPUs + net = self.wrapped_unet if parallel_len > 3 else self.unet + # predict the noise residual, shape is now [parallel_len * 2 * batch_size * num_images_per_prompt, ...] + model_output = net( + latent_model_input.flatten(0, 1), + t_vec.flatten(0, 1), + encoder_hidden_states=block_prompt_embeds.flatten(0, 1), + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + per_latent_shape = model_output.shape[1:] + if do_classifier_free_guidance: + model_output = model_output.reshape( + parallel_len, 2, batch_size * num_images_per_prompt, *per_latent_shape + ) + noise_pred_uncond, noise_pred_text = model_output[:, 0], model_output[:, 1] + model_output = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + model_output = model_output.reshape( + parallel_len * batch_size * num_images_per_prompt, *per_latent_shape + ) + + block_latents_denoise = scheduler.batch_step_no_noise( + model_output=model_output, + timesteps=block_t.flatten(0, 1), + sample=block_latents.flatten(0, 1), + **extra_step_kwargs, + ).reshape(block_latents.shape) + + # back to shape (parallel_dim, batch_size, ...) + # now we want to add the pre-sampled noise + # parallel sampling algorithm requires computing the cumulative drift from the beginning + # of the window, so we need to compute cumulative sum of the deltas and the pre-sampled noises. 
+ delta = block_latents_denoise - block_latents + cumulative_delta = self._cumsum(delta, dim=0, debug=debug) + cumulative_noise = self._cumsum(noise_array[begin_idx:end_idx], dim=0, debug=debug) + + # if we are using an ODE-like scheduler (like DDIM), we don't want to add noise + if scheduler._is_ode_scheduler: + cumulative_noise = 0 + + block_latents_new = ( + latents_time_evolution_buffer[begin_idx][None,] + cumulative_delta + cumulative_noise + ) + cur_error = torch.linalg.norm( + (block_latents_new - latents_time_evolution_buffer[begin_idx + 1 : end_idx + 1]).reshape( + parallel_len, batch_size * num_images_per_prompt, -1 + ), + dim=-1, + ).pow(2) + error_ratio = cur_error * inverse_variance_norm[begin_idx + 1 : end_idx + 1] + + # find the first index of the vector error_ratio that is greater than error tolerance + # we can shift the window for the next iteration up to this index + error_ratio = torch.nn.functional.pad( + error_ratio, (0, 0, 0, 1), value=1e9 + ) # handle the case when everything is below ratio, by padding the end of parallel_len dimension + any_error_at_time = torch.max(error_ratio > scaled_tolerance, dim=1).values.int() + ind = torch.argmax(any_error_at_time).item() + + # compute the new begin and end idxs for the window + new_begin_idx = begin_idx + min(1 + ind, parallel) + new_end_idx = min(new_begin_idx + parallel, len(scheduler.timesteps)) + + # store the computed latents for the current window in the global buffer + latents_time_evolution_buffer[begin_idx + 1 : end_idx + 1] = block_latents_new + # initialize the new sliding window latents with the end of the current window, + # should be better than random initialization + latents_time_evolution_buffer[end_idx : new_end_idx + 1] = latents_time_evolution_buffer[end_idx][ + None, + ] + + steps += 1 + + progress_bar.update(new_begin_idx - begin_idx) + if callback is not None and steps % callback_steps == 0: + callback(begin_idx, block_t[begin_idx], latents_time_evolution_buffer[begin_idx]) + + begin_idx = new_begin_idx + end_idx = new_end_idx + + latents = latents_time_evolution_buffer[-1] + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py index 05414e32fc9e..935759bbb6af 100644 --- a/src/diffusers/schedulers/__init__.py +++ b/src/diffusers/schedulers/__init__.py @@ -30,7 +30,9 @@ else: from .scheduling_ddim import DDIMScheduler from .scheduling_ddim_inverse import DDIMInverseScheduler + from .scheduling_ddim_parallel import DDIMParallelScheduler from .scheduling_ddpm import DDPMScheduler + from .scheduling_ddpm_parallel import DDPMParallelScheduler from .scheduling_deis_multistep import DEISMultistepScheduler from .scheduling_dpmsolver_multistep import 
DPMSolverMultistepScheduler from .scheduling_dpmsolver_multistep_inverse import DPMSolverMultistepInverseScheduler diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py new file mode 100644 index 000000000000..22b7d8ec97dc --- /dev/null +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -0,0 +1,620 @@ +# Copyright 2023 ParaDiGMS authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput +class DDIMParallelSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. 
+ + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + + def alpha_bar(time_step): + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr +def rescale_zero_terminal_snr(betas): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.FloatTensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.FloatTensor`: rescaled betas with zero terminal SNR + """ + # Convert betas to alphas_bar_sqrt + alphas = 1.0 - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + alphas = alphas_bar[1:] / alphas_bar[:-1] # Revert cumprod + alphas = torch.cat([alphas_bar[0:1], alphas]) + betas = 1 - alphas + + return betas + + +class DDIMParallelScheduler(SchedulerMixin, ConfigMixin): + """ + Denoising diffusion implicit models is a scheduler that extends the denoising procedure introduced in denoising + diffusion probabilistic models (DDPMs) with non-Markovian guidance. + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2010.02502 + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. + clip_sample (`bool`, default `True`): + option to clip predicted sample for numerical stability. + clip_sample_range (`float`, default `1.0`): + the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, default `True`): + each diffusion step uses the value of alphas product at that step and at the previous one. For the final + step there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the value of alpha at step 0. + steps_offset (`int`, default `0`): + an offset added to the inference steps. 
You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. + prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion + process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 + https://imagen.research.google/video/paper.pdf) + thresholding (`bool`, default `False`): + whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). + Note that the thresholding method is unsuitable for latent-space diffusion models (such as + stable-diffusion). + dynamic_thresholding_ratio (`float`, default `0.995`): + the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen + (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. + sample_max_value (`float`, default `1.0`): + the threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, default `"leading"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + rescale_betas_zero_snr (`bool`, default `False`): + whether to rescale the betas to have zero terminal SNR (proposed by https://arxiv.org/pdf/2305.08891.pdf). + This can enable the model to generate very bright and dark samples instead of limiting it to samples with + medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + _is_ode_scheduler = True + + @register_to_config + # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.__init__ + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + rescale_betas_zero_snr: bool = False, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
+ self.betas = ( + torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + ) + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.betas = rescale_zero_terminal_snr(self.betas) + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. + self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + + # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.scale_model_input + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): input sample + timestep (`int`, optional): current timestep + + Returns: + `torch.FloatTensor`: scaled input sample + """ + return sample + + def _get_variance(self, timestep, prev_timestep=None): + if prev_timestep is None: + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def _batch_get_variance(self, t, prev_t): + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[torch.clip(prev_t, min=0)] + alpha_prod_t_prev[prev_t < 0] = torch.tensor(1.0) + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
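For intuition, the `_batch_get_variance` helper defined above is just a vectorized form of `_get_variance`: it gathers `alphas_cumprod` for a whole tensor of timesteps and their predecessors at once, clips negative previous timesteps to index 0, and then overwrites those entries with a final alpha product of 1.0. A minimal consistency sketch, assuming a default-configured scheduler (the check itself is illustrative and not part of this patch):

import torch

from diffusers import DDIMParallelScheduler

scheduler = DDIMParallelScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(50)  # gives a step size of 20 between consecutive timesteps

t = torch.tensor([980, 420, 10])
prev_t = t - scheduler.config.num_train_timesteps // scheduler.num_inference_steps

# batched variance for all three timesteps in one call
batched = scheduler._batch_get_variance(t, prev_t)

# per-timestep variance, one call per element
looped = torch.stack([scheduler._get_variance(int(ti), int(pi)) for ti, pi in zip(t, prev_t)])

assert torch.allclose(batched, looped)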
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, height, width = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * height * width) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, height, width) + sample = sample.to(dtype) + + return sample + + # Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler.set_timesteps + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. + """ + + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + # "leading" and "trailing" corresponds to annotation of Table 1. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." + ) + + self.timesteps = torch.from_numpy(timesteps).to(device) + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + generator=None, + variance_noise: Optional[torch.FloatTensor] = None, + return_dict: bool = True, + ) -> Union[DDIMParallelSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. 
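To make the `timestep_spacing` branches of `set_timesteps` above concrete, the two options lay out the inference grid differently; the commented values below follow directly from the arithmetic in the method and are shown as a small sketch, not as part of the patch:

from diffusers import DDIMParallelScheduler

# "leading" (the default): integer multiples of the step ratio, shifted by `steps_offset`
scheduler = DDIMParallelScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(10)
print(scheduler.timesteps)
# tensor([900, 800, 700, 600, 500, 400, 300, 200, 100,   0])

# "trailing": counts down from num_train_timesteps, so the very last training step is included
scheduler = DDIMParallelScheduler(num_train_timesteps=1000, timestep_spacing="trailing")
scheduler.set_timesteps(10)
print(scheduler.timesteps)
# tensor([999, 899, 799, 699, 599, 499, 399, 299, 199,  99])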
+ timestep (`int`): current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + eta (`float`): weight of noise for added noise in diffusion step. + use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped + predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when + `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would + coincide with the one provided as input and `use_clipped_model_output` will have not effect. + generator: random number generator. + variance_noise (`torch.FloatTensor`): instead of generating noise for the variance using `generator`, we + can directly provide the noise for the variance itself. This is useful for methods such as + CycleDiffusion. (https://arxiv.org/abs/2210.05559) + return_dict (`bool`): option for returning tuple rather than DDIMParallelSchedulerOutput class + + Returns: + [`~schedulers.scheduling_utils.DDIMParallelSchedulerOutput`] or `tuple`: + [`~schedulers.scheduling_utils.DDIMParallelSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. + When returning a tuple, the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - pred_noise_t -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. get previous step value (=t-1) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + # 4. Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 5. 
compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = self._get_variance(timestep, prev_timestep) + std_dev_t = eta * variance ** (0.5) + + if use_clipped_model_output: + # the pred_epsilon is always re-derived from the clipped x_0 in Glide + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon + + # 7. compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + + if eta > 0: + if variance_noise is not None and generator is not None: + raise ValueError( + "Cannot pass both generator and variance_noise. Please make sure that either `generator` or" + " `variance_noise` stays `None`." + ) + + if variance_noise is None: + variance_noise = randn_tensor( + model_output.shape, generator=generator, device=model_output.device, dtype=model_output.dtype + ) + variance = std_dev_t * variance_noise + + prev_sample = prev_sample + variance + + if not return_dict: + return (prev_sample,) + + return DDIMParallelSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + def batch_step_no_noise( + self, + model_output: torch.FloatTensor, + timesteps: List[int], + sample: torch.FloatTensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + ) -> torch.FloatTensor: + """ + Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. + Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise + is pre-sampled by the pipeline. + + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. + timesteps (`List[int]`): + current discrete timesteps in the diffusion chain. This is now a list of integers. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + eta (`float`): weight of noise for added noise in diffusion step. + use_clipped_model_output (`bool`): if `True`, compute "corrected" `model_output` from the clipped + predicted original sample. Necessary because predicted original sample is clipped to [-1, 1] when + `self.config.clip_sample` is `True`. If no clipping has happened, "corrected" `model_output` would + coincide with the one provided as input and `use_clipped_model_output` will have not effect. + + Returns: + `torch.FloatTensor`: sample tensor at previous timestep. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + assert eta == 0.0 + + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - pred_noise_t -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. 
get previous step value (=t-1) + t = timesteps + prev_t = t - self.config.num_train_timesteps // self.num_inference_steps + + t = t.view(-1, *([1] * (model_output.ndim - 1))) + prev_t = prev_t.view(-1, *([1] * (model_output.ndim - 1))) + + # 1. compute alphas, betas + self.alphas_cumprod = self.alphas_cumprod.to(model_output.device) + self.final_alpha_cumprod = self.final_alpha_cumprod.to(model_output.device) + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[torch.clip(prev_t, min=0)] + alpha_prod_t_prev[prev_t < 0] = torch.tensor(1.0) + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + # 4. Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 5. compute variance: "sigma_t(η)" -> see formula (16) + # σ_t = sqrt((1 − α_t−1)/(1 − α_t)) * sqrt(1 − α_t/α_t−1) + variance = self._batch_get_variance(t, prev_t).to(model_output.device).view(*alpha_prod_t_prev.shape) + std_dev_t = eta * variance ** (0.5) + + if use_clipped_model_output: + # the pred_epsilon is always re-derived from the clipped x_0 in Glide + pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + + # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + pred_sample_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * pred_epsilon + + # 7. 
compute x_t without "random noise" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + prev_sample = alpha_prod_t_prev ** (0.5) * pred_original_sample + pred_sample_direction + + return prev_sample + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity( + self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py new file mode 100644 index 000000000000..2719d90b9314 --- /dev/null +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -0,0 +1,558 @@ +# Copyright 2023 ParaDiGMS authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
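Before the DDPM counterpart defined below, a brief sketch of what `DDIMParallelScheduler.batch_step_no_noise` enables: a sliding window of timesteps can be denoised in a single batched call, which is the core primitive that parallel (ParaDiGMS-style) sampling builds on. The latents, window size, and random "model output" below are toy stand-ins, not part of the patch:

import torch

from diffusers import DDIMParallelScheduler

scheduler = DDIMParallelScheduler(num_train_timesteps=1000)
scheduler.set_timesteps(50)

batch_size, window = 2, 4
# one latent per (window position, batch element), flattened into a single batch dimension
latents = torch.randn(window * batch_size, 4, 8, 8)
# the first `window` timesteps of the schedule, repeated once per batch element
timesteps = scheduler.timesteps[:window].repeat_interleave(batch_size)

# in the real pipeline this would be the UNet's noise prediction; random values suffice for shapes
noise_pred = torch.randn_like(latents)

prev_latents = scheduler.batch_step_no_noise(noise_pred, timesteps, latents, eta=0.0)
assert prev_latents.shape == latents.shape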
+ +# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput +class DDPMParallelSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample (x_{0}) based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.FloatTensor + pred_original_sample: Optional[torch.FloatTensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + + def alpha_bar(time_step): + return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): + """ + Denoising diffusion probabilistic models (DDPMs) explores the connections between denoising score matching and + Langevin dynamics sampling. + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + For more details, see the original paper: https://arxiv.org/abs/2006.11239 + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. + beta_start (`float`): the starting `beta` value of inference. + beta_end (`float`): the final `beta` value. + beta_schedule (`str`): + the beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, `squaredcos_cap_v2` or `sigmoid`. + trained_betas (`np.ndarray`, optional): + option to pass an array of betas directly to the constructor to bypass `beta_start`, `beta_end` etc. 
+ variance_type (`str`): + options to clip the variance used when adding noise to the denoised sample. Choose from `fixed_small`, + `fixed_small_log`, `fixed_large`, `fixed_large_log`, `learned` or `learned_range`. + clip_sample (`bool`, default `True`): + option to clip predicted sample for numerical stability. + clip_sample_range (`float`, default `1.0`): + the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + prediction_type (`str`, default `epsilon`, optional): + prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion + process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 + https://imagen.research.google/video/paper.pdf) + thresholding (`bool`, default `False`): + whether to use the "dynamic thresholding" method (introduced by Imagen, https://arxiv.org/abs/2205.11487). + Note that the thresholding method is unsuitable for latent-space diffusion models (such as + stable-diffusion). + dynamic_thresholding_ratio (`float`, default `0.995`): + the ratio for the dynamic thresholding method. Default is `0.995`, the same as Imagen + (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. + sample_max_value (`float`, default `1.0`): + the threshold value for dynamic thresholding. Valid only when `thresholding=True`. + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + _is_ode_scheduler = False + + @register_to_config + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.__init__ + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.0001, + beta_end: float = 0.02, + beta_schedule: str = "linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + variance_type: str = "fixed_small", + clip_sample: bool = True, + prediction_type: str = "epsilon", + thresholding: bool = False, + dynamic_thresholding_ratio: float = 0.995, + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. 
+ self.betas = ( + torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2 + ) + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + elif beta_schedule == "sigmoid": + # GeoDiff sigmoid schedule + betas = torch.linspace(-6, 6, num_train_timesteps) + self.betas = torch.sigmoid(betas) * (beta_end - beta_start) + beta_start + else: + raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + self.one = torch.tensor(1.0) + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.custom_timesteps = False + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy()) + + self.variance_type = variance_type + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.scale_model_input + def scale_model_input(self, sample: torch.FloatTensor, timestep: Optional[int] = None) -> torch.FloatTensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.FloatTensor`): input sample + timestep (`int`, optional): current timestep + + Returns: + `torch.FloatTensor`: scaled input sample + """ + return sample + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.set_timesteps + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): + """ + Sets the discrete timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + num_inference_steps (`Optional[int]`): + the number of diffusion steps used when generating samples with a pre-trained model. If passed, then + `timesteps` must be `None`. + device (`str` or `torch.device`, optional): + the device to which the timesteps are moved to. + custom_timesteps (`List[int]`, optional): + custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps is used. If passed, `num_inference_steps` + must be `None`. + + """ + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `custom_timesteps`.") + + if timesteps is not None: + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`custom_timesteps` must be in descending order.") + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.custom_timesteps = True + else: + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." 
+ ) + + self.num_inference_steps = num_inference_steps + + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + self.custom_timesteps = False + + self.timesteps = torch.from_numpy(timesteps).to(device) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance + def _get_variance(self, t, predicted_variance=None, variance_type=None): + prev_t = self.previous_timestep(t) + + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one + current_beta_t = 1 - alpha_prod_t / alpha_prod_t_prev + + # For t > 0, compute predicted variance βt (see formula (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf) + # and sample from it to get previous sample + # x_{t-1} ~ N(pred_prev_sample, variance) == add variance to pred_sample + variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * current_beta_t + + # we always take the log of variance, so clamp it to ensure it's not 0 + variance = torch.clamp(variance, min=1e-20) + + if variance_type is None: + variance_type = self.config.variance_type + + # hacks - were probably added for training stability + if variance_type == "fixed_small": + variance = variance + # for rl-diffuser https://arxiv.org/abs/2205.09991 + elif variance_type == "fixed_small_log": + variance = torch.log(variance) + variance = torch.exp(0.5 * variance) + elif variance_type == "fixed_large": + variance = current_beta_t + elif variance_type == "fixed_large_log": + # Glide max_log + variance = torch.log(current_beta_t) + elif variance_type == "learned": + return predicted_variance + elif variance_type == "learned_range": + min_log = torch.log(variance) + max_log = torch.log(current_beta_t) + frac = (predicted_variance + 1) / 2 + variance = frac * max_log + (1 - frac) * min_log + + return variance + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample + def _threshold_sample(self, sample: torch.FloatTensor) -> torch.FloatTensor: + """ + "Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the + prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by + s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing + pixels from saturation at each step. We find that dynamic thresholding results in significantly better + photorealism as well as better image-text alignment, especially when using very large guidance weights." 
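As a quick illustration of the `variance_type` branches in `_get_variance` above, the same timestep yields differently post-processed values depending on the configured type. A toy check, not part of the patch (the "learned" variants are omitted because they need a `predicted_variance` from the model):

import torch

from diffusers import DDPMParallelScheduler

scheduler = DDPMParallelScheduler(num_train_timesteps=1000)

t = 500
# the clipped posterior variance beta_tilde_t (formulas (6)-(7) of the DDPM paper)
fixed_small = scheduler._get_variance(t, variance_type="fixed_small")
# same quantity, but returned as a standard deviation (exp of half the log-variance)
fixed_small_log = scheduler._get_variance(t, variance_type="fixed_small_log")
# the "large" variant uses the current beta_t instead, which upper-bounds the posterior variance
fixed_large = scheduler._get_variance(t, variance_type="fixed_large")

assert torch.isclose(fixed_small_log, torch.exp(0.5 * torch.log(fixed_small)))
assert fixed_large >= fixed_small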
+ + https://arxiv.org/abs/2205.11487 + """ + dtype = sample.dtype + batch_size, channels, height, width = sample.shape + + if dtype not in (torch.float32, torch.float64): + sample = sample.float() # upcast for quantile calculation, and clamp not implemented for cpu half + + # Flatten sample for doing quantile calculation along each image + sample = sample.reshape(batch_size, channels * height * width) + + abs_sample = sample.abs() # "a certain percentile absolute pixel value" + + s = torch.quantile(abs_sample, self.config.dynamic_thresholding_ratio, dim=1) + s = torch.clamp( + s, min=1, max=self.config.sample_max_value + ) # When clamped to min=1, equivalent to standard clipping to [-1, 1] + + s = s.unsqueeze(1) # (batch_size, 1) because clamp will broadcast along dim=0 + sample = torch.clamp(sample, -s, s) / s # "we threshold xt0 to the range [-s, s] and then divide by s" + + sample = sample.reshape(batch_size, channels, height, width) + sample = sample.to(dtype) + + return sample + + def step( + self, + model_output: torch.FloatTensor, + timestep: int, + sample: torch.FloatTensor, + generator=None, + return_dict: bool = True, + ) -> Union[DDPMParallelSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. + timestep (`int`): current discrete timestep in the diffusion chain. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + generator: random number generator. + return_dict (`bool`): option for returning tuple rather than DDPMParallelSchedulerOutput class + + Returns: + [`~schedulers.scheduling_utils.DDPMParallelSchedulerOutput`] or `tuple`: + [`~schedulers.scheduling_utils.DDPMParallelSchedulerOutput`] if `return_dict` is True, otherwise a `tuple`. + When returning a tuple, the first element is the sample tensor. + + """ + t = timestep + + prev_t = self.previous_timestep(t) + + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: + model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1) + else: + predicted_variance = None + + # 1. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[prev_t] if prev_t >= 0 else self.one + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + current_alpha_t = alpha_prod_t / alpha_prod_t_prev + current_beta_t = 1 - current_alpha_t + + # 2. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" + " `v_prediction` for the DDPMScheduler." + ) + + # 3. 
Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t + current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample + + # 6. Add noise + variance = 0 + if t > 0: + device = model_output.device + variance_noise = randn_tensor( + model_output.shape, generator=generator, device=device, dtype=model_output.dtype + ) + if self.variance_type == "fixed_small_log": + variance = self._get_variance(t, predicted_variance=predicted_variance) * variance_noise + elif self.variance_type == "learned_range": + variance = self._get_variance(t, predicted_variance=predicted_variance) + variance = torch.exp(0.5 * variance) * variance_noise + else: + variance = (self._get_variance(t, predicted_variance=predicted_variance) ** 0.5) * variance_noise + + pred_prev_sample = pred_prev_sample + variance + + if not return_dict: + return (pred_prev_sample,) + + return DDPMParallelSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) + + def batch_step_no_noise( + self, + model_output: torch.FloatTensor, + timesteps: List[int], + sample: torch.FloatTensor, + ) -> torch.FloatTensor: + """ + Batched version of the `step` function, to be able to reverse the SDE for multiple samples/timesteps at once. + Also, does not add any noise to the predicted sample, which is necessary for parallel sampling where the noise + is pre-sampled by the pipeline. + + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. + timesteps (`List[int]`): + current discrete timesteps in the diffusion chain. This is now a list of integers. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + + Returns: + `torch.FloatTensor`: sample tensor at previous timestep. + """ + t = timesteps + num_inference_steps = self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + prev_t = t - self.config.num_train_timesteps // num_inference_steps + + t = t.view(-1, *([1] * (model_output.ndim - 1))) + prev_t = prev_t.view(-1, *([1] * (model_output.ndim - 1))) + + if model_output.shape[1] == sample.shape[1] * 2 and self.variance_type in ["learned", "learned_range"]: + model_output, predicted_variance = torch.split(model_output, sample.shape[1], dim=1) + else: + pass + + # 1. 
compute alphas, betas + self.alphas_cumprod = self.alphas_cumprod.to(model_output.device) + alpha_prod_t = self.alphas_cumprod[t] + alpha_prod_t_prev = self.alphas_cumprod[torch.clip(prev_t, min=0)] + alpha_prod_t_prev[prev_t < 0] = torch.tensor(1.0) + + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + current_alpha_t = alpha_prod_t / alpha_prod_t_prev + current_beta_t = 1 - current_alpha_t + + # 2. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample` or" + " `v_prediction` for the DDPMParallelScheduler." + ) + + # 3. Clip or threshold "predicted x_0" + if self.config.thresholding: + pred_original_sample = self._threshold_sample(pred_original_sample) + elif self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + + # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * current_beta_t) / beta_prod_t + current_sample_coeff = current_alpha_t ** (0.5) * beta_prod_t_prev / beta_prod_t + + # 5. Compute predicted previous sample µ_t + # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf + pred_prev_sample = pred_original_sample_coeff * pred_original_sample + current_sample_coeff * sample + + return pred_prev_sample + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity( + self, sample: torch.FloatTensor, noise: torch.FloatTensor, timesteps: torch.IntTensor + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + alphas_cumprod = self.alphas_cumprod.to(device=sample.device, dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + 
sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep + def previous_timestep(self, timestep): + if self.custom_timesteps: + index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0] + if index == self.timesteps.shape[0] - 1: + prev_t = torch.tensor(-1) + else: + prev_t = self.timesteps[index + 1] + else: + num_inference_steps = ( + self.num_inference_steps if self.num_inference_steps else self.config.num_train_timesteps + ) + prev_t = timestep - self.config.num_train_timesteps // num_inference_steps + + return prev_t diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index e07b7cb27da7..7a13bc89e883 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -405,6 +405,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class DDIMParallelScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class DDIMScheduler(metaclass=DummyObject): _backends = ["torch"] @@ -420,6 +435,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class DDPMParallelScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class DDPMScheduler(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index c7a6ac79efbd..3f0b17d879e5 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -497,6 +497,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class StableDiffusionParadigmsPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class StableDiffusionPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py new file mode 100644 index 000000000000..781cbcbd69a1 --- 
/dev/null +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_paradigms.py @@ -0,0 +1,227 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DDIMParallelScheduler, + DDPMParallelScheduler, + StableDiffusionParadigmsPipeline, + UNet2DConditionModel, +) +from diffusers.utils import slow, torch_device +from diffusers.utils.testing_utils import ( + enable_full_determinism, + require_torch_gpu, +) + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin + + +enable_full_determinism() + + +class StableDiffusionParadigmsPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): + pipeline_class = StableDiffusionParadigmsPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + # SD2-specific config below + attention_head_dim=(2, 4), + use_linear_projection=True, + ) + scheduler = DDIMParallelScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + sample_size=128, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + # SD2-specific config below + hidden_act="gelu", + projection_dim=512, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "generator": generator, + 
"num_inference_steps": 10, + "guidance_scale": 6.0, + "output_type": "numpy", + "parallel": 3, + "debug": True, + } + return inputs + + def test_stable_diffusion_paradigms_default_case(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionParadigmsPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array([0.4773, 0.5417, 0.4723, 0.4925, 0.5631, 0.4752, 0.5240, 0.4935, 0.5023]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_stable_diffusion_paradigms_default_case_ddpm(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + torch.manual_seed(0) + components["scheduler"] = DDPMParallelScheduler() + torch.manual_seed(0) + sd_pipe = StableDiffusionParadigmsPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array([0.3573, 0.4420, 0.4960, 0.4799, 0.3796, 0.3879, 0.4819, 0.4365, 0.4468]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + # override to speed the overall test timing up. + def test_inference_batch_consistent(self): + super().test_inference_batch_consistent(batch_sizes=[1, 2]) + + # override to speed the overall test timing up. + def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(batch_size=2, expected_max_diff=3e-3) + + def test_stable_diffusion_paradigms_negative_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionParadigmsPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + negative_prompt = "french fries" + output = sd_pipe(**inputs, negative_prompt=negative_prompt) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array([0.4771, 0.5420, 0.4683, 0.4918, 0.5636, 0.4725, 0.5230, 0.4923, 0.5015]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + +@slow +@require_torch_gpu +class StableDiffusionParadigmsPipelineSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, seed=0): + generator = torch.Generator(device=torch_device).manual_seed(seed) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "generator": generator, + "num_inference_steps": 10, + "guidance_scale": 7.5, + "output_type": "numpy", + "parallel": 3, + "debug": True, + } + return inputs + + def test_stable_diffusion_paradigms_default(self): + model_ckpt = "stabilityai/stable-diffusion-2-base" + scheduler = DDIMParallelScheduler.from_pretrained(model_ckpt, subfolder="scheduler") + pipe = StableDiffusionParadigmsPipeline.from_pretrained(model_ckpt, scheduler=scheduler, safety_checker=None) + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + 
pipe.enable_attention_slicing() + + inputs = self.get_inputs() + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + + expected_slice = np.array([0.9622, 0.9602, 0.9748, 0.9591, 0.9630, 0.9691, 0.9661, 0.9631, 0.9741]) + + assert np.abs(expected_slice - image_slice).max() < 1e-2 diff --git a/tests/schedulers/test_scheduler_ddim_parallel.py b/tests/schedulers/test_scheduler_ddim_parallel.py new file mode 100644 index 000000000000..b96e12f60fb3 --- /dev/null +++ b/tests/schedulers/test_scheduler_ddim_parallel.py @@ -0,0 +1,188 @@ +# Copyright 2023 ParaDiGMS authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from diffusers import DDIMParallelScheduler + +from .test_schedulers import SchedulerCommonTest + + +class DDIMParallelSchedulerTest(SchedulerCommonTest): + scheduler_classes = (DDIMParallelScheduler,) + forward_default_kwargs = (("eta", 0.0), ("num_inference_steps", 50)) + + def get_scheduler_config(self, **kwargs): + config = { + "num_train_timesteps": 1000, + "beta_start": 0.0001, + "beta_end": 0.02, + "beta_schedule": "linear", + "clip_sample": True, + } + + config.update(**kwargs) + return config + + def full_loop(self, **config): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config(**config) + scheduler = scheduler_class(**scheduler_config) + + num_inference_steps, eta = 10, 0.0 + + model = self.dummy_model() + sample = self.dummy_sample_deter + + scheduler.set_timesteps(num_inference_steps) + + for t in scheduler.timesteps: + residual = model(sample, t) + sample = scheduler.step(residual, t, sample, eta).prev_sample + + return sample + + def test_timesteps(self): + for timesteps in [100, 500, 1000]: + self.check_over_configs(num_train_timesteps=timesteps) + + def test_steps_offset(self): + for steps_offset in [0, 1]: + self.check_over_configs(steps_offset=steps_offset) + + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config(steps_offset=1) + scheduler = scheduler_class(**scheduler_config) + scheduler.set_timesteps(5) + assert torch.equal(scheduler.timesteps, torch.LongTensor([801, 601, 401, 201, 1])) + + def test_betas(self): + for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): + self.check_over_configs(beta_start=beta_start, beta_end=beta_end) + + def test_schedules(self): + for schedule in ["linear", "squaredcos_cap_v2"]: + self.check_over_configs(beta_schedule=schedule) + + def test_prediction_type(self): + for prediction_type in ["epsilon", "v_prediction"]: + self.check_over_configs(prediction_type=prediction_type) + + def test_clip_sample(self): + for clip_sample in [True, False]: + self.check_over_configs(clip_sample=clip_sample) + + def test_timestep_spacing(self): + for timestep_spacing in ["trailing", "leading"]: + self.check_over_configs(timestep_spacing=timestep_spacing) + + def test_rescale_betas_zero_snr(self): + for 
rescale_betas_zero_snr in [True, False]: + self.check_over_configs(rescale_betas_zero_snr=rescale_betas_zero_snr) + + def test_thresholding(self): + self.check_over_configs(thresholding=False) + for threshold in [0.5, 1.0, 2.0]: + for prediction_type in ["epsilon", "v_prediction"]: + self.check_over_configs( + thresholding=True, + prediction_type=prediction_type, + sample_max_value=threshold, + ) + + def test_time_indices(self): + for t in [1, 10, 49]: + self.check_over_forward(time_step=t) + + def test_inference_steps(self): + for t, num_inference_steps in zip([1, 10, 50], [10, 50, 500]): + self.check_over_forward(time_step=t, num_inference_steps=num_inference_steps) + + def test_eta(self): + for t, eta in zip([1, 10, 49], [0.0, 0.5, 1.0]): + self.check_over_forward(time_step=t, eta=eta) + + def test_variance(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 + assert torch.sum(torch.abs(scheduler._get_variance(420, 400) - 0.14771)) < 1e-5 + assert torch.sum(torch.abs(scheduler._get_variance(980, 960) - 0.32460)) < 1e-5 + assert torch.sum(torch.abs(scheduler._get_variance(0, 0) - 0.0)) < 1e-5 + assert torch.sum(torch.abs(scheduler._get_variance(487, 486) - 0.00979)) < 1e-5 + assert torch.sum(torch.abs(scheduler._get_variance(999, 998) - 0.02)) < 1e-5 + + def test_batch_step_no_noise(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + num_inference_steps, eta = 10, 0.0 + scheduler.set_timesteps(num_inference_steps) + + model = self.dummy_model() + sample1 = self.dummy_sample_deter + sample2 = self.dummy_sample_deter + 0.1 + sample3 = self.dummy_sample_deter - 0.1 + + per_sample_batch = sample1.shape[0] + samples = torch.stack([sample1, sample2, sample3], dim=0) + timesteps = torch.arange(num_inference_steps)[0:3, None].repeat(1, per_sample_batch) + + residual = model(samples.flatten(0, 1), timesteps.flatten(0, 1)) + pred_prev_sample = scheduler.batch_step_no_noise(residual, timesteps.flatten(0, 1), samples.flatten(0, 1), eta) + + result_sum = torch.sum(torch.abs(pred_prev_sample)) + result_mean = torch.mean(torch.abs(pred_prev_sample)) + + assert abs(result_sum.item() - 1147.7904) < 1e-2 + assert abs(result_mean.item() - 0.4982) < 1e-3 + + def test_full_loop_no_noise(self): + sample = self.full_loop() + + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_sum.item() - 172.0067) < 1e-2 + assert abs(result_mean.item() - 0.223967) < 1e-3 + + def test_full_loop_with_v_prediction(self): + sample = self.full_loop(prediction_type="v_prediction") + + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_sum.item() - 52.5302) < 1e-2 + assert abs(result_mean.item() - 0.0684) < 1e-3 + + def test_full_loop_with_set_alpha_to_one(self): + # We specify different beta, so that the first alpha is 0.99 + sample = self.full_loop(set_alpha_to_one=True, beta_start=0.01) + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_sum.item() - 149.8295) < 1e-2 + assert abs(result_mean.item() - 0.1951) < 1e-3 + + def test_full_loop_with_no_set_alpha_to_one(self): + # We specify different beta, so that the first alpha is 0.99 + sample = self.full_loop(set_alpha_to_one=False, 
beta_start=0.01) + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_sum.item() - 149.0784) < 1e-2 + assert abs(result_mean.item() - 0.1941) < 1e-3 diff --git a/tests/schedulers/test_scheduler_ddpm_parallel.py b/tests/schedulers/test_scheduler_ddpm_parallel.py new file mode 100644 index 000000000000..5f7d2b227340 --- /dev/null +++ b/tests/schedulers/test_scheduler_ddpm_parallel.py @@ -0,0 +1,216 @@ +# Copyright 2023 ParaDiGMS authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch + +from diffusers import DDPMParallelScheduler + +from .test_schedulers import SchedulerCommonTest + + +class DDPMParallelSchedulerTest(SchedulerCommonTest): + scheduler_classes = (DDPMParallelScheduler,) + + def get_scheduler_config(self, **kwargs): + config = { + "num_train_timesteps": 1000, + "beta_start": 0.0001, + "beta_end": 0.02, + "beta_schedule": "linear", + "variance_type": "fixed_small", + "clip_sample": True, + } + + config.update(**kwargs) + return config + + def test_timesteps(self): + for timesteps in [1, 5, 100, 1000]: + self.check_over_configs(num_train_timesteps=timesteps) + + def test_betas(self): + for beta_start, beta_end in zip([0.0001, 0.001, 0.01, 0.1], [0.002, 0.02, 0.2, 2]): + self.check_over_configs(beta_start=beta_start, beta_end=beta_end) + + def test_schedules(self): + for schedule in ["linear", "squaredcos_cap_v2"]: + self.check_over_configs(beta_schedule=schedule) + + def test_variance_type(self): + for variance in ["fixed_small", "fixed_large", "other"]: + self.check_over_configs(variance_type=variance) + + def test_clip_sample(self): + for clip_sample in [True, False]: + self.check_over_configs(clip_sample=clip_sample) + + def test_thresholding(self): + self.check_over_configs(thresholding=False) + for threshold in [0.5, 1.0, 2.0]: + for prediction_type in ["epsilon", "sample", "v_prediction"]: + self.check_over_configs( + thresholding=True, + prediction_type=prediction_type, + sample_max_value=threshold, + ) + + def test_prediction_type(self): + for prediction_type in ["epsilon", "sample", "v_prediction"]: + self.check_over_configs(prediction_type=prediction_type) + + def test_time_indices(self): + for t in [0, 500, 999]: + self.check_over_forward(time_step=t) + + def test_variance(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + assert torch.sum(torch.abs(scheduler._get_variance(0) - 0.0)) < 1e-5 + assert torch.sum(torch.abs(scheduler._get_variance(487) - 0.00979)) < 1e-5 + assert torch.sum(torch.abs(scheduler._get_variance(999) - 0.02)) < 1e-5 + + def test_batch_step_no_noise(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + num_trained_timesteps = len(scheduler) + + model = self.dummy_model() + sample1 = self.dummy_sample_deter + sample2 = 
self.dummy_sample_deter + 0.1 + sample3 = self.dummy_sample_deter - 0.1 + + per_sample_batch = sample1.shape[0] + samples = torch.stack([sample1, sample2, sample3], dim=0) + timesteps = torch.arange(num_trained_timesteps)[0:3, None].repeat(1, per_sample_batch) + + residual = model(samples.flatten(0, 1), timesteps.flatten(0, 1)) + pred_prev_sample = scheduler.batch_step_no_noise(residual, timesteps.flatten(0, 1), samples.flatten(0, 1)) + + result_sum = torch.sum(torch.abs(pred_prev_sample)) + result_mean = torch.mean(torch.abs(pred_prev_sample)) + + assert abs(result_sum.item() - 1153.1833) < 1e-2 + assert abs(result_mean.item() - 0.5005) < 1e-3 + + def test_full_loop_no_noise(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + num_trained_timesteps = len(scheduler) + + model = self.dummy_model() + sample = self.dummy_sample_deter + generator = torch.manual_seed(0) + + for t in reversed(range(num_trained_timesteps)): + # 1. predict noise residual + residual = model(sample, t) + + # 2. predict previous mean of sample x_t-1 + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample + + sample = pred_prev_sample + + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_sum.item() - 258.9606) < 1e-2 + assert abs(result_mean.item() - 0.3372) < 1e-3 + + def test_full_loop_with_v_prediction(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config(prediction_type="v_prediction") + scheduler = scheduler_class(**scheduler_config) + + num_trained_timesteps = len(scheduler) + + model = self.dummy_model() + sample = self.dummy_sample_deter + generator = torch.manual_seed(0) + + for t in reversed(range(num_trained_timesteps)): + # 1. predict noise residual + residual = model(sample, t) + + # 2. 
predict previous mean of sample x_t-1 + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample + + sample = pred_prev_sample + + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_sum.item() - 202.0296) < 1e-2 + assert abs(result_mean.item() - 0.2631) < 1e-3 + + def test_custom_timesteps(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [100, 87, 50, 1, 0] + + scheduler.set_timesteps(timesteps=timesteps) + + scheduler_timesteps = scheduler.timesteps + + for i, timestep in enumerate(scheduler_timesteps): + if i == len(timesteps) - 1: + expected_prev_t = -1 + else: + expected_prev_t = timesteps[i + 1] + + prev_t = scheduler.previous_timestep(timestep) + prev_t = prev_t.item() + + self.assertEqual(prev_t, expected_prev_t) + + def test_custom_timesteps_increasing_order(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [100, 87, 50, 51, 0] + + with self.assertRaises(ValueError, msg="`custom_timesteps` must be in descending order."): + scheduler.set_timesteps(timesteps=timesteps) + + def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [100, 87, 50, 1, 0] + num_inference_steps = len(timesteps) + + with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `custom_timesteps`."): + scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps) + + def test_custom_timesteps_too_large(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [scheduler.config.num_train_timesteps] + + with self.assertRaises( + ValueError, + msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}", + ): + scheduler.set_timesteps(timesteps=timesteps) diff --git a/tests/schedulers/test_schedulers.py b/tests/schedulers/test_schedulers.py index 69cddb36dde2..a2d065f388bd 100755 --- a/tests/schedulers/test_schedulers.py +++ b/tests/schedulers/test_schedulers.py @@ -238,6 +238,12 @@ def get_scheduler_config(self): def dummy_model(self): def model(sample, t, *args): + # if t is a tensor, match the number of dimensions of sample + if isinstance(t, torch.Tensor): + num_dims = len(sample.shape) + # pad t with 1s to match num_dims + t = t.reshape(-1, *(1,) * (num_dims - 1)).to(sample.device).to(sample.dtype) + return sample * t / (t + 1) return model From 3ebbaf7c96801271f9e6c21400033b6aa5ffcf29 Mon Sep 17 00:00:00 2001 From: dqueue Date: Tue, 20 Jun 2023 05:09:51 -0700 Subject: [PATCH 145/199] Update control_brightness.mdx (#3825) --- docs/source/en/using-diffusers/control_brightness.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/using-diffusers/control_brightness.mdx b/docs/source/en/using-diffusers/control_brightness.mdx index 91ba8692028f..d8c0b1278f60 100644 --- a/docs/source/en/using-diffusers/control_brightness.mdx +++ b/docs/source/en/using-diffusers/control_brightness.mdx @@ -24,7 +24,7 @@ Next, configure the following parameters in the [`DDIMScheduler`]: ```py >>> 
from diffusers import DiffusionPipeline, DDIMScheduler ->>> pipeline = DiffusioPipeline.from_pretrained("ptx0/pseudo-journey-v2") +>>> pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2") # switch the scheduler in the pipeline to use the DDIMScheduler >>> pipeline.scheduler = DDIMScheduler.from_config( @@ -42,4 +42,4 @@ image = pipeline(prompt, guidance_rescale=0.7).images[0]
-
\ No newline at end of file + From ef3844d3a83583f36d0166be6753d062b3cbd7dc Mon Sep 17 00:00:00 2001 From: Hans Brouwer Date: Wed, 21 Jun 2023 13:11:45 +0200 Subject: [PATCH 146/199] Support ControlNet models with different number of channels in control images (#3815) support ControlNet models with a different hint_channels value (e.g. TemporalNet2) --- src/diffusers/models/controlnet.py | 2 ++ src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 0b0ce0be547f..88562dd37161 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -93,6 +93,7 @@ class ControlNetModel(ModelMixin, ConfigMixin): def __init__( self, in_channels: int = 4, + conditioning_channels: int = 3, flip_sin_to_cos: bool = True, freq_shift: int = 0, down_block_types: Tuple[str] = ( @@ -185,6 +186,7 @@ def __init__( self.controlnet_cond_embedding = ControlNetConditioningEmbedding( conditioning_embedding_channels=block_out_channels[0], block_out_channels=conditioning_embedding_out_channels, + conditioning_channels=conditioning_channels, ) self.down_blocks = nn.ModuleList([]) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index e59b91e486f5..746844ea1e0a 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -286,6 +286,7 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa "use_linear_projection": use_linear_projection, "class_embed_type": class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, + "conditioning_channels": unet_params.hint_channels, } if not controlnet: From 95ea538c7969b74f1da8971dfd3bfe3e794c96cc Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Wed, 21 Jun 2023 07:23:18 -1000 Subject: [PATCH 147/199] Add ddpm kandinsky (#3783) * update doc --------- Co-authored-by: yiyixuxu --- docs/source/en/api/pipelines/kandinsky.mdx | 14 ++++++++++++++ .../pipelines/kandinsky/pipeline_kandinsky.py | 9 +++------ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 1cac9810980f..bf551249ef05 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -55,6 +55,20 @@ t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1" t2i_pipe.to("cuda") ``` + + +By default, the text-to-image pipeline use [`DDIMScheduler`], you can change the scheduler to [`DDPMScheduler`] + +```py +scheduler = DDPMScheduler.from_pretrained("kandinsky-community/kandinsky-2-1", subfolder="ddpm_scheduler") +t2i_pipe = DiffusionPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-1", scheduler=scheduler, torch_dtype=torch.float16 +) +t2i_pipe.to("cuda") +``` + + + Now we pass the prompt through the prior to generate image embeddings. The prior returns both the image embeddings corresponding to the prompt and negative/unconditional image embeddings corresponding to an empty string. 
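For instance, a minimal sketch of this prior step might look like the following (the checkpoint name and call arguments here are illustrative and assume the `KandinskyPriorPipeline` API rather than anything introduced by this patch):

```py
import torch

from diffusers import KandinskyPriorPipeline

# Load the prior pipeline (illustrative checkpoint name)
pipe_prior = KandinskyPriorPipeline.from_pretrained(
    "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
)
pipe_prior.to("cuda")

prompt = "red cat, 4k photo"

# The prior returns image embeddings for the prompt and
# negative/unconditional image embeddings for an empty string
image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple()
```

Both embeddings are then passed to the text-to-image pipeline alongside the prompt to generate the final image.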
diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 6de9cf4451de..7b3537ea6895 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -22,7 +22,7 @@ from ...models import UNet2DConditionModel, VQModel from ...pipelines import DiffusionPipeline from ...pipelines.pipeline_utils import ImagePipelineOutput -from ...schedulers import DDIMScheduler +from ...schedulers import DDIMScheduler, DDPMScheduler from ...utils import ( is_accelerate_available, is_accelerate_version, @@ -88,7 +88,7 @@ class KandinskyPipeline(DiffusionPipeline): Frozen text-encoder. tokenizer ([`XLMRobertaTokenizer`]): Tokenizer of class - scheduler ([`DDIMScheduler`]): + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): A scheduler to be used in combination with `unet` to generate image latents. unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the image embedding. @@ -101,7 +101,7 @@ def __init__( text_encoder: MultilingualCLIP, tokenizer: XLMRobertaTokenizer, unet: UNet2DConditionModel, - scheduler: DDIMScheduler, + scheduler: Union[DDIMScheduler, DDPMScheduler], movq: VQModel, ): super().__init__() @@ -439,9 +439,6 @@ def __call__( noise_pred, t, latents, - # YiYi notes: only reason this pipeline can't work with unclip scheduler is that can't pass down this argument - # need to use DDPM scheduler instead - # prev_timestep=prev_timestep, generator=generator, ).prev_sample # post-processing From 1f02087607aa70948a2546206c58804b59381a6f Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Wed, 21 Jun 2023 11:07:23 -0700 Subject: [PATCH 148/199] [docs] More API stuff (#3835) * clean up loaders * clean up rest of main class apis * apply feedback --- docs/source/en/_toctree.yml | 2 +- docs/source/en/api/configuration.mdx | 9 +- docs/source/en/api/diffusion_pipeline.mdx | 6 +- docs/source/en/api/image_processor.mdx | 16 +- docs/source/en/api/loaders.mdx | 21 +- docs/source/en/api/logging.mdx | 36 +- docs/source/en/api/outputs.mdx | 6 +- src/diffusers/configuration_utils.py | 47 +-- src/diffusers/image_processor.py | 34 +- src/diffusers/loaders.py | 329 ++++++++---------- .../pipelines/pipeline_flax_utils.py | 4 +- src/diffusers/pipelines/pipeline_utils.py | 244 ++++++------- .../unidiffuser/pipeline_unidiffuser.py | 6 +- src/diffusers/utils/logging.py | 33 +- src/diffusers/utils/outputs.py | 6 +- 15 files changed, 348 insertions(+), 451 deletions(-) diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index a8884ac7dbe9..e904067b31e4 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -149,7 +149,7 @@ - local: api/utilities title: Utilities - local: api/image_processor - title: Vae Image Processor + title: VAE Image Processor title: Main Classes - sections: - local: api/pipelines/overview diff --git a/docs/source/en/api/configuration.mdx b/docs/source/en/api/configuration.mdx index 2bbb42d92538..a10e348acdef 100644 --- a/docs/source/en/api/configuration.mdx +++ b/docs/source/en/api/configuration.mdx @@ -12,8 +12,13 @@ specific language governing permissions and limitations under the License. 
# Configuration -Schedulers from [`~schedulers.scheduling_utils.SchedulerMixin`] and models from [`ModelMixin`] inherit from [`ConfigMixin`] which conveniently takes care of storing all the parameters that are -passed to their respective `__init__` methods in a JSON-configuration file. +Schedulers from [`~schedulers.scheduling_utils.SchedulerMixin`] and models from [`ModelMixin`] inherit from [`ConfigMixin`] which stores all the parameters that are passed to their respective `__init__` methods in a JSON-configuration file. + + + +To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `huggingface-cli login`. + + ## ConfigMixin diff --git a/docs/source/en/api/diffusion_pipeline.mdx b/docs/source/en/api/diffusion_pipeline.mdx index a47025a3e94a..d99443002469 100644 --- a/docs/source/en/api/diffusion_pipeline.mdx +++ b/docs/source/en/api/diffusion_pipeline.mdx @@ -12,12 +12,12 @@ specific language governing permissions and limitations under the License. # Pipelines -The [`DiffusionPipeline`] is the easiest way to load any pretrained diffusion pipeline from the [Hub](https://huggingface.co/models?library=diffusers) and use it for inference. +The [`DiffusionPipeline`] is the quickest way to load any pretrained diffusion pipeline from the [Hub](https://huggingface.co/models?library=diffusers) for inference. - + You shouldn't use the [`DiffusionPipeline`] class for training or finetuning a diffusion model. Individual -components (for example, [`UNetModel`] and [`UNetConditionModel`]) of diffusion pipelines are usually trained individually, so we suggest directly working with instead. +components (for example, [`UNet2DModel`] and [`UNet2DConditionModel`]) of diffusion pipelines are usually trained individually, so we suggest directly working with them instead. diff --git a/docs/source/en/api/image_processor.mdx b/docs/source/en/api/image_processor.mdx index e2ed4ad48c19..7fc66f5ee68e 100644 --- a/docs/source/en/api/image_processor.mdx +++ b/docs/source/en/api/image_processor.mdx @@ -10,24 +10,18 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# Image Processor for VAE - -Image processor provides a unified API for Stable Diffusion pipelines to prepare their image inputs for VAE encoding, as well as post-processing their outputs once decoded. This includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and Numpy arrays. - -All pipelines with VAE image processor will accept image inputs in the format of PIL Image, PyTorch tensor, or Numpy array, and will able to return outputs in the format of PIL Image, Pytorch tensor, and Numpy array based on the `output_type` argument from the user. Additionally, the User can pass encoded image latents directly to the pipeline, or ask the pipeline to return latents as output with `output_type = 'pt'` argument. This allows you to take the generated latents from one pipeline and pass it to another pipeline as input, without ever having to leave the latent space. It also makes it much easier to use multiple pipelines together, by passing PyTorch tensors directly between different pipelines. - - -# Image Processor for VAE adapted to LDM3D - -LDM3D Image processor does the same as the Image processor for VAE but accepts both RGB and depth inputs and will return RGB and depth outputs. 
+# VAE Image Processor +The [`VaeImageProcessor`] provides a unified API for [`StableDiffusionPipeline`]'s to prepare image inputs for VAE encoding and post-processing outputs once they're decoded. This includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays. +All pipelines with [`VaeImageProcessor`] accepts PIL Image, PyTorch tensor, or NumPy arrays as image inputs and returns outputs based on the `output_type` argument by the user. You can pass encoded image latents directly to the pipeline and return latents from the pipeline as a specific output with the `output_type` argument (for example `output_type="pt"`). This allows you to take the generated latents from one pipeline and pass it to another pipeline as input without leaving the latent space. It also makes it much easier to use multiple pipelines together by passing PyTorch tensors directly between different pipelines. ## VaeImageProcessor [[autodoc]] image_processor.VaeImageProcessor - ## VaeImageProcessorLDM3D +The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs. + [[autodoc]] image_processor.VaeImageProcessorLDM3D \ No newline at end of file diff --git a/docs/source/en/api/loaders.mdx b/docs/source/en/api/loaders.mdx index 20134a0afe66..a236a6c70b6c 100644 --- a/docs/source/en/api/loaders.mdx +++ b/docs/source/en/api/loaders.mdx @@ -12,31 +12,26 @@ specific language governing permissions and limitations under the License. # Loaders -There are many ways to train adapter neural networks for diffusion models, such as -- [Textual Inversion](./training/text_inversion.mdx) -- [LoRA](https://github.com/cloneofsimo/lora) -- [Hypernetworks](https://arxiv.org/abs/1609.09106) +Adapters (textual inversion, LoRA, hypernetworks) allow you to modify a diffusion model to generate images in a specific style without training or finetuning the entire model. The adapter weights are typically only a tiny fraction of the pretrained model's which making them very portable. 🤗 Diffusers provides an easy-to-use `LoaderMixin` API to load adapter weights. -Such adapter neural networks often only consist of a fraction of the number of weights compared -to the pretrained model and as such are very portable. The Diffusers library offers an easy-to-use -API to load such adapter neural networks via the [`loaders.py` module](https://github.com/huggingface/diffusers/blob/main/src/diffusers/loaders.py). + -**Note**: This module is still highly experimental and prone to future changes. +🧪 The `LoaderMixins` are highly experimental and prone to future changes. To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log-in with `huggingface-cli login`. -## LoaderMixins + -### UNet2DConditionLoadersMixin +## UNet2DConditionLoadersMixin [[autodoc]] loaders.UNet2DConditionLoadersMixin -### TextualInversionLoaderMixin +## TextualInversionLoaderMixin [[autodoc]] loaders.TextualInversionLoaderMixin -### LoraLoaderMixin +## LoraLoaderMixin [[autodoc]] loaders.LoraLoaderMixin -### FromCkptMixin +## FromCkptMixin [[autodoc]] loaders.FromCkptMixin diff --git a/docs/source/en/api/logging.mdx b/docs/source/en/api/logging.mdx index bb973db781ea..5de2716434b8 100644 --- a/docs/source/en/api/logging.mdx +++ b/docs/source/en/api/logging.mdx @@ -12,12 +12,9 @@ specific language governing permissions and limitations under the License. 
# Logging -🧨 Diffusers has a centralized logging system, so that you can setup the verbosity of the library easily. +🤗 Diffusers has a centralized logging system to easily manage the verbosity of the library. The default verbosity is set to `WARNING`. -Currently the default verbosity of the library is `WARNING`. - -To change the level of verbosity, just use one of the direct setters. For instance, here is how to change the verbosity -to the INFO level. +To change the verbosity level, use one of the direct setters. For instance, to change the verbosity to the `INFO` level. ```python import diffusers @@ -33,7 +30,7 @@ DIFFUSERS_VERBOSITY=error ./myprogram.py ``` Additionally, some `warnings` can be disabled by setting the environment variable -`DIFFUSERS_NO_ADVISORY_WARNINGS` to a true value, like *1*. This will disable any warning that is logged using +`DIFFUSERS_NO_ADVISORY_WARNINGS` to a true value, like `1`. This disables any warning logged by [`logger.warning_advice`]. For example: ```bash @@ -52,20 +49,21 @@ logger.warning("WARN") ``` -All the methods of this logging module are documented below, the main ones are +All methods of the logging module are documented below. The main methods are [`logging.get_verbosity`] to get the current level of verbosity in the logger and -[`logging.set_verbosity`] to set the verbosity to the level of your choice. In order (from the least -verbose to the most verbose), those levels (with their corresponding int values in parenthesis) are: - -- `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL` (int value, 50): only report the most - critical errors. -- `diffusers.logging.ERROR` (int value, 40): only report errors. -- `diffusers.logging.WARNING` or `diffusers.logging.WARN` (int value, 30): only reports error and - warnings. This is the default level used by the library. -- `diffusers.logging.INFO` (int value, 20): reports error, warnings and basic information. -- `diffusers.logging.DEBUG` (int value, 10): report all information. - -By default, `tqdm` progress bars will be displayed during model download. [`logging.disable_progress_bar`] and [`logging.enable_progress_bar`] can be used to suppress or unsuppress this behavior. +[`logging.set_verbosity`] to set the verbosity to the level of your choice. + +In order from the least verbose to the most verbose: + +| Method | Integer value | Description | +|----------------------------------------------------------:|--------------:|----------------------------------------------------:| +| `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL` | 50 | only report the most critical errors | +| `diffusers.logging.ERROR` | 40 | only report errors | +| `diffusers.logging.WARNING` or `diffusers.logging.WARN` | 30 | only report errors and warnings (default) | +| `diffusers.logging.INFO` | 20 | only report errors, warnings, and basic information | +| `diffusers.logging.DEBUG` | 10 | report all information | + +By default, `tqdm` progress bars are displayed during model download. [`logging.disable_progress_bar`] and [`logging.enable_progress_bar`] are used to enable or disable this behavior. ## Base setters diff --git a/docs/source/en/api/outputs.mdx b/docs/source/en/api/outputs.mdx index 1e9fbedba35b..ec64d36498ee 100644 --- a/docs/source/en/api/outputs.mdx +++ b/docs/source/en/api/outputs.mdx @@ -10,11 +10,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. 
--> -# BaseOutputs +# Outputs -All models have outputs that are subclasses of [`~utils.BaseOutput`]. Those are -data structures containing all the information returned by the model, but they can also be used as tuples or -dictionaries. +All models outputs are subclasses of [`~utils.BaseOutput`], data structures containing all the information returned by the model. The outputs can also be used as tuples or dictionaries. For example: diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index bb5adf3e9444..1a030e467134 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -81,10 +81,9 @@ def __setitem__(self, name, value): class ConfigMixin: r""" - Base class for all configuration classes. Stores all configuration parameters under `self.config` Also handles all - methods for loading/downloading/saving classes inheriting from [`ConfigMixin`] with - - [`~ConfigMixin.from_config`] - - [`~ConfigMixin.save_config`] + Base class for all configuration classes. All configuration parameters are stored under `self.config`. Also + provides the [`~ConfigMixin.from_config`] and [`~ConfigMixin.save_config`] methods for loading, downloading, and + saving classes that inherit from [`ConfigMixin`]. Class attributes: - **config_name** (`str`) -- A filename under which the config should stored when calling @@ -92,7 +91,7 @@ class ConfigMixin: - **ignore_for_config** (`List[str]`) -- A list of attributes that should not be saved in the config (should be overridden by subclass). - **has_compatibles** (`bool`) -- Whether the class has compatible classes (should be overridden by subclass). - - **_deprecated_kwargs** (`List[str]`) -- Keyword arguments that are deprecated. Note that the init function + - **_deprecated_kwargs** (`List[str]`) -- Keyword arguments that are deprecated. Note that the `init` function should only have a `kwargs` argument if at least one argument is deprecated (should be overridden by subclass). """ @@ -139,12 +138,12 @@ def __getattr__(self, name: str) -> Any: def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): """ - Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the + Save a configuration object to the directory specified in `save_directory` so that it can be reloaded using the [`~ConfigMixin.from_config`] class method. Args: save_directory (`str` or `os.PathLike`): - Directory where the configuration JSON file will be saved (will be created if it does not exist). + Directory where the configuration JSON file is saved (will be created if it does not exist). """ if os.path.isfile(save_directory): raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") @@ -164,15 +163,14 @@ def from_config(cls, config: Union[FrozenDict, Dict[str, Any]] = None, return_un Parameters: config (`Dict[str, Any]`): - A config dictionary from which the Python class will be instantiated. Make sure to only load - configuration files of compatible classes. + A config dictionary from which the Python class is instantiated. Make sure to only load configuration + files of compatible classes. return_unused_kwargs (`bool`, *optional*, defaults to `False`): Whether kwargs that are not consumed by the Python class should be returned or not. - kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to update the configuration object (after it is loaded) and initiate the Python class. 
- `**kwargs` are directly passed to the underlying scheduler/model's `__init__` method and eventually - overwrite same named arguments in `config`. + `**kwargs` are passed directly to the underlying scheduler/model's `__init__` method and eventually + overwrite the same named arguments in `config`. Returns: [`ModelMixin`] or [`SchedulerMixin`]: @@ -280,16 +278,16 @@ def load_config( Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to False, any + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. output_loading_info(`bool`, *optional*, defaults to `False`): Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to True, the model - won’t be downloaded from the Hub. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from `diffusers-cli login` (stored in `~/.huggingface`) is used. @@ -307,14 +305,6 @@ def load_config( `dict`: A dictionary of all the parameters stored in a JSON configuration file. - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with - `huggingface-cli login`. You can also activate the special - ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to use this method in a - firewalled environment. - - """ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) force_download = kwargs.pop("force_download", False) @@ -536,10 +526,11 @@ def config(self) -> Dict[str, Any]: def to_json_string(self) -> str: """ - Serializes this instance to a JSON string. + Serializes the configuration instance to a JSON string. Returns: - `str`: String containing all the attributes that make up this configuration instance in JSON format. + `str`: + String containing all the attributes that make up the configuration instance in JSON format. """ config_dict = self._internal_dict if hasattr(self, "_internal_dict") else {} config_dict["_class_name"] = self.__class__.__name__ @@ -560,11 +551,11 @@ def to_json_saveable(value): def to_json_file(self, json_file_path: Union[str, os.PathLike]): """ - Save this instance to a JSON file. + Save the configuration instance's parameters to a JSON file. Args: json_file_path (`str` or `os.PathLike`): - Path to the JSON file in which this configuration instance's parameters will be saved. + Path to the JSON file to save a configuration instance's parameters. 
""" with open(json_file_path, "w", encoding="utf-8") as writer: writer.write(self.to_json_string()) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 4f3c61208539..2a433ee14d98 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -26,19 +26,18 @@ class VaeImageProcessor(ConfigMixin): """ - Image Processor for VAE + Image processor for VAE. Args: do_resize (`bool`, *optional*, defaults to `True`): Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept - `height` and `width` arguments from `preprocess` method + `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method. vae_scale_factor (`int`, *optional*, defaults to `8`): - VAE scale factor. If `do_resize` is True, the image will be automatically resized to multiples of this - factor. + VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor. resample (`str`, *optional*, defaults to `lanczos`): Resampling filter to use when resizing the image. do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image to [-1,1] + Whether to normalize the image to [-1,1]. do_convert_rgb (`bool`, *optional*, defaults to be `False`): Whether to convert the images to RGB format. """ @@ -75,7 +74,7 @@ def numpy_to_pil(images: np.ndarray) -> PIL.Image.Image: @staticmethod def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray: """ - Convert a PIL image or a list of PIL images to numpy arrays. + Convert a PIL image or a list of PIL images to NumPy arrays. """ if not isinstance(images, list): images = [images] @@ -87,7 +86,7 @@ def pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.nd @staticmethod def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: """ - Convert a numpy image to a pytorch tensor + Convert a NumPy image to a PyTorch tensor. """ if images.ndim == 3: images = images[..., None] @@ -98,7 +97,7 @@ def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: @staticmethod def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: """ - Convert a pytorch tensor to a numpy image + Convert a PyTorch tensor to a NumPy image. """ images = images.cpu().permute(0, 2, 3, 1).float().numpy() return images @@ -106,14 +105,14 @@ def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: @staticmethod def normalize(images): """ - Normalize an image array to [-1,1] + Normalize an image array to [-1,1]. """ return 2.0 * images - 1.0 @staticmethod def denormalize(images): """ - Denormalize an image array to [0,1] + Denormalize an image array to [0,1]. """ return (images / 2 + 0.5).clamp(0, 1) @@ -132,7 +131,7 @@ def resize( width: Optional[int] = None, ) -> PIL.Image.Image: """ - Resize a PIL image. Both height and width will be downscaled to the next integer multiple of `vae_scale_factor` + Resize a PIL image. Both height and width are downscaled to the next integer multiple of `vae_scale_factor`. """ if height is None: height = image.height @@ -152,7 +151,7 @@ def preprocess( width: Optional[int] = None, ) -> torch.Tensor: """ - Preprocess the image input, accepted formats are PIL images, numpy arrays or pytorch tensors" + Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors. 
""" supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) if isinstance(image, supported_formats): @@ -255,18 +254,17 @@ def postprocess( class VaeImageProcessorLDM3D(VaeImageProcessor): """ - Image Processor for VAE LDM3D. + Image processor for VAE LDM3D. Args: do_resize (`bool`, *optional*, defaults to `True`): Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. vae_scale_factor (`int`, *optional*, defaults to `8`): - VAE scale factor. If `do_resize` is True, the image will be automatically resized to multiples of this - factor. + VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor. resample (`str`, *optional*, defaults to `lanczos`): Resampling filter to use when resizing the image. do_normalize (`bool`, *optional*, defaults to `True`): - Whether to normalize the image to [-1,1] + Whether to normalize the image to [-1,1]. """ config_name = CONFIG_NAME @@ -284,7 +282,7 @@ def __init__( @staticmethod def numpy_to_pil(images): """ - Convert a numpy image or a batch of images to a PIL image. + Convert a NumPy image or a batch of images to a PIL image. """ if images.ndim == 3: images = images[None, ...] @@ -310,7 +308,7 @@ def rgblike_to_depthmap(image): def numpy_to_depth(self, images): """ - Convert a numpy depth image or a batch of images to a PIL image. + Convert a NumPy depth image or a batch of images to a PIL image. """ if images.ndim == 3: images = images[None, ...] diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 5cddb7690e52..ab1ac7d8bbdf 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -115,63 +115,50 @@ class UNet2DConditionLoadersMixin: def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): r""" - Load pretrained attention processor layers into `UNet2DConditionModel`. Attention processor layers have to be + Load pretrained attention processor layers into [`UNet2DConditionModel`]. Attention processor layers have to be defined in [`cross_attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py) and be a `torch.nn.Module` class. - - - This function is experimental and might change in the future. - - - Parameters: pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): Can be either: - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids should have an organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g., - `./my_model_directory/`. + - A string, the model id (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a directory (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. - A [torch state dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. 
force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `diffusers-cli login` (stored in `~/.huggingface`). + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. + The subfolder location of a model file within a larger model repository on the Hub or locally. mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. + Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. - - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - """ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) @@ -349,20 +336,21 @@ def save_attn_procs( **kwargs, ): r""" - Save an attention processor to a directory, so that it can be re-loaded using the + Save an attention processor to a directory so that it can be reloaded using the [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`] method. Arguments: save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. + Directory to save an attention processor to. Will be created if it doesn't exist. 
is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful when in distributed training like - TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on - the main process to avoid race conditions. + Whether the process calling this is the main process or not. Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. save_function (`Callable`): - The function to use to save the state dictionary. Useful on distributed training like TPUs when one - need to replace `torch.save` by another method. Can be configured with the environment variable + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method. Can be configured with the environment variable `DIFFUSERS_SAVE_MODE`. + """ weight_name = weight_name or deprecate( "weights_name", @@ -418,15 +406,14 @@ def save_function(weights, filename): class TextualInversionLoaderMixin: r""" - Mixin class for loading textual inversion tokens and embeddings to the tokenizer and text encoder. + Load textual inversion tokens and embeddings to the tokenizer and text encoder. """ def maybe_convert_prompt(self, prompt: Union[str, List[str]], tokenizer: "PreTrainedTokenizer"): r""" - Maybe convert a prompt into a "multi vector"-compatible prompt. If the prompt includes a token that corresponds - to a multi-vector textual inversion embedding, this function will process the prompt so that the special token - is replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual - inversion token or a textual inversion token that is a single vector, the input prompt is simply returned. + Processes prompts that include a special token corresponding to a multi-vector textual inversion embedding to + be replaced with multiple special tokens each corresponding to one of the vectors. If the prompt has no textual + inversion token or if the textual inversion token is a single vector, the input prompt is returned. Parameters: prompt (`str` or list of `str`): @@ -486,78 +473,61 @@ def load_textual_inversion( **kwargs, ): r""" - Load textual inversion embeddings into the text encoder of stable diffusion pipelines. Both `diffusers` and - `Automatic1111` formats are supported (see example below). - - - - This function is experimental and might change in the future. - - + Load textual inversion embeddings into the text encoder of [`StableDiffusionPipeline`] (both 🤗 Diffusers and + Automatic1111 formats are supported). Parameters: pretrained_model_name_or_path (`str` or `os.PathLike` or `List[str or os.PathLike]` or `Dict` or `List[Dict]`): - Can be either: + Can be either one of the following or a list of them: - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids should have an organization name, like - `"sd-concepts-library/low-poly-hd-logos-icons"`. - - A path to a *directory* containing textual inversion weights, e.g. - `./my_text_inversion_directory/`. - - A path to a *file* containing textual inversion weights, e.g. `./my_text_inversions.pt`. + - A string, the *model id* (for example `sd-concepts-library/low-poly-hd-logos-icons`) of a + pretrained model hosted on the Hub. 
+ - A path to a *directory* (for example `./my_text_inversion_directory/`) containing the textual + inversion weights. + - A path to a *file* (for example `./my_text_inversions.pt`) containing textual inversion weights. - A [torch state dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). - Or a list of those elements. token (`str` or `List[str]`, *optional*): Override the token to use for the textual inversion weights. If `pretrained_model_name_or_path` is a list, then `token` must also be a list of equal length. weight_name (`str`, *optional*): - Name of a custom weight file. This should be used in two cases: + Name of a custom weight file. This should be used when: - - The saved textual inversion file is in `diffusers` format, but was saved under a specific weight - name, such as `text_inv.bin`. - - The saved textual inversion file is in the "Automatic1111" form. + - The saved textual inversion file is in 🤗 Diffusers format, but was saved under a specific weight + name such as `text_inv.bin`. + - The saved textual inversion file is in the Automatic1111 format. cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `diffusers-cli login` (stored in `~/.huggingface`). + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. 
subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - + The subfolder location of a model file within a larger model repository on the Hub or locally. mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. - - - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). - - + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. Example: - To load a textual inversion embedding vector in `diffusers` format: + To load a textual inversion embedding vector in 🤗 Diffusers format: ```py from diffusers import StableDiffusionPipeline @@ -574,8 +544,9 @@ def load_textual_inversion( image.save("cat-backpack.png") ``` - To load a textual inversion embedding vector in Automatic1111 format, make sure to first download the vector, - e.g. from [civitAI](https://civitai.com/models/3036?modelVersionId=9857) and then load the vector locally: + To load a textual inversion embedding vector in Automatic1111 format, make sure to download the vector first + (for example from [civitAI](https://civitai.com/models/3036?modelVersionId=9857)) and then load the vector + locally: ```py from diffusers import StableDiffusionPipeline @@ -766,78 +737,56 @@ def load_textual_inversion( class LoraLoaderMixin: r""" - Utility class for handling the loading LoRA layers into UNet (of class [`UNet2DConditionModel`]) and Text Encoder - (of class [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)). - - - - This function is experimental and might change in the future. - - + Load LoRA layers into [`UNet2DConditionModel`] and + [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel). """ text_encoder_name = TEXT_ENCODER_NAME unet_name = UNET_NAME def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs): r""" - Load pretrained attention processor layers (such as LoRA) into [`UNet2DConditionModel`] and - [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel)). - - - - We support loading A1111 formatted LoRA checkpoints in a limited capacity. - - This function is experimental and might change in the future. - - + Load pretrained LoRA attention processor layers into [`UNet2DConditionModel`] and + [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel). Parameters: pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`): Can be either: - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids should have an organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g., - `./my_model_directory/`. 
+ - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`ModelMixin.save_pretrained`]. - A [torch state dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict). cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `diffusers-cli login` (stored in `~/.huggingface`). + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - + The subfolder location of a model file within a larger model repository on the Hub or locally. mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. - - - - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models). + Mirror source to resolve accessibility issues if you're downloading a model in China. 
We do not
+                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
+                information.
 
-
         """
         # Load the main state dict first which has the LoRA layers for either of
         # UNet and text encoder or both.
@@ -1062,7 +1011,7 @@ def _load_text_encoder_attn_procs(
             proxies (`Dict[str, str]`, *optional*):
                 A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only(`bool`, *optional*, defaults to `False`):
+            local_files_only (`bool`, *optional*, defaults to `False`):
                 Whether or not to only look at local files (i.e., do not try to download the model).
             use_auth_token (`str` or *bool*, *optional*):
                 The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
@@ -1210,26 +1159,23 @@ def save_lora_weights(
         safe_serialization: bool = False,
     ):
         r"""
-        Save the LoRA parameters corresponding to the UNet and the text encoder.
+        Save the LoRA parameters corresponding to the UNet and text encoder.
 
         Arguments:
             save_directory (`str` or `os.PathLike`):
-                Directory to which to save. Will be created if it doesn't exist.
+                Directory to save LoRA parameters to. Will be created if it doesn't exist.
             unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
-                State dict of the LoRA layers corresponding to the UNet. Specifying this helps to make the
-                serialization process easier and cleaner. Values can be both LoRA torch.nn.Modules layers or torch
-                weights.
+                State dict of the LoRA layers corresponding to the UNet.
             text_encoder_lora_layers (`Dict[str, torch.nn.Module] or `Dict[str, torch.Tensor]`):
-                State dict of the LoRA layers corresponding to the `text_encoder`. Since the `text_encoder` comes from
-                `transformers`, we cannot rejig it. That is why we have to explicitly pass the text encoder LoRA state
-                dict. Values can be both LoRA torch.nn.Modules layers or torch weights.
+                State dict of the LoRA layers corresponding to the `text_encoder`. Must explicitly pass the text
+                encoder LoRA state dict because it comes from 🤗 Transformers.
             is_main_process (`bool`, *optional*, defaults to `True`):
-                Whether the process calling this is the main process or not. Useful when in distributed training like
-                TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on
-                the main process to avoid race conditions.
+                Whether the process calling this is the main process or not. Useful during distributed training when
+                you need to call this function on all processes. In this case, set `is_main_process=True` only on the
+                main process to avoid race conditions.
             save_function (`Callable`):
-                The function to use to save the state dictionary. Useful on distributed training like TPUs when one
-                need to replace `torch.save` by another method. Can be configured with the environment variable
+                The function to use to save the state dictionary. Useful during distributed training when you need to
+                replace `torch.save` with another method. Can be configured with the environment variable
                 `DIFFUSERS_SAVE_MODE`.
         """
         if os.path.isfile(save_directory):
@@ -1331,73 +1277,72 @@ def _convert_kohya_lora_to_diffusers(self, state_dict):
 
 class FromCkptMixin:
-    """This helper class allows to directly load .ckpt stable diffusion file_extension
-    into the respective classes."""
+    """
+    Load model weights saved in the `.ckpt` format into a [`DiffusionPipeline`]. 
+ """ @classmethod def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): r""" - Instantiate a PyTorch diffusion pipeline from pre-trained pipeline weights saved in the original .ckpt format. - - The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). + Instantiate a [`DiffusionPipeline`] from pretrained pipeline weights saved in the `.ckpt` format. The pipeline + is set in evaluation mode (`model.eval()`) by default. Parameters: pretrained_model_link_or_path (`str` or `os.PathLike`, *optional*): Can be either: - - A link to the .ckpt file on the Hub. Should be in the format - `"https://huggingface.co//blob/main/"` + - A link to the `.ckpt` file (for example + `"https://huggingface.co//blob/main/.ckpt"`) on the Hub. - A path to a *file* containing all pipeline weights. torch_dtype (`str` or `torch.dtype`, *optional*): - Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype - will be automatically derived from the model's weights. + Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the + dtype is automatically derived from the model's weights. force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. local_files_only (`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). + Whether to only load local model weights and configuration files or not. If set to True, the model + won't be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. 
use_safetensors (`bool`, *optional*, defaults to `None`): - If set to `None`, the pipeline will load the `safetensors` weights if they're available **and** if the - `safetensors` library is installed. If set to `True`, the pipeline will forcibly load the models from - `safetensors` weights. If set to `False` the pipeline will *not* use `safetensors`. - extract_ema (`bool`, *optional*, defaults to `False`): Only relevant for - checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights or not. Defaults - to `False`. Pass `True` to extract the EMA weights. EMA weights usually yield higher quality images for - inference. Non-EMA weights are usually better to continue fine-tuning. + If set to `None`, the safetensors weights are downloaded if they're available **and** if the + safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors + weights. If set to `False`, safetensors weights are not loaded. + extract_ema (`bool`, *optional*, defaults to `False`): + Whether to extract the EMA weights or not. Pass `True` to extract the EMA weights which usually yield + higher quality images for inference. Non-EMA weights are usually better to continue finetuning. upcast_attention (`bool`, *optional*, defaults to `None`): - Whether the attention computation should always be upcasted. This is necessary when running stable + Whether the attention computation should always be upcasted. image_size (`int`, *optional*, defaults to 512): - The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2 - Base. Use 768 for Stable Diffusion v2. + The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable + Diffusion v2 base model. Use 768 for Stable Diffusion v2. prediction_type (`str`, *optional*): - The prediction type that the model was trained on. Use `'epsilon'` for Stable Diffusion v1.X and Stable - Diffusion v2 Base. Use `'v_prediction'` for Stable Diffusion v2. - num_in_channels (`int`, *optional*, defaults to None): + The prediction type the model was trained on. Use `'epsilon'` for all Stable Diffusion v1 models and + the Stable Diffusion v2 base model. Use `'v_prediction'` for Stable Diffusion v2. + num_in_channels (`int`, *optional*, defaults to `None`): The number of input channels. If `None`, it will be automatically inferred. - scheduler_type (`str`, *optional*, defaults to 'pndm'): + scheduler_type (`str`, *optional*, defaults to `"pndm"`): Type of scheduler to use. Should be one of `["pndm", "lms", "heun", "euler", "euler-ancestral", "dpm", "ddim"]`. load_safety_checker (`bool`, *optional*, defaults to `True`): - Whether to load the safety checker or not. Defaults to `True`. + Whether to load the safety checker or not. kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the - specific pipeline class. The overwritten components are then directly passed to the pipelines - `__init__` method. See example below for more information. + Can be used to overwrite load and saveable variables (for example the pipeline components of the + specific pipeline class). The overwritten components are directly passed to the pipelines `__init__` + method. See example below for more information. 
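
        As a rough sketch of how these options fit together (the checkpoint URL below is a placeholder, and every
        keyword simply echoes the defaults described above):

        ```py
        from diffusers import StableDiffusionPipeline

        # Placeholder URL; point this at a real `.ckpt` file on the Hub or at a local path.
        ckpt = "https://huggingface.co/my-org/my-model/blob/main/model.ckpt"

        pipe = StableDiffusionPipeline.from_ckpt(
            ckpt,
            extract_ema=True,  # EMA weights usually yield higher quality images for inference
            image_size=512,  # 512 for Stable Diffusion v1 and the v2 base model, 768 for Stable Diffusion v2
            prediction_type="epsilon",  # "v_prediction" for Stable Diffusion v2
            scheduler_type="pndm",
            load_safety_checker=True,
        )
        ```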
Examples: diff --git a/src/diffusers/pipelines/pipeline_flax_utils.py b/src/diffusers/pipelines/pipeline_flax_utils.py index 6ab0b80ee655..e1c4b9f53953 100644 --- a/src/diffusers/pipelines/pipeline_flax_utils.py +++ b/src/diffusers/pipelines/pipeline_flax_utils.py @@ -83,8 +83,8 @@ class FlaxImagePipelineOutput(BaseOutput): Args: images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. """ images: Union[List[PIL.Image.Image], np.ndarray] diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index d32c240dedaf..734af819c852 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -115,8 +115,8 @@ class ImagePipelineOutput(BaseOutput): Args: images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. """ images: Union[List[PIL.Image.Image], np.ndarray] @@ -129,8 +129,7 @@ class AudioPipelineOutput(BaseOutput): Args: audios (`np.ndarray`) - List of denoised samples of shape `(batch_size, num_channels, sample_rate)`. Numpy array present the - denoised audio samples of the diffusion pipeline. + List of denoised audio samples of a NumPy array of shape `(batch_size, num_channels, sample_rate)`. """ audios: np.ndarray @@ -458,20 +457,20 @@ def load_sub_model( class DiffusionPipeline(ConfigMixin): r""" - Base class for all models. + Base class for all pipelines. - [`DiffusionPipeline`] takes care of storing all components (models, schedulers, processors) for diffusion pipelines - and handles methods for loading, downloading and saving models as well as a few methods common to all pipelines to: + [`DiffusionPipeline`] stores all components (models, schedulers, and processors) for diffusion pipelines and + provides methods for loading, downloading and saving models. It also includes methods to: - move all PyTorch modules to the device of your choice - enabling/disabling the progress bar for the denoising iteration Class attributes: - - **config_name** (`str`) -- name of the config file that will store the class and module names of all - components of the diffusion pipeline. - - **_optional_components** (List[`str`]) -- list of all components that are optional so they don't have to be - passed for the pipeline to function (should be overridden by subclasses). + - **config_name** (`str`) -- The configuration filename that stores the class and module names of all the + diffusion pipeline's components. + - **_optional_components** (List[`str`]) -- List of all optional components that don't have to be passed to the + pipeline to function (should be overridden by subclasses). """ config_name = "model_index.json" _optional_components = [] @@ -541,17 +540,17 @@ def save_pretrained( variant: Optional[str] = None, ): """ - Save all variables of the pipeline that can be saved and loaded as well as the pipelines configuration file to - a directory. 
A pipeline variable can be saved and loaded if its class implements both a save and loading - method. The pipeline can easily be re-loaded using the [`~DiffusionPipeline.from_pretrained`] class method. + Save all saveable variables of the pipeline to a directory. A pipeline variable can be saved and loaded if its + class implements both a save and loading method. The pipeline is easily reloaded using the + [`~DiffusionPipeline.from_pretrained`] class method. Arguments: save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. + Directory to save a pipeline to. Will be created if it doesn't exist. safe_serialization (`bool`, *optional*, defaults to `False`): - Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. variant (`str`, *optional*): - If specified, weights are saved in the format pytorch_model..bin. + If specified, weights are saved in the format `pytorch_model..bin`. """ model_index_dict = dict(self.config) model_index_dict.pop("_class_name", None) @@ -714,69 +713,51 @@ def device(self) -> torch.device: @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" - Instantiate a PyTorch diffusion pipeline from pre-trained pipeline weights. + Instantiate a PyTorch diffusion pipeline from pretrained pipeline weights. - The pipeline is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). + The pipeline is set in evaluation mode (`model.eval()`) by default. - The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come - pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning - task. + If you get the error message below, you need to finetune the weights for your downstream task: - The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those - weights are discarded. + ``` + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` Parameters: pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): Can be either: - - A string, the *repo id* of a pretrained pipeline hosted inside a model repo on - https://huggingface.co/ Valid repo ids have to be located under a user or organization name, like - `CompVis/ldm-text2im-large-256`. - - A path to a *directory* containing pipeline weights saved using - [`~DiffusionPipeline.save_pretrained`], e.g., `./my_pipeline_directory/`. + - A string, the *repo id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline + hosted on the Hub. + - A path to a *directory* (for example `./my_pipeline_directory/`) containing pipeline weights + saved using + [`~DiffusionPipeline.save_pretrained`]. torch_dtype (`str` or `torch.dtype`, *optional*): - Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype - will be automatically derived from the model's weights. 
+ Override the default `torch.dtype` and load the model with another dtype. If "auto" is passed, the + dtype is automatically derived from the model's weights. custom_pipeline (`str`, *optional*): - This is an experimental feature and is likely to change in the future. + 🧪 This is an experimental feature and may change in the future. Can be either: - - A string, the *repo id* of a custom pipeline hosted inside a model repo on - https://huggingface.co/. Valid repo ids have to be located under a user or organization name, - like `hf-internal-testing/diffusers-dummy-pipeline`. - - - - It is required that the model repo has a file, called `pipeline.py` that defines the custom - pipeline. - - - + - A string, the *repo id* (for example `hf-internal-testing/diffusers-dummy-pipeline`) of a custom + pipeline hosted on the Hub. The repository must contain a file called pipeline.py that defines + the custom pipeline. - A string, the *file name* of a community pipeline hosted on GitHub under - https://github.com/huggingface/diffusers/tree/main/examples/community. Valid file names have to - match exactly the file name without `.py` located under the above link, *e.g.* - `clip_guided_stable_diffusion`. - - - - Community pipelines are always loaded from the current `main` branch of GitHub. - - - - - A path to a *directory* containing a custom pipeline, e.g., `./my_pipeline_directory/`. - - - - It is required that the directory has a file, called `pipeline.py` that defines the custom - pipeline. + [Community](https://github.com/huggingface/diffusers/tree/main/examples/community). Valid file + names must match the file name and not the pipeline script (`clip_guided_stable_diffusion` + instead of `clip_guided_stable_diffusion.py`). Community pipelines are always loaded from the + current main branch of GitHub. + - A path to a directory (`./my_pipeline_directory/`) containing a custom pipeline. The directory + must contain a file called `pipeline.py` that defines the custom pipeline. - For more information on how to load and create custom pipelines, please have a look at [Loading and Adding Custom @@ -786,78 +767,71 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. output_loading_info(`bool`, *optional*, defaults to `False`): Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. 
- local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `huggingface-cli login` (stored in `~/.huggingface`). + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. - custom_revision (`str`, *optional*, defaults to `"main"` when loading from the Hub and to local version of `diffusers` when loading from GitHub): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + custom_revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id similar to - `revision` when loading a custom pipeline from the Hub. It can be a diffusers version when loading a - custom pipeline from GitHub. + `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a + custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub. mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. specify the folder name here. + Mirror source to resolve accessibility issues if you’re downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): - A map that specifies where each submodule should go. It doesn't need to be refined to each - parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the + A map that specifies where each submodule should go. It doesn’t need to be defined for each + parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the same device. - To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For + Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For more information about each option see [designing a device map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). max_memory (`Dict`, *optional*): - A dictionary device identifier to maximum memory. Will default to the maximum memory available for each - GPU and the available CPU RAM if unset. + A dictionary device identifier for the maximum memory. Will default to the maximum memory available for + each GPU and the available CPU RAM if unset. 
offload_folder (`str` or `os.PathLike`, *optional*):
-                If the `device_map` contains any value `"disk"`, the folder where we will offload weights.
+                The path to offload weights if `device_map` contains the value `"disk"`.
             offload_state_dict (`bool`, *optional*):
-                If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU
-                RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to
-                `True` when there is some disk offload.
+                If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if
+                the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True`
+                when there is some disk offload.
             low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
-                Speed up model loading by not initializing the weights and only loading the pre-trained weights. This
-                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
-                model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch,
-                setting this argument to `True` will raise an error.
+                Speed up model loading by only loading the pretrained weights and not initializing the weights. This
+                also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the
+                model. Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
+                argument to `True` will raise an error.
             use_safetensors (`bool`, *optional*, defaults to `None`):
-                If set to `None`, the pipeline will load the `safetensors` weights if they're available **and** if the
-                `safetensors` library is installed. If set to `True`, the pipeline will forcibly load the models from
-                `safetensors` weights. If set to `False` the pipeline will *not* use `safetensors`.
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
             kwargs (remaining dictionary of keyword arguments, *optional*):
-                Can be used to overwrite load - and saveable variables - *i.e.* the pipeline components - of the
-                specific pipeline class. The overwritten components are then directly passed to the pipelines
-                `__init__` method. See example below for more information.
+                Can be used to overwrite load and saveable variables (the pipeline components of the specific pipeline
+                class). The overwritten components are passed directly to the pipelines `__init__` method. See example
+                below for more information.
             variant (`str`, *optional*):
-                If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. `variant` is
-                ignored when using `from_flax`.
-
-
-
-        It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated
-        models](https://huggingface.co/docs/hub/models-gated#gated-models), *e.g.* `"runwayml/stable-diffusion-v1-5"`
-
-
+                Load weights from a specified variant filename such as `"fp16"` or `"ema"`. This is ignored when
+                loading `from_flax`.
 
-        Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use
-        this method in a firewalled environment.
+        To use private or [gated](https://huggingface.co/docs/hub/models-gated#gated-models) models, log in with
+        `huggingface-cli login`. 
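
        Putting the main loading options together, a minimal sketch might look like the following (the model id
        matches the one used in the error message above; every keyword choice here is illustrative rather than
        required):

        ```py
        import torch
        from diffusers import DiffusionPipeline

        pipe = DiffusionPipeline.from_pretrained(
            "runwayml/stable-diffusion-v1-5",
            torch_dtype=torch.float16,  # load the weights in half precision
            variant="fp16",  # pick the fp16 weight files if the repository provides them
            use_safetensors=True,  # prefer safetensors weights when they are available
        )
        pipe = pipe.to("cuda")
        ```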
@@ -1108,12 +1082,12 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: Parameters: pretrained_model_name (`str` or `os.PathLike`, *optional*): - A string, the repository id (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline + A string, the *repository id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline hosted on the Hub. custom_pipeline (`str`, *optional*): Can be either: - - A string, the repository id (for example `CompVis/ldm-text2im-large-256`) of a pretrained + - A string, the *repository id* (for example `CompVis/ldm-text2im-large-256`) of a pretrained pipeline hosted on the Hub. The repository must contain a file called `pipeline.py` that defines the custom pipeline. @@ -1139,27 +1113,26 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to resume downloading the model weights and configuration files. If set to False, any + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. output_loading_info(`bool`, *optional*, defaults to `False`): Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to True, the model - won’t be downloaded from the Hub. + local_files_only (`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier allowed by Git. - custom_revision (`str`, *optional*, defaults to `"main"` when loading from the Hub and to local version of - `diffusers` when loading from GitHub): + custom_revision (`str`, *optional*, defaults to `"main"`): The specific model version to use. It can be a branch name, a tag name, or a commit id similar to - `revision` when loading a custom pipeline from the Hub. It can be a diffusers version when loading a - custom pipeline from GitHub. + `revision` when loading a custom pipeline from the Hub. It can be a 🤗 Diffusers version when loading a + custom pipeline from GitHub, otherwise it defaults to `"main"` when loading from the Hub. mirror (`str`, *optional*): Mirror source to resolve accessibility issues if you're downloading a model in China. 
We do not
                 guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                 information.

@@ -1365,9 +1338,11 @@ def _get_signature_keys(obj):
     @property
     def components(self) -> Dict[str, Any]:
         r"""
-        The `self.components` property can be useful to run different pipelines with the same weights and
-        configurations to not have to re-allocate memory.
+        The `self.components` property can be useful to run different pipelines with the same weights and
+        configurations without reallocating additional memory.
+
+        Returns (`dict`):
+            A dictionary containing all the modules needed to initialize the pipeline.
 
         Examples:
 
@@ -1382,9 +1357,6 @@ def components(self) -> Dict[str, Any]:
         >>> img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
         >>> inpaint = StableDiffusionInpaintPipeline(**text2img.components)
         ```
-
-        Returns:
-            A dictionary containing all the modules needed to initialize the pipeline.
         """
         expected_modules, optional_parameters = self._get_signature_keys(self)
         components = {
@@ -1402,7 +1374,7 @@ def components(self) -> Dict[str, Any]:
     @staticmethod
     def numpy_to_pil(images):
         """
-        Convert a numpy image or a batch of images to a PIL image.
+        Convert a NumPy image or a batch of images to a PIL image.
         """
         return numpy_to_pil(images)
 
@@ -1426,13 +1398,17 @@ def set_progress_bar_config(self, **kwargs):
     def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
         r"""
-        Enable memory efficient attention as implemented in xformers.
+        Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
+
+        When this option is enabled, you should observe lower GPU memory usage and a potential speed up during
+        inference. Speed up during training is not guaranteed.
 
-        When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference
-        time. Speed up at training time is not guaranteed.
+
-        Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention
-        is used.
+        ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes
+        precedence.
+
+
 
         Parameters:
             attention_op (`Callable`, *optional*):
@@ -1458,7 +1434,7 @@ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Call
     def disable_xformers_memory_efficient_attention(self):
         r"""
-        Disable memory efficient attention as implemented in xformers.
+        Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/).
         """
         self.set_use_memory_efficient_attention_xformers(False)
 
@@ -1486,8 +1462,8 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
         r"""
         Enable sliced attention computation.
 
-        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
-        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful to save some memory in exchange for a small speed decrease.
 
         Args:
             slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
@@ -1500,8 +1476,8 @@ def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto
     def disable_attention_slicing(self):
         r"""
-        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
-        back to computing attention in one step.
+        Disable sliced attention computation. 
If `enable_attention_slicing` was previously called, attention is + computed in one step. """ # set slice_size = `None` to disable `attention slicing` self.enable_attention_slicing(None) diff --git a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py index ecc457b4cb94..e25da12414e7 100644 --- a/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py +++ b/src/diffusers/pipelines/unidiffuser/pipeline_unidiffuser.py @@ -68,11 +68,11 @@ class ImageTextPipelineOutput(BaseOutput): Args: images (`List[PIL.Image.Image]` or `np.ndarray`) - List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, - num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width, + num_channels)`. text (`List[str]` or `List[List[str]]`) List of generated text strings of length `batch_size` or a list of list of strings whose outer list has - length `batch_size`. Text generated by the diffusion pipeline. + length `batch_size`. """ images: Optional[Union[List[PIL.Image.Image], np.ndarray]] diff --git a/src/diffusers/utils/logging.py b/src/diffusers/utils/logging.py index 3308d117e994..4ccc57cd69d5 100644 --- a/src/diffusers/utils/logging.py +++ b/src/diffusers/utils/logging.py @@ -124,22 +124,19 @@ def get_logger(name: Optional[str] = None) -> logging.Logger: def get_verbosity() -> int: """ - Return the current level for the 🤗 Diffusers' root logger as an int. + Return the current level for the 🤗 Diffusers' root logger as an `int`. Returns: - `int`: The logging level. + `int`: + Logging level integers which can be one of: - + - `50`: `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL` + - `40`: `diffusers.logging.ERROR` + - `30`: `diffusers.logging.WARNING` or `diffusers.logging.WARN` + - `20`: `diffusers.logging.INFO` + - `10`: `diffusers.logging.DEBUG` - 🤗 Diffusers has following logging levels: - - - 50: `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL` - - 40: `diffusers.logging.ERROR` - - 30: `diffusers.logging.WARNING` or `diffusers.logging.WARN` - - 20: `diffusers.logging.INFO` - - 10: `diffusers.logging.DEBUG` - - """ + """ _configure_library_root_logger() return _get_library_root_logger().getEffectiveLevel() @@ -151,7 +148,7 @@ def set_verbosity(verbosity: int) -> None: Args: verbosity (`int`): - Logging level, e.g., one of: + Logging level which can be one of: - `diffusers.logging.CRITICAL` or `diffusers.logging.FATAL` - `diffusers.logging.ERROR` @@ -185,7 +182,7 @@ def set_verbosity_error(): def disable_default_handler() -> None: - """Disable the default handler of the HuggingFace Diffusers' root logger.""" + """Disable the default handler of the 🤗 Diffusers' root logger.""" _configure_library_root_logger() @@ -194,7 +191,7 @@ def disable_default_handler() -> None: def enable_default_handler() -> None: - """Enable the default handler of the HuggingFace Diffusers' root logger.""" + """Enable the default handler of the 🤗 Diffusers' root logger.""" _configure_library_root_logger() @@ -241,9 +238,9 @@ def enable_propagation() -> None: def enable_explicit_format() -> None: """ - Enable explicit formatting for every HuggingFace Diffusers' logger. The explicit formatter is as follows: + Enable explicit formatting for every 🤗 Diffusers' logger. 
The explicit formatter is as follows: ``` - [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE + [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE ``` All handlers currently bound to the root logger are affected by this method. """ @@ -256,7 +253,7 @@ def enable_explicit_format() -> None: def reset_format() -> None: """ - Resets the formatting for HuggingFace Diffusers' loggers. + Resets the formatting for 🤗 Diffusers' loggers. All handlers currently bound to the root logger are affected by this method. """ diff --git a/src/diffusers/utils/outputs.py b/src/diffusers/utils/outputs.py index b6e8a219e129..37b11561d1e1 100644 --- a/src/diffusers/utils/outputs.py +++ b/src/diffusers/utils/outputs.py @@ -41,12 +41,12 @@ class BaseOutput(OrderedDict): """ Base class for all model outputs as dataclass. Has a `__getitem__` that allows indexing by integer or slice (like a tuple) or strings (like a dictionary) that will ignore the `None` attributes. Otherwise behaves like a regular - python dictionary. + Python dictionary. - You can't unpack a `BaseOutput` directly. Use the [`~utils.BaseOutput.to_tuple`] method to convert it to a tuple - before. + You can't unpack a [`BaseOutput`] directly. Use the [`~utils.BaseOutput.to_tuple`] method to convert it to a tuple + first. """ From 0bab447670f47c28df60fbd2f6a0f833f75a16f5 Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 21 Jun 2023 12:35:38 -0700 Subject: [PATCH 149/199] relax tol attention conversion test (#3842) --- tests/models/test_attention_processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_attention_processor.py b/tests/models/test_attention_processor.py index f9b5924ca5e0..fadee4a9e337 100644 --- a/tests/models/test_attention_processor.py +++ b/tests/models/test_attention_processor.py @@ -115,5 +115,5 @@ def test_conversion_when_using_device_map(self): output_type="np", ).images - self.assertTrue(np.allclose(pre_conversion, conversion)) - self.assertTrue(np.allclose(conversion, after_conversion)) + self.assertTrue(np.allclose(pre_conversion, conversion, atol=1e-5)) + self.assertTrue(np.allclose(conversion, after_conversion, atol=1e-5)) From 13e781f9a5693eff2b7f3d6848d9ef205b24e493 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 22 Jun 2023 15:56:55 +0530 Subject: [PATCH 150/199] fix: random module seeding (#3846) --- src/diffusers/training_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/diffusers/training_utils.py b/src/diffusers/training_utils.py index df9c7e882682..eaa9ed64554b 100644 --- a/src/diffusers/training_utils.py +++ b/src/diffusers/training_utils.py @@ -1,6 +1,6 @@ import contextlib import copy -from random import random +import random from typing import Any, Dict, Iterable, Optional, Union import numpy as np From 0c6d1bc985d2373d742d323283994f3dc2e50965 Mon Sep 17 00:00:00 2001 From: Robert Dargavel Smith Date: Thu, 22 Jun 2023 11:27:39 +0100 Subject: [PATCH 151/199] fix audio_diffusion tests (#3850) --- .../audio_diffusion/test_audio_diffusion.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/audio_diffusion/test_audio_diffusion.py b/tests/pipelines/audio_diffusion/test_audio_diffusion.py index 8c20f011cb86..c8c4b7221cc8 100644 --- a/tests/pipelines/audio_diffusion/test_audio_diffusion.py +++ b/tests/pipelines/audio_diffusion/test_audio_diffusion.py @@ -99,7 +99,10 @@ def dummy_vqvae_and_unet(self): @slow def test_audio_diffusion(self): device = "cpu" # ensure determinism for the device-dependent 
torch.Generator - mel = Mel() + mel = Mel( + x_res=self.dummy_unet.config.sample_size[1], + y_res=self.dummy_unet.config.sample_size[0], + ) scheduler = DDPMScheduler() pipe = AudioDiffusionPipeline(vqvae=None, unet=self.dummy_unet, mel=mel, scheduler=scheduler) @@ -127,6 +130,11 @@ def test_audio_diffusion(self): assert np.abs(image_slice.flatten() - expected_slice).max() == 0 assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() == 0 + mel = Mel( + x_res=self.dummy_vqvae_and_unet[0].config.sample_size[1], + y_res=self.dummy_vqvae_and_unet[0].config.sample_size[0], + ) + scheduler = DDIMScheduler() dummy_vqvae_and_unet = self.dummy_vqvae_and_unet pipe = AudioDiffusionPipeline( @@ -154,13 +162,15 @@ def test_audio_diffusion(self): pipe = AudioDiffusionPipeline( vqvae=self.dummy_vqvae_and_unet[0], unet=dummy_unet_condition, mel=mel, scheduler=scheduler ) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) np.random.seed(0) encoding = torch.rand((1, 1, 10)) output = pipe(generator=generator, encoding=encoding) image = output.images[0] image_slice = np.frombuffer(image.tobytes(), dtype="uint8")[:10] - expected_slice = np.array([120, 139, 147, 123, 124, 96, 115, 121, 126, 144]) + expected_slice = np.array([107, 103, 120, 127, 142, 122, 113, 122, 97, 111]) assert np.abs(image_slice.flatten() - expected_slice).max() == 0 From 88d269461ca9b5acfae3dedd732438266f526109 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 22 Jun 2023 13:52:48 +0200 Subject: [PATCH 152/199] Correct bad attn naming (#3797) * relax tolerance slightly * correct incorrect naming * correct namingc * correct more * Apply suggestions from code review * Fix more * Correct more * correct incorrect naming * Update src/diffusers/models/controlnet.py * Correct flax * Correct renaming * Correct blocks * Fix more * Correct more * mkae style * mkae style * mkae style * mkae style * mkae style * Fix flax * mkae style * rename * rename * rename attn head dim to attention_head_dim * correct flax * make style * improve * Correct more * make style * fix more * mkae style * Update src/diffusers/models/controlnet_flax.py * Apply suggestions from code review Co-authored-by: Pedro Cuenca --------- Co-authored-by: Pedro Cuenca --- src/diffusers/models/controlnet.py | 26 ++- src/diffusers/models/controlnet_flax.py | 22 +- src/diffusers/models/unet_2d.py | 6 +- src/diffusers/models/unet_2d_blocks.py | 212 +++++++++++------- src/diffusers/models/unet_2d_blocks_flax.py | 24 +- src/diffusers/models/unet_2d_condition.py | 35 ++- .../models/unet_2d_condition_flax.py | 26 ++- src/diffusers/models/unet_3d_blocks.py | 44 ++-- src/diffusers/models/unet_3d_condition.py | 30 ++- src/diffusers/models/vae.py | 8 +- src/diffusers/models/vae_flax.py | 10 +- .../versatile_diffusion/modeling_text_unet.py | 88 +++++--- tests/models/test_models_unet_2d.py | 2 +- 13 files changed, 339 insertions(+), 194 deletions(-) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 88562dd37161..8660c3f9a5d3 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -112,6 +112,7 @@ def __init__( norm_eps: float = 1e-5, cross_attention_dim: int = 1280, attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, use_linear_projection: bool = False, class_embed_type: Optional[str] = None, num_class_embeds: Optional[int] = None, @@ -124,6 +125,14 @@ def __init__( ): super().__init__() + # If `num_attention_heads` is 
not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + # Check inputs if len(block_out_channels) != len(down_block_types): raise ValueError( @@ -135,9 +144,9 @@ def __init__( f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) - if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): raise ValueError( - f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." ) # input @@ -198,6 +207,9 @@ def __init__( if isinstance(attention_head_dim, int): attention_head_dim = (attention_head_dim,) * len(down_block_types) + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + # down output_channel = block_out_channels[0] @@ -221,7 +233,8 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[i], + num_attention_heads=num_attention_heads[i], + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, downsample_padding=downsample_padding, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention[i], @@ -255,7 +268,7 @@ def __init__( output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift=resnet_time_scale_shift, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[-1], + num_attention_heads=num_attention_heads[-1], resnet_groups=norm_num_groups, use_linear_projection=use_linear_projection, upcast_attention=upcast_attention, @@ -292,6 +305,7 @@ def from_unet( norm_eps=unet.config.norm_eps, cross_attention_dim=unet.config.cross_attention_dim, attention_head_dim=unet.config.attention_head_dim, + num_attention_heads=unet.config.num_attention_heads, use_linear_projection=unet.config.use_linear_projection, class_embed_type=unet.config.class_embed_type, num_class_embeds=unet.config.num_class_embeds, @@ -390,8 +404,8 @@ def set_attention_slice(self, slice_size): slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. 
+ provided, uses as many slices as `num_attention_heads // slice_size`. In this case, + `num_attention_heads` must be a multiple of `slice_size`. """ sliceable_head_dims = [] diff --git a/src/diffusers/models/controlnet_flax.py b/src/diffusers/models/controlnet_flax.py index 3adefa84ea68..cff451edcdc5 100644 --- a/src/diffusers/models/controlnet_flax.py +++ b/src/diffusers/models/controlnet_flax.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple, Union +from typing import Optional, Tuple, Union import flax import flax.linen as nn @@ -129,6 +129,8 @@ class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin): The number of layers per block. attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int` or `Tuple[int]`, *optional*): + The number of attention heads. cross_attention_dim (`int`, *optional*, defaults to 768): The dimension of the cross attention features. dropout (`float`, *optional*, defaults to 0): @@ -155,6 +157,7 @@ class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin): block_out_channels: Tuple[int] = (320, 640, 1280, 1280) layers_per_block: int = 2 attention_head_dim: Union[int, Tuple[int]] = 8 + num_attention_heads: Optional[Union[int, Tuple[int]]] = None cross_attention_dim: int = 1280 dropout: float = 0.0 use_linear_projection: bool = False @@ -182,6 +185,14 @@ def setup(self): block_out_channels = self.block_out_channels time_embed_dim = block_out_channels[0] * 4 + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. 
+ num_attention_heads = self.num_attention_heads or self.attention_head_dim + # input self.conv_in = nn.Conv( block_out_channels[0], @@ -206,9 +217,8 @@ def setup(self): if isinstance(only_cross_attention, bool): only_cross_attention = (only_cross_attention,) * len(self.down_block_types) - attention_head_dim = self.attention_head_dim - if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim,) * len(self.down_block_types) + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(self.down_block_types) # down down_blocks = [] @@ -237,7 +247,7 @@ def setup(self): out_channels=output_channel, dropout=self.dropout, num_layers=self.layers_per_block, - attn_num_head_channels=attention_head_dim[i], + num_attention_heads=num_attention_heads[i], add_downsample=not is_final_block, use_linear_projection=self.use_linear_projection, only_cross_attention=only_cross_attention[i], @@ -285,7 +295,7 @@ def setup(self): self.mid_block = FlaxUNetMidBlock2DCrossAttn( in_channels=mid_block_channel, dropout=self.dropout, - attn_num_head_channels=attention_head_dim[-1], + num_attention_heads=num_attention_heads[-1], use_linear_projection=self.use_linear_projection, dtype=self.dtype, ) diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 2a6a1b9de5f2..4a752fa94a99 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -164,7 +164,7 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - attn_num_head_channels=attention_head_dim, + attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, ) @@ -178,7 +178,7 @@ def __init__( resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift=resnet_time_scale_shift, - attn_num_head_channels=attention_head_dim, + attention_head_dim=attention_head_dim if attention_head_dim is not None else block_out_channels[-1], resnet_groups=norm_num_groups, add_attention=add_attention, ) @@ -204,7 +204,7 @@ def __init__( resnet_eps=norm_eps, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - attn_num_head_channels=attention_head_dim, + attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel, resnet_time_scale_shift=resnet_time_scale_shift, ) self.up_blocks.append(up_block) diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index 674e58d7180e..eee7e6023e88 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -18,7 +18,7 @@ import torch.nn.functional as F from torch import nn -from ..utils import is_torch_version +from ..utils import is_torch_version, logging from .attention import AdaGroupNorm from .attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0 from .dual_transformer_2d import DualTransformer2DModel @@ -26,6 +26,9 @@ from .transformer_2d import Transformer2DModel +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + def get_down_block( down_block_type, num_layers, @@ -35,7 +38,7 @@ def get_down_block( add_downsample, resnet_eps, resnet_act_fn, - attn_num_head_channels, + num_attention_heads=None, resnet_groups=None, cross_attention_dim=None, downsample_padding=None, @@ -47,7 +50,15 @@ def get_down_block( resnet_skip_time_act=False, resnet_out_scale_factor=1.0, cross_attention_norm=None, + 
attention_head_dim=None, ): + # If attn head dim is not defined, we default it to the number of heads + if attention_head_dim is None: + logger.warn( + f"It is recommended to provide `attention_head_dim` when calling `get_down_block`. Defaulting `attention_head_dim` to {num_attention_heads}." + ) + attention_head_dim = num_attention_heads + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type if down_block_type == "DownBlock2D": return DownBlock2D( @@ -87,7 +98,7 @@ def get_down_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, downsample_padding=downsample_padding, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, ) elif down_block_type == "CrossAttnDownBlock2D": @@ -104,7 +115,7 @@ def get_down_block( resnet_groups=resnet_groups, downsample_padding=downsample_padding, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, + num_attention_heads=num_attention_heads, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, @@ -124,7 +135,7 @@ def get_down_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, skip_time_act=resnet_skip_time_act, output_scale_factor=resnet_out_scale_factor, @@ -152,8 +163,7 @@ def get_down_block( add_downsample=add_downsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, - downsample_padding=downsample_padding, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, ) elif down_block_type == "DownEncoderBlock2D": @@ -178,7 +188,7 @@ def get_down_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, downsample_padding=downsample_padding, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, ) elif down_block_type == "KDownBlock2D": @@ -201,7 +211,7 @@ def get_down_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, add_self_attention=True if not add_downsample else False, ) raise ValueError(f"{down_block_type} does not exist.") @@ -217,7 +227,7 @@ def get_up_block( add_upsample, resnet_eps, resnet_act_fn, - attn_num_head_channels, + num_attention_heads=None, resnet_groups=None, cross_attention_dim=None, dual_cross_attention=False, @@ -228,7 +238,15 @@ def get_up_block( resnet_skip_time_act=False, resnet_out_scale_factor=1.0, cross_attention_norm=None, + attention_head_dim=None, ): + # If attn head dim is not defined, we default it to the number of heads + if attention_head_dim is None: + logger.warn( + f"It is recommended to provide `attention_head_dim` when calling `get_up_block`. Defaulting `attention_head_dim` to {num_attention_heads}." 
+ ) + attention_head_dim = num_attention_heads + up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type if up_block_type == "UpBlock2D": return UpBlock2D( @@ -272,7 +290,7 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, + num_attention_heads=num_attention_heads, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, @@ -293,7 +311,7 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, skip_time_act=resnet_skip_time_act, output_scale_factor=resnet_out_scale_factor, @@ -311,7 +329,7 @@ def get_up_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, ) elif up_block_type == "SkipUpBlock2D": @@ -336,7 +354,7 @@ def get_up_block( add_upsample=add_upsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, ) elif up_block_type == "UpDecoderBlock2D": @@ -360,7 +378,7 @@ def get_up_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, temb_channels=temb_channels, ) @@ -384,7 +402,7 @@ def get_up_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, + attention_head_dim=attention_head_dim, ) raise ValueError(f"{up_block_type} does not exist.") @@ -403,7 +421,7 @@ def __init__( resnet_groups: int = 32, resnet_pre_norm: bool = True, add_attention: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, output_scale_factor=1.0, ): super().__init__() @@ -427,13 +445,19 @@ def __init__( ] attentions = [] + if attention_head_dim is None: + logger.warn( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {in_channels}." 
+ ) + attention_head_dim = in_channels + for _ in range(num_layers): if self.add_attention: attentions.append( Attention( in_channels, - heads=in_channels // attn_num_head_channels if attn_num_head_channels is not None else 1, - dim_head=attn_num_head_channels if attn_num_head_channels is not None else in_channels, + heads=in_channels // attention_head_dim, + dim_head=attention_head_dim, rescale_output_factor=output_scale_factor, eps=resnet_eps, norm_num_groups=resnet_groups if resnet_time_scale_shift == "default" else None, @@ -487,7 +511,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + num_attention_heads=1, output_scale_factor=1.0, cross_attention_dim=1280, dual_cross_attention=False, @@ -497,7 +521,7 @@ def __init__( super().__init__() self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.num_attention_heads = num_attention_heads resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # there is always at least one resnet @@ -521,8 +545,8 @@ def __init__( if not dual_cross_attention: attentions.append( Transformer2DModel( - attn_num_head_channels, - in_channels // attn_num_head_channels, + num_attention_heads, + in_channels // num_attention_heads, in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -534,8 +558,8 @@ def __init__( else: attentions.append( DualTransformer2DModel( - attn_num_head_channels, - in_channels // attn_num_head_channels, + num_attention_heads, + in_channels // num_attention_heads, in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -596,7 +620,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, output_scale_factor=1.0, cross_attention_dim=1280, skip_time_act=False, @@ -607,10 +631,10 @@ def __init__( self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.attention_head_dim = attention_head_dim resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) - self.num_heads = in_channels // self.attn_num_head_channels + self.num_heads = in_channels // self.attention_head_dim # there is always at least one resnet resnets = [ @@ -640,7 +664,7 @@ def __init__( query_dim=in_channels, cross_attention_dim=in_channels, heads=self.num_heads, - dim_head=attn_num_head_channels, + dim_head=self.attention_head_dim, added_kv_proj_dim=cross_attention_dim, norm_num_groups=resnet_groups, bias=True, @@ -720,7 +744,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, output_scale_factor=1.0, downsample_padding=1, add_downsample=True, @@ -729,6 +753,12 @@ def __init__( resnets = [] attentions = [] + if attention_head_dim is None: + logger.warn( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}." 
+ ) + attention_head_dim = out_channels + for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels resnets.append( @@ -748,8 +778,8 @@ def __init__( attentions.append( Attention( out_channels, - heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1, - dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, rescale_output_factor=output_scale_factor, eps=resnet_eps, norm_num_groups=resnet_groups, @@ -804,7 +834,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + num_attention_heads=1, cross_attention_dim=1280, output_scale_factor=1.0, downsample_padding=1, @@ -819,7 +849,7 @@ def __init__( attentions = [] self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.num_attention_heads = num_attention_heads for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels @@ -840,8 +870,8 @@ def __init__( if not dual_cross_attention: attentions.append( Transformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, + num_attention_heads, + out_channels // num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -854,8 +884,8 @@ def __init__( else: attentions.append( DualTransformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, + num_attention_heads, + out_channels // num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -1099,7 +1129,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, output_scale_factor=1.0, add_downsample=True, downsample_padding=1, @@ -1108,6 +1138,12 @@ def __init__( resnets = [] attentions = [] + if attention_head_dim is None: + logger.warn( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}." + ) + attention_head_dim = out_channels + for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels resnets.append( @@ -1127,8 +1163,8 @@ def __init__( attentions.append( Attention( out_channels, - heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1, - dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, rescale_output_factor=output_scale_factor, eps=resnet_eps, norm_num_groups=resnet_groups, @@ -1177,15 +1213,20 @@ def __init__( resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, output_scale_factor=np.sqrt(2.0), - downsample_padding=1, add_downsample=True, ): super().__init__() self.attentions = nn.ModuleList([]) self.resnets = nn.ModuleList([]) + if attention_head_dim is None: + logger.warn( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}." 
+ ) + attention_head_dim = out_channels + for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels self.resnets.append( @@ -1206,8 +1247,8 @@ def __init__( self.attentions.append( Attention( out_channels, - heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1, - dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, rescale_output_factor=output_scale_factor, eps=resnet_eps, norm_num_groups=32, @@ -1451,7 +1492,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, cross_attention_dim=1280, output_scale_factor=1.0, add_downsample=True, @@ -1466,8 +1507,8 @@ def __init__( resnets = [] attentions = [] - self.attn_num_head_channels = attn_num_head_channels - self.num_heads = out_channels // self.attn_num_head_channels + self.attention_head_dim = attention_head_dim + self.num_heads = out_channels // self.attention_head_dim for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels @@ -1496,7 +1537,7 @@ def __init__( query_dim=out_channels, cross_attention_dim=out_channels, heads=self.num_heads, - dim_head=attn_num_head_channels, + dim_head=attention_head_dim, added_kv_proj_dim=cross_attention_dim, norm_num_groups=resnet_groups, bias=True, @@ -1686,7 +1727,7 @@ def __init__( num_layers: int = 4, resnet_group_size: int = 32, add_downsample=True, - attn_num_head_channels: int = 64, + attention_head_dim: int = 64, add_self_attention: bool = False, resnet_eps: float = 1e-5, resnet_act_fn: str = "gelu", @@ -1719,8 +1760,8 @@ def __init__( attentions.append( KAttentionBlock( out_channels, - out_channels // attn_num_head_channels, - attn_num_head_channels, + out_channels // attention_head_dim, + attention_head_dim, cross_attention_dim=cross_attention_dim, temb_channels=temb_channels, attention_bias=True, @@ -1817,7 +1858,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, output_scale_factor=1.0, add_upsample=True, ): @@ -1825,6 +1866,12 @@ def __init__( resnets = [] attentions = [] + if attention_head_dim is None: + logger.warn( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}." 
+ ) + attention_head_dim = out_channels + for i in range(num_layers): res_skip_channels = in_channels if (i == num_layers - 1) else out_channels resnet_in_channels = prev_output_channel if i == 0 else out_channels @@ -1846,8 +1893,8 @@ def __init__( attentions.append( Attention( out_channels, - heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1, - dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, rescale_output_factor=output_scale_factor, eps=resnet_eps, norm_num_groups=resnet_groups, @@ -1897,7 +1944,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + num_attention_heads=1, cross_attention_dim=1280, output_scale_factor=1.0, add_upsample=True, @@ -1911,7 +1958,7 @@ def __init__( attentions = [] self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.num_attention_heads = num_attention_heads for i in range(num_layers): res_skip_channels = in_channels if (i == num_layers - 1) else out_channels @@ -1934,8 +1981,8 @@ def __init__( if not dual_cross_attention: attentions.append( Transformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, + num_attention_heads, + out_channels // num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -1948,8 +1995,8 @@ def __init__( else: attentions.append( DualTransformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, + num_attention_heads, + out_channels // num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -2178,7 +2225,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, output_scale_factor=1.0, add_upsample=True, temb_channels=None, @@ -2187,6 +2234,12 @@ def __init__( resnets = [] attentions = [] + if attention_head_dim is None: + logger.warn( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}." + ) + attention_head_dim = out_channels + for i in range(num_layers): input_channels = in_channels if i == 0 else out_channels @@ -2207,8 +2260,8 @@ def __init__( attentions.append( Attention( out_channels, - heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1, - dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, rescale_output_factor=output_scale_factor, eps=resnet_eps, norm_num_groups=resnet_groups if resnet_time_scale_shift != "spatial" else None, @@ -2253,9 +2306,8 @@ def __init__( resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, output_scale_factor=np.sqrt(2.0), - upsample_padding=1, add_upsample=True, ): super().__init__() @@ -2282,11 +2334,17 @@ def __init__( ) ) + if attention_head_dim is None: + logger.warn( + f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `out_channels`: {out_channels}." 
+ ) + attention_head_dim = out_channels + self.attentions.append( Attention( out_channels, - heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1, - dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, + heads=out_channels // attention_head_dim, + dim_head=attention_head_dim, rescale_output_factor=output_scale_factor, eps=resnet_eps, norm_num_groups=32, @@ -2563,7 +2621,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, cross_attention_dim=1280, output_scale_factor=1.0, add_upsample=True, @@ -2576,9 +2634,9 @@ def __init__( attentions = [] self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.attention_head_dim = attention_head_dim - self.num_heads = out_channels // self.attn_num_head_channels + self.num_heads = out_channels // self.attention_head_dim for i in range(num_layers): res_skip_channels = in_channels if (i == num_layers - 1) else out_channels @@ -2609,7 +2667,7 @@ def __init__( query_dim=out_channels, cross_attention_dim=out_channels, heads=self.num_heads, - dim_head=attn_num_head_channels, + dim_head=self.attention_head_dim, added_kv_proj_dim=cross_attention_dim, norm_num_groups=resnet_groups, bias=True, @@ -2804,7 +2862,7 @@ def __init__( resnet_eps: float = 1e-5, resnet_act_fn: str = "gelu", resnet_group_size: int = 32, - attn_num_head_channels=1, # attention dim_head + attention_head_dim=1, # attention dim_head cross_attention_dim: int = 768, add_upsample: bool = True, upcast_attention: bool = False, @@ -2818,7 +2876,7 @@ def __init__( add_self_attention = True if is_first_block else False self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.attention_head_dim = attention_head_dim # in_channels, and out_channels for the block (k-unet) k_in_channels = out_channels if is_first_block else 2 * out_channels @@ -2854,10 +2912,10 @@ def __init__( attentions.append( KAttentionBlock( k_out_channels if (i == num_layers - 1) else out_channels, - k_out_channels // attn_num_head_channels + k_out_channels // attention_head_dim if (i == num_layers - 1) - else out_channels // attn_num_head_channels, - attn_num_head_channels, + else out_channels // attention_head_dim, + attention_head_dim, cross_attention_dim=cross_attention_dim, temb_channels=temb_channels, attention_bias=True, diff --git a/src/diffusers/models/unet_2d_blocks_flax.py b/src/diffusers/models/unet_2d_blocks_flax.py index b8126c5f5930..0d1447570dda 100644 --- a/src/diffusers/models/unet_2d_blocks_flax.py +++ b/src/diffusers/models/unet_2d_blocks_flax.py @@ -33,7 +33,7 @@ class FlaxCrossAttnDownBlock2D(nn.Module): Dropout rate num_layers (:obj:`int`, *optional*, defaults to 1): Number of attention blocks layers - attn_num_head_channels (:obj:`int`, *optional*, defaults to 1): + num_attention_heads (:obj:`int`, *optional*, defaults to 1): Number of attention heads of each spatial transformer block add_downsample (:obj:`bool`, *optional*, defaults to `True`): Whether to add downsampling layer before each final output @@ -46,7 +46,7 @@ class FlaxCrossAttnDownBlock2D(nn.Module): out_channels: int dropout: float = 0.0 num_layers: int = 1 - attn_num_head_channels: int = 1 + num_attention_heads: int = 1 add_downsample: bool = True use_linear_projection: bool = False only_cross_attention: bool = False @@ -70,8 +70,8 @@ def setup(self): attn_block = FlaxTransformer2DModel( 
in_channels=self.out_channels, - n_heads=self.attn_num_head_channels, - d_head=self.out_channels // self.attn_num_head_channels, + n_heads=self.num_attention_heads, + d_head=self.out_channels // self.num_attention_heads, depth=1, use_linear_projection=self.use_linear_projection, only_cross_attention=self.only_cross_attention, @@ -172,7 +172,7 @@ class FlaxCrossAttnUpBlock2D(nn.Module): Dropout rate num_layers (:obj:`int`, *optional*, defaults to 1): Number of attention blocks layers - attn_num_head_channels (:obj:`int`, *optional*, defaults to 1): + num_attention_heads (:obj:`int`, *optional*, defaults to 1): Number of attention heads of each spatial transformer block add_upsample (:obj:`bool`, *optional*, defaults to `True`): Whether to add upsampling layer before each final output @@ -186,7 +186,7 @@ class FlaxCrossAttnUpBlock2D(nn.Module): prev_output_channel: int dropout: float = 0.0 num_layers: int = 1 - attn_num_head_channels: int = 1 + num_attention_heads: int = 1 add_upsample: bool = True use_linear_projection: bool = False only_cross_attention: bool = False @@ -211,8 +211,8 @@ def setup(self): attn_block = FlaxTransformer2DModel( in_channels=self.out_channels, - n_heads=self.attn_num_head_channels, - d_head=self.out_channels // self.attn_num_head_channels, + n_heads=self.num_attention_heads, + d_head=self.out_channels // self.num_attention_heads, depth=1, use_linear_projection=self.use_linear_projection, only_cross_attention=self.only_cross_attention, @@ -317,7 +317,7 @@ class FlaxUNetMidBlock2DCrossAttn(nn.Module): Dropout rate num_layers (:obj:`int`, *optional*, defaults to 1): Number of attention blocks layers - attn_num_head_channels (:obj:`int`, *optional*, defaults to 1): + num_attention_heads (:obj:`int`, *optional*, defaults to 1): Number of attention heads of each spatial transformer block use_memory_efficient_attention (`bool`, *optional*, defaults to `False`): enable memory efficient attention https://arxiv.org/abs/2112.05682 @@ -327,7 +327,7 @@ class FlaxUNetMidBlock2DCrossAttn(nn.Module): in_channels: int dropout: float = 0.0 num_layers: int = 1 - attn_num_head_channels: int = 1 + num_attention_heads: int = 1 use_linear_projection: bool = False use_memory_efficient_attention: bool = False dtype: jnp.dtype = jnp.float32 @@ -348,8 +348,8 @@ def setup(self): for _ in range(self.num_layers): attn_block = FlaxTransformer2DModel( in_channels=self.in_channels, - n_heads=self.attn_num_head_channels, - d_head=self.in_channels // self.attn_num_head_channels, + n_heads=self.num_attention_heads, + d_head=self.in_channels // self.num_attention_heads, depth=1, use_linear_projection=self.use_linear_projection, use_memory_efficient_attention=self.use_memory_efficient_attention, diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index dda21fd80479..7bca5c336c57 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -103,6 +103,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): + The number of attention heads. 
If not defined, defaults to `attention_head_dim` resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. class_embed_type (`str`, *optional*, defaults to None): @@ -169,6 +171,7 @@ def __init__( encoder_hid_dim: Optional[int] = None, encoder_hid_dim_type: Optional[str] = None, attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, dual_cross_attention: bool = False, use_linear_projection: bool = False, class_embed_type: Optional[str] = None, @@ -195,6 +198,14 @@ def __init__( self.sample_size = sample_size + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + # Check inputs if len(down_block_types) != len(up_block_types): raise ValueError( @@ -211,6 +222,11 @@ def __init__( f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." 
@@ -353,6 +369,9 @@ def __init__( if mid_block_only_cross_attention is None: mid_block_only_cross_attention = False + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + if isinstance(attention_head_dim, int): attention_head_dim = (attention_head_dim,) * len(down_block_types) @@ -388,7 +407,7 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, cross_attention_dim=cross_attention_dim[i], - attn_num_head_channels=attention_head_dim[i], + num_attention_heads=num_attention_heads[i], downsample_padding=downsample_padding, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, @@ -398,6 +417,7 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, ) self.down_blocks.append(down_block) @@ -411,7 +431,7 @@ def __init__( output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift=resnet_time_scale_shift, cross_attention_dim=cross_attention_dim[-1], - attn_num_head_channels=attention_head_dim[-1], + num_attention_heads=num_attention_heads[-1], resnet_groups=norm_num_groups, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, @@ -425,7 +445,7 @@ def __init__( resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, cross_attention_dim=cross_attention_dim[-1], - attn_num_head_channels=attention_head_dim[-1], + attention_head_dim=attention_head_dim[-1], resnet_groups=norm_num_groups, resnet_time_scale_shift=resnet_time_scale_shift, skip_time_act=resnet_skip_time_act, @@ -442,7 +462,7 @@ def __init__( # up reversed_block_out_channels = list(reversed(block_out_channels)) - reversed_attention_head_dim = list(reversed(attention_head_dim)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) reversed_layers_per_block = list(reversed(layers_per_block)) reversed_cross_attention_dim = list(reversed(cross_attention_dim)) only_cross_attention = list(reversed(only_cross_attention)) @@ -474,7 +494,7 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, cross_attention_dim=reversed_cross_attention_dim[i], - attn_num_head_channels=reversed_attention_head_dim[i], + num_attention_heads=reversed_num_attention_heads[i], dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention[i], @@ -483,6 +503,7 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, ) self.up_blocks.append(up_block) prev_output_channel = output_channel @@ -575,8 +596,8 @@ def set_attention_slice(self, slice_size): slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. + provided, uses as many slices as `num_attention_heads // slice_size`. In this case, + `num_attention_heads` must be a multiple of `slice_size`. 
""" sliceable_head_dims = [] diff --git a/src/diffusers/models/unet_2d_condition_flax.py b/src/diffusers/models/unet_2d_condition_flax.py index 3c2f4a88ab7f..73f7e9263ff2 100644 --- a/src/diffusers/models/unet_2d_condition_flax.py +++ b/src/diffusers/models/unet_2d_condition_flax.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Tuple, Union +from typing import Optional, Tuple, Union import flax import flax.linen as nn @@ -81,6 +81,8 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin): The number of layers per block. attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int` or `Tuple[int]`, *optional*): + The number of attention heads. cross_attention_dim (`int`, *optional*, defaults to 768): The dimension of the cross attention features. dropout (`float`, *optional*, defaults to 0): @@ -107,6 +109,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin): block_out_channels: Tuple[int] = (320, 640, 1280, 1280) layers_per_block: int = 2 attention_head_dim: Union[int, Tuple[int]] = 8 + num_attention_heads: Optional[Union[int, Tuple[int]]] = None cross_attention_dim: int = 1280 dropout: float = 0.0 use_linear_projection: bool = False @@ -131,6 +134,14 @@ def setup(self): block_out_channels = self.block_out_channels time_embed_dim = block_out_channels[0] * 4 + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. 
+ num_attention_heads = self.num_attention_heads or self.attention_head_dim + # input self.conv_in = nn.Conv( block_out_channels[0], @@ -150,9 +161,8 @@ def setup(self): if isinstance(only_cross_attention, bool): only_cross_attention = (only_cross_attention,) * len(self.down_block_types) - attention_head_dim = self.attention_head_dim - if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim,) * len(self.down_block_types) + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(self.down_block_types) # down down_blocks = [] @@ -168,7 +178,7 @@ def setup(self): out_channels=output_channel, dropout=self.dropout, num_layers=self.layers_per_block, - attn_num_head_channels=attention_head_dim[i], + num_attention_heads=num_attention_heads[i], add_downsample=not is_final_block, use_linear_projection=self.use_linear_projection, only_cross_attention=only_cross_attention[i], @@ -192,7 +202,7 @@ def setup(self): self.mid_block = FlaxUNetMidBlock2DCrossAttn( in_channels=block_out_channels[-1], dropout=self.dropout, - attn_num_head_channels=attention_head_dim[-1], + num_attention_heads=num_attention_heads[-1], use_linear_projection=self.use_linear_projection, use_memory_efficient_attention=self.use_memory_efficient_attention, dtype=self.dtype, @@ -201,7 +211,7 @@ def setup(self): # up up_blocks = [] reversed_block_out_channels = list(reversed(block_out_channels)) - reversed_attention_head_dim = list(reversed(attention_head_dim)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) only_cross_attention = list(reversed(only_cross_attention)) output_channel = reversed_block_out_channels[0] for i, up_block_type in enumerate(self.up_block_types): @@ -217,7 +227,7 @@ def setup(self): out_channels=output_channel, prev_output_channel=prev_output_channel, num_layers=self.layers_per_block + 1, - attn_num_head_channels=reversed_attention_head_dim[i], + num_attention_heads=reversed_num_attention_heads[i], add_upsample=not is_final_block, dropout=self.dropout, use_linear_projection=self.use_linear_projection, diff --git a/src/diffusers/models/unet_3d_blocks.py b/src/diffusers/models/unet_3d_blocks.py index 2c86171610bf..73bfa401932f 100644 --- a/src/diffusers/models/unet_3d_blocks.py +++ b/src/diffusers/models/unet_3d_blocks.py @@ -29,7 +29,7 @@ def get_down_block( add_downsample, resnet_eps, resnet_act_fn, - attn_num_head_channels, + num_attention_heads, resnet_groups=None, cross_attention_dim=None, downsample_padding=None, @@ -66,7 +66,7 @@ def get_down_block( resnet_groups=resnet_groups, downsample_padding=downsample_padding, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, + num_attention_heads=num_attention_heads, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, @@ -86,7 +86,7 @@ def get_up_block( add_upsample, resnet_eps, resnet_act_fn, - attn_num_head_channels, + num_attention_heads, resnet_groups=None, cross_attention_dim=None, dual_cross_attention=False, @@ -122,7 +122,7 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, + num_attention_heads=num_attention_heads, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, @@ -144,7 +144,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, 
resnet_pre_norm: bool = True, - attn_num_head_channels=1, + num_attention_heads=1, output_scale_factor=1.0, cross_attention_dim=1280, dual_cross_attention=False, @@ -154,7 +154,7 @@ def __init__( super().__init__() self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.num_attention_heads = num_attention_heads resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # there is always at least one resnet @@ -185,8 +185,8 @@ def __init__( for _ in range(num_layers): attentions.append( Transformer2DModel( - in_channels // attn_num_head_channels, - attn_num_head_channels, + in_channels // num_attention_heads, + num_attention_heads, in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -197,8 +197,8 @@ def __init__( ) temp_attentions.append( TransformerTemporalModel( - in_channels // attn_num_head_channels, - attn_num_head_channels, + in_channels // num_attention_heads, + num_attention_heads, in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -273,7 +273,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + num_attention_heads=1, cross_attention_dim=1280, output_scale_factor=1.0, downsample_padding=1, @@ -290,7 +290,7 @@ def __init__( temp_convs = [] self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.num_attention_heads = num_attention_heads for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels @@ -317,8 +317,8 @@ def __init__( ) attentions.append( Transformer2DModel( - out_channels // attn_num_head_channels, - attn_num_head_channels, + out_channels // num_attention_heads, + num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -330,8 +330,8 @@ def __init__( ) temp_attentions.append( TransformerTemporalModel( - out_channels // attn_num_head_channels, - attn_num_head_channels, + out_channels // num_attention_heads, + num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -486,7 +486,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + num_attention_heads=1, cross_attention_dim=1280, output_scale_factor=1.0, add_upsample=True, @@ -502,7 +502,7 @@ def __init__( temp_attentions = [] self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.num_attention_heads = num_attention_heads for i in range(num_layers): res_skip_channels = in_channels if (i == num_layers - 1) else out_channels @@ -531,8 +531,8 @@ def __init__( ) attentions.append( Transformer2DModel( - out_channels // attn_num_head_channels, - attn_num_head_channels, + out_channels // num_attention_heads, + num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -544,8 +544,8 @@ def __init__( ) temp_attentions.append( TransformerTemporalModel( - out_channels // attn_num_head_channels, - attn_num_head_channels, + out_channels // num_attention_heads, + num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py index 6fb5dfa30ebf..aa6aa542b158 100644 --- a/src/diffusers/models/unet_3d_condition.py +++ b/src/diffusers/models/unet_3d_condition.py @@ -79,6 +79,7 @@ 
class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): The number of attention heads. """ _supports_gradient_checkpointing = False @@ -105,11 +106,20 @@ def __init__( norm_eps: float = 1e-5, cross_attention_dim: int = 1024, attention_head_dim: Union[int, Tuple[int]] = 64, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, ): super().__init__() self.sample_size = sample_size + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + # Check inputs if len(down_block_types) != len(up_block_types): raise ValueError( @@ -121,9 +131,9 @@ def __init__( f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." ) - if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): raise ValueError( - f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}." + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." 
) # input @@ -156,8 +166,8 @@ def __init__( self.down_blocks = nn.ModuleList([]) self.up_blocks = nn.ModuleList([]) - if isinstance(attention_head_dim, int): - attention_head_dim = (attention_head_dim,) * len(down_block_types) + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) # down output_channel = block_out_channels[0] @@ -177,7 +187,7 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[i], + num_attention_heads=num_attention_heads[i], downsample_padding=downsample_padding, dual_cross_attention=False, ) @@ -191,7 +201,7 @@ def __init__( resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attention_head_dim[-1], + num_attention_heads=num_attention_heads[-1], resnet_groups=norm_num_groups, dual_cross_attention=False, ) @@ -201,7 +211,7 @@ def __init__( # up reversed_block_out_channels = list(reversed(block_out_channels)) - reversed_attention_head_dim = list(reversed(attention_head_dim)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) output_channel = reversed_block_out_channels[0] for i, up_block_type in enumerate(up_block_types): @@ -230,7 +240,7 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=reversed_attention_head_dim[i], + num_attention_heads=reversed_num_attention_heads[i], dual_cross_attention=False, ) self.up_blocks.append(up_block) @@ -288,8 +298,8 @@ def set_attention_slice(self, slice_size): slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. + provided, uses as many slices as `num_attention_heads // slice_size`. In this case, + `num_attention_heads` must be a multiple of `slice_size`. 
""" sliceable_head_dims = [] diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index dd4af0efcfd9..b54e3964f183 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -79,7 +79,7 @@ def __init__( downsample_padding=0, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - attn_num_head_channels=None, + attention_head_dim=output_channel, temb_channels=None, ) self.down_blocks.append(down_block) @@ -91,7 +91,7 @@ def __init__( resnet_act_fn=act_fn, output_scale_factor=1, resnet_time_scale_shift="default", - attn_num_head_channels=None, + attention_head_dim=block_out_channels[-1], resnet_groups=norm_num_groups, temb_channels=None, ) @@ -184,7 +184,7 @@ def __init__( resnet_act_fn=act_fn, output_scale_factor=1, resnet_time_scale_shift="default" if norm_type == "group" else norm_type, - attn_num_head_channels=None, + attention_head_dim=block_out_channels[-1], resnet_groups=norm_num_groups, temb_channels=temb_channels, ) @@ -208,7 +208,7 @@ def __init__( resnet_eps=1e-6, resnet_act_fn=act_fn, resnet_groups=norm_num_groups, - attn_num_head_channels=None, + attention_head_dim=output_channel, temb_channels=temb_channels, resnet_time_scale_shift=norm_type, ) diff --git a/src/diffusers/models/vae_flax.py b/src/diffusers/models/vae_flax.py index 994e3bb06adc..9812954db76d 100644 --- a/src/diffusers/models/vae_flax.py +++ b/src/diffusers/models/vae_flax.py @@ -396,7 +396,7 @@ class FlaxUNetMidBlock2D(nn.Module): Number of Resnet layer block resnet_groups (:obj:`int`, *optional*, defaults to `32`): The number of groups to use for the Resnet and Attention block group norm - attn_num_head_channels (:obj:`int`, *optional*, defaults to `1`): + num_attention_heads (:obj:`int`, *optional*, defaults to `1`): Number of attention heads for each attention block dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): Parameters `dtype` @@ -405,7 +405,7 @@ class FlaxUNetMidBlock2D(nn.Module): dropout: float = 0.0 num_layers: int = 1 resnet_groups: int = 32 - attn_num_head_channels: int = 1 + num_attention_heads: int = 1 dtype: jnp.dtype = jnp.float32 def setup(self): @@ -427,7 +427,7 @@ def setup(self): for _ in range(self.num_layers): attn_block = FlaxAttentionBlock( channels=self.in_channels, - num_head_channels=self.attn_num_head_channels, + num_head_channels=self.num_attention_heads, num_groups=resnet_groups, dtype=self.dtype, ) @@ -532,7 +532,7 @@ def setup(self): self.mid_block = FlaxUNetMidBlock2D( in_channels=block_out_channels[-1], resnet_groups=self.norm_num_groups, - attn_num_head_channels=None, + num_attention_heads=None, dtype=self.dtype, ) @@ -625,7 +625,7 @@ def setup(self): self.mid_block = FlaxUNetMidBlock2D( in_channels=block_out_channels[-1], resnet_groups=self.norm_num_groups, - attn_num_head_channels=None, + num_attention_heads=None, dtype=self.dtype, ) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index f11729451299..0dd2351e6076 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -41,7 +41,7 @@ def get_down_block( add_downsample, resnet_eps, resnet_act_fn, - attn_num_head_channels, + num_attention_heads, resnet_groups=None, cross_attention_dim=None, downsample_padding=None, @@ -82,7 +82,7 @@ def get_down_block( resnet_groups=resnet_groups, downsample_padding=downsample_padding, cross_attention_dim=cross_attention_dim, - 
attn_num_head_channels=attn_num_head_channels, + num_attention_heads=num_attention_heads, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, @@ -101,7 +101,7 @@ def get_up_block( add_upsample, resnet_eps, resnet_act_fn, - attn_num_head_channels, + num_attention_heads, resnet_groups=None, cross_attention_dim=None, dual_cross_attention=False, @@ -141,7 +141,7 @@ def get_up_block( resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, cross_attention_dim=cross_attention_dim, - attn_num_head_channels=attn_num_head_channels, + num_attention_heads=num_attention_heads, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention, @@ -196,6 +196,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. + num_attention_heads (`int`, *optional*): + The number of attention heads. If not defined, defaults to `attention_head_dim` resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config for resnet blocks, see [`~models.resnet.ResnetBlockFlat`]. Choose from `default` or `scale_shift`. class_embed_type (`str`, *optional*, defaults to None): @@ -267,6 +269,7 @@ def __init__( encoder_hid_dim: Optional[int] = None, encoder_hid_dim_type: Optional[str] = None, attention_head_dim: Union[int, Tuple[int]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int]]] = None, dual_cross_attention: bool = False, use_linear_projection: bool = False, class_embed_type: Optional[str] = None, @@ -293,6 +296,14 @@ def __init__( self.sample_size = sample_size + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + # Check inputs if len(down_block_types) != len(up_block_types): raise ValueError( @@ -312,6 +323,12 @@ def __init__( f" `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." ) + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + "Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`:" + f" {num_attention_heads}. `down_block_types`: {down_block_types}." + ) + if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types): raise ValueError( "Must provide the same number of `attention_head_dim` as `down_block_types`. 
`attention_head_dim`:" @@ -457,6 +474,9 @@ def __init__( if mid_block_only_cross_attention is None: mid_block_only_cross_attention = False + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + if isinstance(attention_head_dim, int): attention_head_dim = (attention_head_dim,) * len(down_block_types) @@ -492,7 +512,7 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, cross_attention_dim=cross_attention_dim[i], - attn_num_head_channels=attention_head_dim[i], + num_attention_heads=num_attention_heads[i], downsample_padding=downsample_padding, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, @@ -502,6 +522,7 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, ) self.down_blocks.append(down_block) @@ -515,7 +536,7 @@ def __init__( output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift=resnet_time_scale_shift, cross_attention_dim=cross_attention_dim[-1], - attn_num_head_channels=attention_head_dim[-1], + num_attention_heads=num_attention_heads[-1], resnet_groups=norm_num_groups, dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, @@ -529,7 +550,7 @@ def __init__( resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, cross_attention_dim=cross_attention_dim[-1], - attn_num_head_channels=attention_head_dim[-1], + attention_head_dim=attention_head_dim[-1], resnet_groups=norm_num_groups, resnet_time_scale_shift=resnet_time_scale_shift, skip_time_act=resnet_skip_time_act, @@ -546,7 +567,7 @@ def __init__( # up reversed_block_out_channels = list(reversed(block_out_channels)) - reversed_attention_head_dim = list(reversed(attention_head_dim)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) reversed_layers_per_block = list(reversed(layers_per_block)) reversed_cross_attention_dim = list(reversed(cross_attention_dim)) only_cross_attention = list(reversed(only_cross_attention)) @@ -578,7 +599,7 @@ def __init__( resnet_act_fn=act_fn, resnet_groups=norm_num_groups, cross_attention_dim=reversed_cross_attention_dim[i], - attn_num_head_channels=reversed_attention_head_dim[i], + num_attention_heads=reversed_num_attention_heads[i], dual_cross_attention=dual_cross_attention, use_linear_projection=use_linear_projection, only_cross_attention=only_cross_attention[i], @@ -587,6 +608,7 @@ def __init__( resnet_skip_time_act=resnet_skip_time_act, resnet_out_scale_factor=resnet_out_scale_factor, cross_attention_norm=cross_attention_norm, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, ) self.up_blocks.append(up_block) prev_output_channel = output_channel @@ -679,8 +701,8 @@ def set_attention_slice(self, slice_size): slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` - must be a multiple of `slice_size`. + provided, uses as many slices as `num_attention_heads // slice_size`. In this case, + `num_attention_heads` must be a multiple of `slice_size`. 
""" sliceable_head_dims = [] @@ -1192,7 +1214,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + num_attention_heads=1, cross_attention_dim=1280, output_scale_factor=1.0, downsample_padding=1, @@ -1207,7 +1229,7 @@ def __init__( attentions = [] self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.num_attention_heads = num_attention_heads for i in range(num_layers): in_channels = in_channels if i == 0 else out_channels @@ -1228,8 +1250,8 @@ def __init__( if not dual_cross_attention: attentions.append( Transformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, + num_attention_heads, + out_channels // num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -1242,8 +1264,8 @@ def __init__( else: attentions.append( DualTransformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, + num_attention_heads, + out_channels // num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -1426,7 +1448,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + num_attention_heads=1, cross_attention_dim=1280, output_scale_factor=1.0, add_upsample=True, @@ -1440,7 +1462,7 @@ def __init__( attentions = [] self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.num_attention_heads = num_attention_heads for i in range(num_layers): res_skip_channels = in_channels if (i == num_layers - 1) else out_channels @@ -1463,8 +1485,8 @@ def __init__( if not dual_cross_attention: attentions.append( Transformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, + num_attention_heads, + out_channels // num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -1477,8 +1499,8 @@ def __init__( else: attentions.append( DualTransformer2DModel( - attn_num_head_channels, - out_channels // attn_num_head_channels, + num_attention_heads, + out_channels // num_attention_heads, in_channels=out_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -1572,7 +1594,7 @@ def __init__( resnet_act_fn: str = "swish", resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + num_attention_heads=1, output_scale_factor=1.0, cross_attention_dim=1280, dual_cross_attention=False, @@ -1582,7 +1604,7 @@ def __init__( super().__init__() self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.num_attention_heads = num_attention_heads resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) # there is always at least one resnet @@ -1606,8 +1628,8 @@ def __init__( if not dual_cross_attention: attentions.append( Transformer2DModel( - attn_num_head_channels, - in_channels // attn_num_head_channels, + num_attention_heads, + in_channels // num_attention_heads, in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -1619,8 +1641,8 @@ def __init__( else: attentions.append( DualTransformer2DModel( - attn_num_head_channels, - in_channels // attn_num_head_channels, + num_attention_heads, + in_channels // num_attention_heads, in_channels=in_channels, num_layers=1, cross_attention_dim=cross_attention_dim, @@ -1682,7 +1704,7 @@ def __init__( resnet_act_fn: str = "swish", 
resnet_groups: int = 32, resnet_pre_norm: bool = True, - attn_num_head_channels=1, + attention_head_dim=1, output_scale_factor=1.0, cross_attention_dim=1280, skip_time_act=False, @@ -1693,10 +1715,10 @@ def __init__( self.has_cross_attention = True - self.attn_num_head_channels = attn_num_head_channels + self.attention_head_dim = attention_head_dim resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) - self.num_heads = in_channels // self.attn_num_head_channels + self.num_heads = in_channels // self.attention_head_dim # there is always at least one resnet resnets = [ @@ -1726,7 +1748,7 @@ def __init__( query_dim=in_channels, cross_attention_dim=in_channels, heads=self.num_heads, - dim_head=attn_num_head_channels, + dim_head=self.attention_head_dim, added_kv_proj_dim=cross_attention_dim, norm_num_groups=resnet_groups, bias=True, diff --git a/tests/models/test_models_unet_2d.py b/tests/models/test_models_unet_2d.py index 4857afb85257..bb5335ca3088 100644 --- a/tests/models/test_models_unet_2d.py +++ b/tests/models/test_models_unet_2d.py @@ -59,7 +59,7 @@ def prepare_init_args_and_inputs_for_common(self): "block_out_channels": (32, 64), "down_block_types": ("DownBlock2D", "AttnDownBlock2D"), "up_block_types": ("AttnUpBlock2D", "UpBlock2D"), - "attention_head_dim": None, + "attention_head_dim": 3, "out_channels": 3, "in_channels": 3, "layers_per_block": 2, From 5df2acf7d299346e2cb5ff921cb499ca774c6213 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 22 Jun 2023 13:52:59 +0200 Subject: [PATCH 153/199] [Conversion] Small fixes (#3848) * [Conversion] Small fixes * Update src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py --- scripts/convert_vae_pt_to_diffusers.py | 12 ++++++++++-- .../pipelines/stable_diffusion/convert_from_ckpt.py | 5 +++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/scripts/convert_vae_pt_to_diffusers.py b/scripts/convert_vae_pt_to_diffusers.py index 4762ffcf8d00..a8ba48bc001c 100644 --- a/scripts/convert_vae_pt_to_diffusers.py +++ b/scripts/convert_vae_pt_to_diffusers.py @@ -129,11 +129,19 @@ def vae_pt_to_vae_diffuser( original_config = OmegaConf.load(io_obj) image_size = 512 device = "cuda" if torch.cuda.is_available() else "cpu" - checkpoint = torch.load(checkpoint_path, map_location=device) + if checkpoint_path.endswith("safetensors"): + from safetensors import safe_open + + checkpoint = {} + with safe_open(checkpoint_path, framework="pt", device="cpu") as f: + for key in f.keys(): + checkpoint[key] = f.get_tensor(key) + else: + checkpoint = torch.load(checkpoint_path, map_location=device)["state_dict"] # Convert the VAE model. 
vae_config = create_vae_diffusers_config(original_config, image_size=image_size) - converted_vae_checkpoint = custom_convert_ldm_vae_checkpoint(checkpoint["state_dict"], vae_config) + converted_vae_checkpoint = custom_convert_ldm_vae_checkpoint(checkpoint, vae_config) vae = AutoencoderKL(**vae_config) vae.load_state_dict(converted_vae_checkpoint) diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 746844ea1e0a..3b3724f0d010 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -286,10 +286,11 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa "use_linear_projection": use_linear_projection, "class_embed_type": class_embed_type, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, - "conditioning_channels": unet_params.hint_channels, } - if not controlnet: + if controlnet: + config["conditioning_channels"] = unet_params.hint_channels + else: config["out_channels"] = unet_params.out_channels config["up_block_types"] = tuple(up_block_types) From 5e3f8fff40604ed2332e9f07b3796b15b43b91bb Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 22 Jun 2023 13:53:27 +0200 Subject: [PATCH 154/199] Fix some audio tests (#3841) * Fix some audio tests * make style * fix * make style --- tests/pipelines/audioldm/test_audioldm.py | 10 ++++++++-- tests/pipelines/test_pipelines_common.py | 7 +++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/pipelines/audioldm/test_audioldm.py b/tests/pipelines/audioldm/test_audioldm.py index 0825fc36a266..1c0e1e2d9a90 100644 --- a/tests/pipelines/audioldm/test_audioldm.py +++ b/tests/pipelines/audioldm/test_audioldm.py @@ -36,7 +36,7 @@ PNDMScheduler, UNet2DConditionModel, ) -from diffusers.utils import slow, torch_device +from diffusers.utils import is_xformers_available, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS, TEXT_TO_AUDIO_PARAMS @@ -361,9 +361,15 @@ def test_attention_slicing_forward_pass(self): def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(test_mean_pixel_difference=False) + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False) + @slow -# @require_torch_gpu class AudioLDMPipelineSlowTests(unittest.TestCase): def tearDown(self): super().tearDown() diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index fac04bdbe30f..008a8a2e6367 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -640,7 +640,9 @@ def test_cpu_offload_forward_pass(self, expected_max_diff=1e-4): def test_xformers_attention_forwardGenerator_pass(self): self._test_xformers_attention_forwardGenerator_pass() - def _test_xformers_attention_forwardGenerator_pass(self, test_max_difference=True, expected_max_diff=1e-4): + def _test_xformers_attention_forwardGenerator_pass( + self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-4 + ): if not self.test_xformers_attention: return @@ -660,7 +662,8 @@ def 
_test_xformers_attention_forwardGenerator_pass(self, test_max_difference=Tru max_diff = np.abs(output_with_offload - output_without_offload).max() self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results") - assert_mean_pixel_difference(output_with_offload[0], output_without_offload[0]) + if test_mean_pixel_difference: + assert_mean_pixel_difference(output_with_offload[0], output_without_offload[0]) def test_progress_bar(self): components = self.get_dummy_components() From fc6acb6b97e93d58cb22b5fee52d884d77ce84d8 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 22 Jun 2023 17:54:35 +0530 Subject: [PATCH 155/199] [Docs] add: contributor note in the paradigms docs. (#3852) add: contributor note in the paradigms docs. --- docs/source/en/api/pipelines/paradigms.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/api/pipelines/paradigms.mdx b/docs/source/en/api/pipelines/paradigms.mdx index ec399e6b96d4..938751c4874e 100644 --- a/docs/source/en/api/pipelines/paradigms.mdx +++ b/docs/source/en/api/pipelines/paradigms.mdx @@ -31,6 +31,8 @@ Resources: |---|---|:---:| | [StableDiffusionParadigmsPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py) | *Faster Text-to-Image Generation* | | +This pipeline was contributed by [`AndyShih12`](https://github.com/AndyShih12) in this [PR](https://github.com/huggingface/diffusers/pull/3716/). + ## Usage example ```python From 61916fefc400d1b77171490b6ff7d53e509ed9de Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Sat, 24 Jun 2023 21:17:11 +0200 Subject: [PATCH 156/199] Update Habana Gaudi doc (#3863) * Update Habana Gaudi doc * Fix typo --- docs/source/en/optimization/habana.mdx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/en/optimization/habana.mdx b/docs/source/en/optimization/habana.mdx index 7092c89352db..24846615c95c 100644 --- a/docs/source/en/optimization/habana.mdx +++ b/docs/source/en/optimization/habana.mdx @@ -16,8 +16,8 @@ specific language governing permissions and limitations under the License. ## Requirements -- Optimum Habana 1.5 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it. -- SynapseAI 1.9. +- Optimum Habana 1.6 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it. +- SynapseAI 1.10. 
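To confirm the first requirement is satisfied in the current environment, a minimal sketch (assuming the library is published under the PyPI distribution name `optimum-habana`; the exact name is an assumption, not stated above):

```python
# Quick check that Optimum Habana >= 1.6 is installed.
# The distribution name "optimum-habana" is an assumption about how the package is published.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("optimum-habana"))
assert installed >= Version("1.6.0"), f"found optimum-habana {installed}, but >= 1.6.0 is required"
```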
## Inference Pipeline @@ -41,7 +41,7 @@ pipeline = GaudiStableDiffusionPipeline.from_pretrained( scheduler=scheduler, use_habana=True, use_hpu_graphs=True, - gaudi_config="Habana/stable-diffusion", + gaudi_config="Habana/stable-diffusion-2", ) ``` @@ -62,18 +62,18 @@ For more information, check out Optimum Habana's [documentation](https://hugging ## Benchmark -Here are the latencies for Habana first-generation Gaudi and Gaudi2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) Gaudi configuration (mixed precision bf16/fp32): +Here are the latencies for Habana first-generation Gaudi and Gaudi2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) and [Habana/stable-diffusion-2](https://huggingface.co/Habana/stable-diffusion-2) Gaudi configurations (mixed precision bf16/fp32): - [Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5) (512x512 resolution): | | Latency (batch size = 1) | Throughput (batch size = 8) | | ---------------------- |:------------------------:|:---------------------------:| -| first-generation Gaudi | 4.22s | 0.29 images/s | -| Gaudi2 | 1.70s | 0.925 images/s | +| first-generation Gaudi | 3.80s | 0.308 images/s | +| Gaudi2 | 1.33s | 1.081 images/s | - [Stable Diffusion v2.1](https://huggingface.co/stabilityai/stable-diffusion-2-1) (768x768 resolution): | | Latency (batch size = 1) | Throughput | | ---------------------- |:------------------------:|:-------------------------------:| -| first-generation Gaudi | 23.3s | 0.045 images/s (batch size = 2) | -| Gaudi2 | 7.75s | 0.14 images/s (batch size = 5) | +| first-generation Gaudi | 10.2s | 0.108 images/s (batch size = 4) | +| Gaudi2 | 3.17s | 0.379 images/s (batch size = 8) | From 9a45d7fb768d0ee55303cc888667f3b2d9f4c273 Mon Sep 17 00:00:00 2001 From: Joachim Blaafjell Holwech Date: Tue, 27 Jun 2023 01:04:11 +0200 Subject: [PATCH 157/199] Add guidance start/stop (#3770) * Add guidance start/stop * Add guidance start/stop to inpaint class * Black formatting * Add support for guidance for multicontrolnet * Add inclusive end * Improve design * correct imports * Finish * Finish all * Correct more * make style --------- Co-authored-by: Patrick von Platen --- .../controlnet/pipeline_controlnet.py | 61 +++++++++++++++++- .../controlnet/pipeline_controlnet_img2img.py | 60 +++++++++++++++++- .../controlnet/pipeline_controlnet_inpaint.py | 62 ++++++++++++++++++- tests/pipelines/controlnet/test_controlnet.py | 43 +++++++++++++ .../controlnet/test_controlnet_img2img.py | 43 +++++++++++++ .../controlnet/test_controlnet_inpaint.py | 43 +++++++++++++ 6 files changed, 305 insertions(+), 7 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index 974d28fd5b05..dddfc3591b66 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -491,6 +491,8 @@ def check_inputs( prompt_embeds=None, negative_prompt_embeds=None, controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, ): if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) @@ -593,6 +595,27 @@ def check_inputs( else: assert False + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has 
{len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) image_is_tensor = isinstance(image, torch.Tensor) @@ -709,6 +732,8 @@ def __call__( cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 1.0, guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, ): r""" Function invoked when calling the pipeline for generation. @@ -784,6 +809,10 @@ def __call__( guess_mode (`bool`, *optional*, defaults to `False`): In this mode, the ControlNet encoder will try best to recognize the content of the input image even if you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the controlnet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the controlnet stops applying. Examples: @@ -794,6 +823,18 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [ + control_guidance_end + ] # 1. Check inputs. Raise error if not correct self.check_inputs( @@ -804,6 +845,8 @@ def __call__( prompt_embeds, negative_prompt_embeds, controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, ) # 2. Define call parameters @@ -820,8 +863,6 @@ def __call__( # corresponds to doing no classifier free guidance. 
do_classifier_free_guidance = guidance_scale > 1.0 - controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet - if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) @@ -904,6 +945,15 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(num_inference_steps): + keeps = [ + 1.0 - float(i / num_inference_steps < s or (i + 1) / num_inference_steps > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if len(keeps) == 1 else keeps) + # 8. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -922,12 +972,17 @@ def __call__( control_model_input = latent_model_input controlnet_prompt_embeds = prompt_embeds + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + cond_scale = controlnet_conditioning_scale * controlnet_keep[i] + down_block_res_samples, mid_block_res_sample = self.controlnet( control_model_input, t, encoder_hidden_states=controlnet_prompt_embeds, controlnet_cond=image, - conditioning_scale=controlnet_conditioning_scale, + conditioning_scale=cond_scale, guess_mode=guess_mode, return_dict=False, ) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index febe4c8a5734..c7a0db96e8c0 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -517,6 +517,8 @@ def check_inputs( prompt_embeds=None, negative_prompt_embeds=None, controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, ): if (callback_steps is None) or ( callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) @@ -619,6 +621,27 @@ def check_inputs( else: assert False + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." 
+ ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -796,6 +819,8 @@ def __call__( cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.8, guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, ): r""" Function invoked when calling the pipeline for generation. @@ -876,6 +901,10 @@ def __call__( guess_mode (`bool`, *optional*, defaults to `False`): In this mode, the ControlNet encoder will try best to recognize the content of the input image even if you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the controlnet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the controlnet stops applying. Examples: @@ -886,6 +915,19 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [ + control_guidance_end + ] + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, @@ -895,6 +937,8 @@ def __call__( prompt_embeds, negative_prompt_embeds, controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, ) # 2. Define call parameters @@ -994,6 +1038,15 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(num_inference_steps): + keeps = [ + 1.0 - float(i / num_inference_steps < s or (i + 1) / num_inference_steps > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if len(keeps) == 1 else keeps) + # 8. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -1012,12 +1065,17 @@ def __call__( control_model_input = latent_model_input controlnet_prompt_embeds = prompt_embeds + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + cond_scale = controlnet_conditioning_scale * controlnet_keep[i] + down_block_res_samples, mid_block_res_sample = self.controlnet( control_model_input, t, encoder_hidden_states=controlnet_prompt_embeds, controlnet_cond=control_image, - conditioning_scale=controlnet_conditioning_scale, + conditioning_scale=cond_scale, guess_mode=guess_mode, return_dict=False, ) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index cab689d1d4e2..bfaaaae49401 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -646,6 +646,8 @@ def check_inputs( prompt_embeds=None, negative_prompt_embeds=None, controlnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, ): if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") @@ -751,6 +753,27 @@ def check_inputs( else: assert False + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + if isinstance(self.controlnet, MultiControlNetModel): + if len(control_guidance_start) != len(self.controlnet.nets): + raise ValueError( + f"`control_guidance_start`: {control_guidance_start} has {len(control_guidance_start)} elements but there are {len(self.controlnet.nets)} controlnets available. Make sure to provide {len(self.controlnet.nets)}." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + # Copied from diffusers.pipelines.controlnet.pipeline_controlnet.StableDiffusionControlNetPipeline.check_image def check_image(self, image, prompt, prompt_embeds): image_is_pil = isinstance(image, PIL.Image.Image) @@ -990,6 +1013,8 @@ def __call__( cross_attention_kwargs: Optional[Dict[str, Any]] = None, controlnet_conditioning_scale: Union[float, List[float]] = 0.5, guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, ): r""" Function invoked when calling the pipeline for generation. @@ -1073,6 +1098,10 @@ def __call__( guess_mode (`bool`, *optional*, defaults to `False`): In this mode, the ControlNet encoder will try best to recognize the content of the input image even if you remove all prompts. The `guidance_scale` between 3.0 and 5.0 is recommended. 
+ control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the controlnet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the controlnet stops applying. Examples: @@ -1083,9 +1112,22 @@ def __call__( list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" (nsfw) content, according to the `safety_checker`. """ + controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet + # 0. Default height and width to unet height, width = self._default_height_width(height, width, image) + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + mult = len(controlnet.nets) if isinstance(controlnet, MultiControlNetModel) else 1 + control_guidance_start, control_guidance_end = mult * [control_guidance_start], mult * [ + control_guidance_end + ] + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, @@ -1097,6 +1139,8 @@ def __call__( prompt_embeds, negative_prompt_embeds, controlnet_conditioning_scale, + control_guidance_start, + control_guidance_end, ) # 2. Define call parameters @@ -1113,8 +1157,6 @@ def __call__( # corresponds to doing no classifier free guidance. do_classifier_free_guidance = guidance_scale > 1.0 - controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet - if isinstance(controlnet, MultiControlNetModel) and isinstance(controlnet_conditioning_scale, float): controlnet_conditioning_scale = [controlnet_conditioning_scale] * len(controlnet.nets) @@ -1231,6 +1273,15 @@ def __call__( # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 7.1 Create tensor stating which controlnets to keep + controlnet_keep = [] + for i in range(num_inference_steps): + keeps = [ + 1.0 - float(i / num_inference_steps < s or (i + 1) / num_inference_steps > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + controlnet_keep.append(keeps[0] if len(keeps) == 1 else keeps) + # 8. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: @@ -1249,12 +1300,17 @@ def __call__( control_model_input = latent_model_input controlnet_prompt_embeds = prompt_embeds + if isinstance(controlnet_keep[i], list): + cond_scale = [c * s for c, s in zip(controlnet_conditioning_scale, controlnet_keep[i])] + else: + cond_scale = controlnet_conditioning_scale * controlnet_keep[i] + down_block_res_samples, mid_block_res_sample = self.controlnet( control_model_input, t, encoder_hidden_states=controlnet_prompt_embeds, controlnet_cond=control_image, - conditioning_scale=controlnet_conditioning_scale, + conditioning_scale=cond_scale, guess_mode=guess_mode, return_dict=False, ) diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 37d0f722fa70..906e1e7ee66f 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -226,6 +226,12 @@ def get_dummy_components(self): cross_attention_dim=32, ) torch.manual_seed(0) + + def init_weights(m): + if isinstance(m, torch.nn.Conv2d): + torch.nn.init.normal(m.weight) + m.bias.data.fill_(1.0) + controlnet1 = ControlNetModel( block_out_channels=(32, 64), layers_per_block=2, @@ -234,6 +240,8 @@ def get_dummy_components(self): cross_attention_dim=32, conditioning_embedding_out_channels=(16, 32), ) + controlnet1.controlnet_down_blocks.apply(init_weights) + torch.manual_seed(0) controlnet2 = ControlNetModel( block_out_channels=(32, 64), @@ -243,6 +251,8 @@ def get_dummy_components(self): cross_attention_dim=32, conditioning_embedding_out_channels=(16, 32), ) + controlnet2.controlnet_down_blocks.apply(init_weights) + torch.manual_seed(0) scheduler = DDIMScheduler( beta_start=0.00085, @@ -321,6 +331,39 @@ def get_dummy_inputs(self, device, seed=0): return inputs + def test_control_guidance_switch(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + + scale = 10.0 + steps = 4 + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_1 = pipe(**inputs)[0] + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_2 = pipe(**inputs, control_guidance_start=0.1, control_guidance_end=0.2)[0] + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_3 = pipe(**inputs, control_guidance_start=[0.1, 0.3], control_guidance_end=[0.2, 0.7])[0] + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_4 = pipe(**inputs, control_guidance_start=0.4, control_guidance_end=[0.5, 0.8])[0] + + # make sure that all outputs are different + assert np.sum(np.abs(output_1 - output_2)) > 1e-3 + assert np.sum(np.abs(output_1 - output_3)) > 1e-3 + assert np.sum(np.abs(output_1 - output_4)) > 1e-3 + def test_attention_slicing_forward_pass(self): return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index 18262149bb49..bc0e96b2f92b 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ 
b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -180,6 +180,12 @@ def get_dummy_components(self): cross_attention_dim=32, ) torch.manual_seed(0) + + def init_weights(m): + if isinstance(m, torch.nn.Conv2d): + torch.nn.init.normal(m.weight) + m.bias.data.fill_(1.0) + controlnet1 = ControlNetModel( block_out_channels=(32, 64), layers_per_block=2, @@ -188,6 +194,8 @@ def get_dummy_components(self): cross_attention_dim=32, conditioning_embedding_out_channels=(16, 32), ) + controlnet1.controlnet_down_blocks.apply(init_weights) + torch.manual_seed(0) controlnet2 = ControlNetModel( block_out_channels=(32, 64), @@ -197,6 +205,8 @@ def get_dummy_components(self): cross_attention_dim=32, conditioning_embedding_out_channels=(16, 32), ) + controlnet2.controlnet_down_blocks.apply(init_weights) + torch.manual_seed(0) scheduler = DDIMScheduler( beta_start=0.00085, @@ -279,6 +289,39 @@ def get_dummy_inputs(self, device, seed=0): return inputs + def test_control_guidance_switch(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + + scale = 10.0 + steps = 4 + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_1 = pipe(**inputs)[0] + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_2 = pipe(**inputs, control_guidance_start=0.1, control_guidance_end=0.2)[0] + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_3 = pipe(**inputs, control_guidance_start=[0.1, 0.3], control_guidance_end=[0.2, 0.7])[0] + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_4 = pipe(**inputs, control_guidance_start=0.4, control_guidance_end=[0.5, 0.8])[0] + + # make sure that all outputs are different + assert np.sum(np.abs(output_1 - output_2)) > 1e-3 + assert np.sum(np.abs(output_1 - output_3)) > 1e-3 + assert np.sum(np.abs(output_1 - output_4)) > 1e-3 + def test_attention_slicing_forward_pass(self): return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index b351ccfbf8f9..81647d968b6b 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -255,6 +255,12 @@ def get_dummy_components(self): cross_attention_dim=32, ) torch.manual_seed(0) + + def init_weights(m): + if isinstance(m, torch.nn.Conv2d): + torch.nn.init.normal(m.weight) + m.bias.data.fill_(1.0) + controlnet1 = ControlNetModel( block_out_channels=(32, 64), layers_per_block=2, @@ -263,6 +269,8 @@ def get_dummy_components(self): cross_attention_dim=32, conditioning_embedding_out_channels=(16, 32), ) + controlnet1.controlnet_down_blocks.apply(init_weights) + torch.manual_seed(0) controlnet2 = ControlNetModel( block_out_channels=(32, 64), @@ -272,6 +280,8 @@ def get_dummy_components(self): cross_attention_dim=32, conditioning_embedding_out_channels=(16, 32), ) + controlnet2.controlnet_down_blocks.apply(init_weights) + torch.manual_seed(0) scheduler = DDIMScheduler( beta_start=0.00085, @@ -357,6 +367,39 @@ def get_dummy_inputs(self, device, seed=0): return inputs + def test_control_guidance_switch(self): + components 
= self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe.to(torch_device) + + scale = 10.0 + steps = 4 + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_1 = pipe(**inputs)[0] + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_2 = pipe(**inputs, control_guidance_start=0.1, control_guidance_end=0.2)[0] + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_3 = pipe(**inputs, control_guidance_start=[0.1, 0.3], control_guidance_end=[0.2, 0.7])[0] + + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = steps + inputs["controlnet_conditioning_scale"] = scale + output_4 = pipe(**inputs, control_guidance_start=0.4, control_guidance_end=[0.5, 0.8])[0] + + # make sure that all outputs are different + assert np.sum(np.abs(output_1 - output_2)) > 1e-3 + assert np.sum(np.abs(output_1 - output_3)) > 1e-3 + assert np.sum(np.abs(output_1 - output_4)) > 1e-3 + def test_attention_slicing_forward_pass(self): return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3) From 0bf6aeb885e624b17233a29bed8dbbe62c56d48e Mon Sep 17 00:00:00 2001 From: Saurav Maheshkar Date: Wed, 28 Jun 2023 17:01:32 +0530 Subject: [PATCH 158/199] feat: rename single-letter vars in `resnet.py` (#3868) feat: rename single-letter vars --- src/diffusers/models/resnet.py | 52 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index 52f01552c528..24c3b07e7cb6 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -95,9 +95,9 @@ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name= assert self.channels == self.out_channels self.conv = nn.AvgPool1d(kernel_size=stride, stride=stride) - def forward(self, x): - assert x.shape[1] == self.channels - return self.conv(x) + def forward(self, inputs): + assert inputs.shape[1] == self.channels + return self.conv(inputs) class Upsample2D(nn.Module): @@ -431,13 +431,13 @@ def __init__(self, pad_mode="reflect"): self.pad = kernel_1d.shape[1] // 2 - 1 self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False) - def forward(self, x): - x = F.pad(x, (self.pad,) * 4, self.pad_mode) - weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], self.kernel.shape[1]]) - indices = torch.arange(x.shape[1], device=x.device) - kernel = self.kernel.to(weight)[None, :].expand(x.shape[1], -1, -1) + def forward(self, inputs): + inputs = F.pad(inputs, (self.pad,) * 4, self.pad_mode) + weight = inputs.new_zeros([inputs.shape[1], inputs.shape[1], self.kernel.shape[0], self.kernel.shape[1]]) + indices = torch.arange(inputs.shape[1], device=inputs.device) + kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1) weight[indices, indices] = kernel - return F.conv2d(x, weight, stride=2) + return F.conv2d(inputs, weight, stride=2) class KUpsample2D(nn.Module): @@ -448,13 +448,13 @@ def __init__(self, pad_mode="reflect"): self.pad = kernel_1d.shape[1] // 2 - 1 self.register_buffer("kernel", kernel_1d.T @ kernel_1d, persistent=False) - def forward(self, x): - x = F.pad(x, ((self.pad + 1) // 2,) * 4, self.pad_mode) - weight = x.new_zeros([x.shape[1], x.shape[1], self.kernel.shape[0], 
self.kernel.shape[1]]) - indices = torch.arange(x.shape[1], device=x.device) - kernel = self.kernel.to(weight)[None, :].expand(x.shape[1], -1, -1) + def forward(self, inputs): + inputs = F.pad(inputs, ((self.pad + 1) // 2,) * 4, self.pad_mode) + weight = inputs.new_zeros([inputs.shape[1], inputs.shape[1], self.kernel.shape[0], self.kernel.shape[1]]) + indices = torch.arange(inputs.shape[1], device=inputs.device) + kernel = self.kernel.to(weight)[None, :].expand(inputs.shape[1], -1, -1) weight[indices, indices] = kernel - return F.conv_transpose2d(x, weight, stride=2, padding=self.pad * 2 + 1) + return F.conv_transpose2d(inputs, weight, stride=2, padding=self.pad * 2 + 1) class ResnetBlock2D(nn.Module): @@ -664,13 +664,13 @@ def __init__(self, inp_channels, out_channels, kernel_size, n_groups=8): self.group_norm = nn.GroupNorm(n_groups, out_channels) self.mish = nn.Mish() - def forward(self, x): - x = self.conv1d(x) - x = rearrange_dims(x) - x = self.group_norm(x) - x = rearrange_dims(x) - x = self.mish(x) - return x + def forward(self, inputs): + intermediate_repr = self.conv1d(inputs) + intermediate_repr = rearrange_dims(intermediate_repr) + intermediate_repr = self.group_norm(intermediate_repr) + intermediate_repr = rearrange_dims(intermediate_repr) + output = self.mish(intermediate_repr) + return output # unet_rl.py @@ -687,10 +687,10 @@ def __init__(self, inp_channels, out_channels, embed_dim, kernel_size=5): nn.Conv1d(inp_channels, out_channels, 1) if inp_channels != out_channels else nn.Identity() ) - def forward(self, x, t): + def forward(self, inputs, t): """ Args: - x : [ batch_size x inp_channels x horizon ] + inputs : [ batch_size x inp_channels x horizon ] t : [ batch_size x embed_dim ] returns: @@ -698,9 +698,9 @@ def forward(self, x, t): """ t = self.time_emb_act(t) t = self.time_emb(t) - out = self.conv_in(x) + rearrange_dims(t) + out = self.conv_in(inputs) + rearrange_dims(t) out = self.conv_out(out) - return out + self.residual_conv(x) + return out + self.residual_conv(inputs) def upsample_2d(hidden_states, kernel=None, factor=2, gain=1): From 35bac5edecdd599dde5f472ecd9244e50096fe83 Mon Sep 17 00:00:00 2001 From: Vincent Neemie <92559302+VincentNeemie@users.noreply.github.com> Date: Wed, 28 Jun 2023 09:36:33 -0300 Subject: [PATCH 159/199] Fixing the global_step key not found (#3844) * Fixing the global_step key not found * Apply suggestions from code review --------- Co-authored-by: Patrick von Platen From 219636f7e4a3bc0ffb5ec74f35ea837787a7a3e8 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 28 Jun 2023 13:29:36 +0000 Subject: [PATCH 160/199] improve tolerance --- tests/models/test_models_unet_3d_condition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py index 2d3edfffd39c..3f29d0a41e18 100644 --- a/tests/models/test_models_unet_3d_condition.py +++ b/tests/models/test_models_unet_3d_condition.py @@ -296,7 +296,7 @@ def test_lora_save_load_safetensors(self): with torch.no_grad(): new_sample = new_model(**inputs_dict, cross_attention_kwargs={"scale": 0.5}).sample - assert (sample - new_sample).abs().max() < 3e-4 + assert (sample - new_sample).abs().max() < 3e-3 # LoRA and no LoRA should NOT be the same assert (sample - old_sample).abs().max() > 1e-4 From 150013060eb25c5091888cccaf206ac6ae23d407 Mon Sep 17 00:00:00 2001 From: Wadim Korablin Date: Wed, 28 Jun 2023 15:29:48 +0200 Subject: [PATCH 161/199] Support for manual CLIP loading in 
StableDiffusionPipeline - txt2img. (#3832) * Support for manual CLIP loading in StableDiffusionPipeline - txt2img. * Update src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py * Update variables & according docs to match previous style. * Updated to match style & quality of 'diffusers' --------- Co-authored-by: Patrick von Platen --- src/diffusers/loaders.py | 15 +++++++++++ .../stable_diffusion/convert_from_ckpt.py | 25 ++++++++++++++++--- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index ab1ac7d8bbdf..1bdd33fa80cb 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -1339,6 +1339,17 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): "ddim"]`. load_safety_checker (`bool`, *optional*, defaults to `True`): Whether to load the safety checker or not. + text_encoder (`CLIPTextModel`, *optional*, defaults to `None`): + An instance of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel) to use, + specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) + variant. If this parameter is `None`, the function will load a new instance of [CLIP] by itself, if + needed. + tokenizer (`CLIPTokenizer`, *optional*, defaults to `None`): + An instance of + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by + itself, if needed. kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to overwrite load and saveable variables (for example the pipeline components of the specific pipeline class). The overwritten components are directly passed to the pipelines `__init__` @@ -1383,6 +1394,8 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): upcast_attention = kwargs.pop("upcast_attention", None) load_safety_checker = kwargs.pop("load_safety_checker", True) prediction_type = kwargs.pop("prediction_type", None) + text_encoder = kwargs.pop("text_encoder", None) + tokenizer = kwargs.pop("tokenizer", None) torch_dtype = kwargs.pop("torch_dtype", None) @@ -1463,6 +1476,8 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): upcast_attention=upcast_attention, load_safety_checker=load_safety_checker, prediction_type=prediction_type, + text_encoder=text_encoder, + tokenizer=tokenizer, ) if torch_dtype is not None: diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index 3b3724f0d010..ba62f8d7f79e 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -734,8 +734,12 @@ def _copy_layers(hf_layers, pt_layers): return hf_model -def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False): - text_model = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) +def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None): + text_model = ( + CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only) + if text_encoder is None + else text_encoder + ) keys = list(checkpoint.keys()) @@ -1025,6 +1029,8 @@ def download_from_original_stable_diffusion_ckpt( load_safety_checker: bool = True, pipeline_class: DiffusionPipeline = None, local_files_only=False, + 
text_encoder=None, + tokenizer=None, ) -> DiffusionPipeline: """ Load a Stable Diffusion pipeline object from a CompVis-style `.ckpt`/`.safetensors` file and (ideally) a `.yaml` @@ -1072,6 +1078,15 @@ def download_from_original_stable_diffusion_ckpt( The pipeline class to use. Pass `None` to determine automatically. local_files_only (`bool`, *optional*, defaults to `False`): Whether or not to only look at local files (i.e., do not try to download the model). + text_encoder (`CLIPTextModel`, *optional*, defaults to `None`): + An instance of [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel) + to use, specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) + variant. If this parameter is `None`, the function will load a new instance of [CLIP] by itself, if needed. + tokenizer (`CLIPTokenizer`, *optional*, defaults to `None`): + An instance of + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + to use. If this parameter is `None`, the function will load a new instance of [CLIPTokenizer] by itself, if + needed. return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file. """ @@ -1327,8 +1342,10 @@ def download_from_original_stable_diffusion_ckpt( feature_extractor=feature_extractor, ) elif model_type == "FrozenCLIPEmbedder": - text_model = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only) - tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + text_model = convert_ldm_clip_checkpoint( + checkpoint, local_files_only=local_files_only, text_encoder=text_encoder + ) + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") if tokenizer is None else tokenizer if load_safety_checker: safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") From c7469ebe74a90bb8b704f3d5c1bdfe2d9240dabe Mon Sep 17 00:00:00 2001 From: Uranus Date: Wed, 28 Jun 2023 21:44:29 +0800 Subject: [PATCH 162/199] fix sde add noise typo (#3839) * fix sde typo * fix code style --- src/diffusers/schedulers/scheduling_sde_ve.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index b92db0f048f0..339edfbb02eb 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -276,7 +276,11 @@ def add_noise( # Make sure sigmas and timesteps have the same device and dtype as original_samples timesteps = timesteps.to(original_samples.device) sigmas = self.discrete_sigmas.to(original_samples.device)[timesteps] - noise = torch.randn_like(original_samples) * sigmas[:, None, None, None] + noise = ( + noise * sigmas[:, None, None, None] + if noise is not None + else torch.randn_like(original_samples) * sigmas[:, None, None, None] + ) noisy_samples = noise + original_samples return noisy_samples From 49949f321d9b034440b52e54937fd2df3027bf0a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 28 Jun 2023 22:05:25 +0530 Subject: [PATCH 163/199] [Tests] add test for checking soft dependencies. (#3847) * add test for checking soft dependencies. * address patrick's comments. * dependency tests should not run twice. * debugging. * up. 
--- .github/workflows/pr_dependency_test.yml | 32 +++++++++++++++++++ .github/workflows/pr_tests.yml | 2 +- setup.py | 3 ++ src/diffusers/dependency_versions_table.py | 3 ++ tests/others/test_dependencies.py | 37 ++++++++++++++++++++++ 5 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/pr_dependency_test.yml create mode 100644 tests/others/test_dependencies.py diff --git a/.github/workflows/pr_dependency_test.yml b/.github/workflows/pr_dependency_test.yml new file mode 100644 index 000000000000..baa83e20bf30 --- /dev/null +++ b/.github/workflows/pr_dependency_test.yml @@ -0,0 +1,32 @@ +name: Run dependency tests + +on: + pull_request: + branches: + - main + push: + branches: + - main + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + check_dependencies: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.7" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e . + pip install pytest + - name: Check for soft dependencies + run: | + pytest tests/others/test_dependencies.py + \ No newline at end of file diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 162b1ba83d66..88c424ee49d3 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -81,7 +81,7 @@ jobs: if: ${{ matrix.config.framework == 'pytorch_models' }} run: | python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \ - -s -v -k "not Flax and not Onnx" \ + -s -v -k "not Flax and not Onnx and not Dependency" \ --make-reports=tests_${{ matrix.config.report }} \ tests/models tests/schedulers tests/others diff --git a/setup.py b/setup.py index 9dab0b903f24..d6b083c22821 100644 --- a/setup.py +++ b/setup.py @@ -94,6 +94,8 @@ "jaxlib>=0.1.65", "Jinja2", "k-diffusion>=0.0.12", + "torchsde", + "note_seq", "librosa", "numpy", "omegaconf", @@ -106,6 +108,7 @@ "safetensors", "sentencepiece>=0.1.91,!=0.1.92", "scipy", + "onnx", "regex!=2019.12.17", "requests", "tensorboard", diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 19a843470ee1..423d6c5347cd 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -18,6 +18,8 @@ "jaxlib": "jaxlib>=0.1.65", "Jinja2": "Jinja2", "k-diffusion": "k-diffusion>=0.0.12", + "torchsde": "torchsde", + "note_seq": "note_seq", "librosa": "librosa", "numpy": "numpy", "omegaconf": "omegaconf", @@ -30,6 +32,7 @@ "safetensors": "safetensors", "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", "scipy": "scipy", + "onnx": "onnx", "regex": "regex!=2019.12.17", "requests": "requests", "tensorboard": "tensorboard", diff --git a/tests/others/test_dependencies.py b/tests/others/test_dependencies.py new file mode 100644 index 000000000000..9bee7a0db3ed --- /dev/null +++ b/tests/others/test_dependencies.py @@ -0,0 +1,37 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import unittest + + +class DependencyTester(unittest.TestCase): + def test_diffusers_import(self): + try: + import diffusers # noqa: F401 + except ImportError: + assert False + + def test_backend_registration(self): + import diffusers + from diffusers.dependency_versions_table import deps + + all_classes = inspect.getmembers(diffusers, inspect.isclass) + + for cls_name, cls_module in all_classes: + if "dummy_" in cls_module.__module__: + for backend in cls_module._backends: + if backend == "k_diffusion": + backend = "k-diffusion" + assert backend in deps, f"{backend} is not in the deps table!" From cdf2ae8a8426d198a108242dc933c39763c8ccc3 Mon Sep 17 00:00:00 2001 From: takuoko Date: Thu, 29 Jun 2023 20:39:59 +0900 Subject: [PATCH 164/199] [Enhance] Add LoRA rank args in train_text_to_image_lora (#3866) * add rank args in lora finetune * del network_alpha --- examples/text_to_image/train_text_to_image_lora.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 7c2601d8e9b5..29259e408eff 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -343,6 +343,12 @@ def parse_args(): "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." 
) parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.") + parser.add_argument( + "--rank", + type=int, + default=4, + help=("The dimension of the LoRA update matrices."), + ) args = parser.parse_args() env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) @@ -464,7 +470,11 @@ def main(): block_id = int(name[len("down_blocks.")]) hidden_size = unet.config.block_out_channels[block_id] - lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=hidden_size, cross_attention_dim=cross_attention_dim) + lora_attn_procs[name] = LoRAAttnProcessor( + hidden_size=hidden_size, + cross_attention_dim=cross_attention_dim, + rank=args.rank, + ) unet.set_attn_processor(lora_attn_procs) From 174dcd697faf88370f1e7b2eeabb059dd8f1b2f4 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 29 Jun 2023 17:24:39 -0700 Subject: [PATCH 165/199] [docs] Model API (#3562) * add modelmixin and unets * remove old model page * minor fixes * fix unet2dcondition * add vqmodel and autoencoderkl * add rest of models * fix autoencoderkl path * fix toctree * fix toctree again * apply feedback * apply feedback * fix copies * fix controlnet copy * fix copies --- docs/source/en/_toctree.yml | 26 ++- docs/source/en/api/models.mdx | 107 ---------- docs/source/en/api/models/autoencoderkl.mdx | 31 +++ docs/source/en/api/models/controlnet.mdx | 23 +++ docs/source/en/api/models/overview.mdx | 12 ++ .../en/api/models/prior_transformer.mdx | 16 ++ docs/source/en/api/models/transformer2d.mdx | 29 +++ .../en/api/models/transformer_temporal.mdx | 11 + docs/source/en/api/models/unet.mdx | 13 ++ docs/source/en/api/models/unet2d-cond.mdx | 19 ++ docs/source/en/api/models/unet2d.mdx | 13 ++ docs/source/en/api/models/unet3d-cond.mdx | 13 ++ docs/source/en/api/models/vq.mdx | 15 ++ src/diffusers/models/autoencoder_kl.py | 80 +++++--- src/diffusers/models/controlnet.py | 125 ++++++++++-- src/diffusers/models/controlnet_flax.py | 41 ++-- src/diffusers/models/modeling_flax_utils.py | 120 ++++++----- src/diffusers/models/modeling_utils.py | 193 +++++++++--------- src/diffusers/models/prior_transformer.py | 43 ++-- src/diffusers/models/transformer_2d.py | 77 ++++--- src/diffusers/models/transformer_temporal.py | 45 ++-- src/diffusers/models/unet_1d.py | 56 ++--- src/diffusers/models/unet_2d.py | 59 +++--- src/diffusers/models/unet_2d_condition.py | 115 ++++++----- .../models/unet_2d_condition_flax.py | 31 ++- src/diffusers/models/unet_3d_condition.py | 61 +++--- src/diffusers/models/vae.py | 2 +- src/diffusers/models/vae_flax.py | 61 +++--- src/diffusers/models/vq_model.py | 33 +-- .../versatile_diffusion/modeling_text_unet.py | 111 +++++----- 30 files changed, 928 insertions(+), 653 deletions(-) delete mode 100644 docs/source/en/api/models.mdx create mode 100644 docs/source/en/api/models/autoencoderkl.mdx create mode 100644 docs/source/en/api/models/controlnet.mdx create mode 100644 docs/source/en/api/models/overview.mdx create mode 100644 docs/source/en/api/models/prior_transformer.mdx create mode 100644 docs/source/en/api/models/transformer2d.mdx create mode 100644 docs/source/en/api/models/transformer_temporal.mdx create mode 100644 docs/source/en/api/models/unet.mdx create mode 100644 docs/source/en/api/models/unet2d-cond.mdx create mode 100644 docs/source/en/api/models/unet2d.mdx create mode 100644 docs/source/en/api/models/unet3d-cond.mdx create mode 100644 docs/source/en/api/models/vq.mdx diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml 
index e904067b31e4..72808df049c9 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -132,8 +132,6 @@ title: Conceptual Guides - sections: - sections: - - local: api/models - title: Models - local: api/attnprocessor title: Attention Processor - local: api/diffusion_pipeline @@ -151,6 +149,30 @@ - local: api/image_processor title: VAE Image Processor title: Main Classes + - sections: + - local: api/models/overview + title: Overview + - local: api/models/unet + title: UNet1DModel + - local: api/models/unet2d + title: UNet2DModel + - local: api/models/unet2d-cond + title: UNet2DConditionModel + - local: api/models/unet3d-cond + title: UNet3DConditionModel + - local: api/models/vq + title: VQModel + - local: api/models/autoencoderkl + title: AutoencoderKL + - local: api/models/transformer2d + title: Transformer2D + - local: api/models/transformer_temporal + title: Transformer Temporal + - local: api/models/prior_transformer + title: Prior Transformer + - local: api/models/controlnet + title: ControlNet + title: Models - sections: - local: api/pipelines/overview title: Overview diff --git a/docs/source/en/api/models.mdx b/docs/source/en/api/models.mdx deleted file mode 100644 index 74291f9173ea..000000000000 --- a/docs/source/en/api/models.mdx +++ /dev/null @@ -1,107 +0,0 @@ - - -# Models - -Diffusers contains pretrained models for popular algorithms and modules for creating the next set of diffusion models. -The primary function of these models is to denoise an input sample, by modeling the distribution \\(p_{\theta}(x_{t-1}|x_{t})\\). -The models are built on the base class ['ModelMixin'] that is a `torch.nn.module` with basic functionality for saving and loading models both locally and from the HuggingFace hub. - -## ModelMixin -[[autodoc]] ModelMixin - -## UNet2DOutput -[[autodoc]] models.unet_2d.UNet2DOutput - -## UNet2DModel -[[autodoc]] UNet2DModel - -## UNet1DOutput -[[autodoc]] models.unet_1d.UNet1DOutput - -## UNet1DModel -[[autodoc]] UNet1DModel - -## UNet2DConditionOutput -[[autodoc]] models.unet_2d_condition.UNet2DConditionOutput - -## UNet2DConditionModel -[[autodoc]] UNet2DConditionModel - -## UNet3DConditionOutput -[[autodoc]] models.unet_3d_condition.UNet3DConditionOutput - -## UNet3DConditionModel -[[autodoc]] UNet3DConditionModel - -## DecoderOutput -[[autodoc]] models.vae.DecoderOutput - -## VQEncoderOutput -[[autodoc]] models.vq_model.VQEncoderOutput - -## VQModel -[[autodoc]] VQModel - -## AutoencoderKLOutput -[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput - -## AutoencoderKL -[[autodoc]] AutoencoderKL - -## Transformer2DModel -[[autodoc]] Transformer2DModel - -## Transformer2DModelOutput -[[autodoc]] models.transformer_2d.Transformer2DModelOutput - -## TransformerTemporalModel -[[autodoc]] models.transformer_temporal.TransformerTemporalModel - -## Transformer2DModelOutput -[[autodoc]] models.transformer_temporal.TransformerTemporalModelOutput - -## PriorTransformer -[[autodoc]] models.prior_transformer.PriorTransformer - -## PriorTransformerOutput -[[autodoc]] models.prior_transformer.PriorTransformerOutput - -## ControlNetOutput -[[autodoc]] models.controlnet.ControlNetOutput - -## ControlNetModel -[[autodoc]] ControlNetModel - -## FlaxModelMixin -[[autodoc]] FlaxModelMixin - -## FlaxUNet2DConditionOutput -[[autodoc]] models.unet_2d_condition_flax.FlaxUNet2DConditionOutput - -## FlaxUNet2DConditionModel -[[autodoc]] FlaxUNet2DConditionModel - -## FlaxDecoderOutput -[[autodoc]] models.vae_flax.FlaxDecoderOutput - -## 
FlaxAutoencoderKLOutput -[[autodoc]] models.vae_flax.FlaxAutoencoderKLOutput - -## FlaxAutoencoderKL -[[autodoc]] FlaxAutoencoderKL - -## FlaxControlNetOutput -[[autodoc]] models.controlnet_flax.FlaxControlNetOutput - -## FlaxControlNetModel -[[autodoc]] FlaxControlNetModel diff --git a/docs/source/en/api/models/autoencoderkl.mdx b/docs/source/en/api/models/autoencoderkl.mdx new file mode 100644 index 000000000000..542fc27cd582 --- /dev/null +++ b/docs/source/en/api/models/autoencoderkl.mdx @@ -0,0 +1,31 @@ +# AutoencoderKL + +The variational autoencoder (VAE) model with KL loss was introduced in [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114v11) by Diederik P. Kingma and Max Welling. The model is used in 🤗 Diffusers to encode images into latents and to decode latent representations into images. + +The abstract from the paper is: + +*How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions are two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results.* + +## AutoencoderKL + +[[autodoc]] AutoencoderKL + +## AutoencoderKLOutput + +[[autodoc]] models.autoencoder_kl.AutoencoderKLOutput + +## DecoderOutput + +[[autodoc]] models.vae.DecoderOutput + +## FlaxAutoencoderKL + +[[autodoc]] FlaxAutoencoderKL + +## FlaxAutoencoderKLOutput + +[[autodoc]] models.vae_flax.FlaxAutoencoderKLOutput + +## FlaxDecoderOutput + +[[autodoc]] models.vae_flax.FlaxDecoderOutput \ No newline at end of file diff --git a/docs/source/en/api/models/controlnet.mdx b/docs/source/en/api/models/controlnet.mdx new file mode 100644 index 000000000000..ae2d06edbbd4 --- /dev/null +++ b/docs/source/en/api/models/controlnet.mdx @@ -0,0 +1,23 @@ +# ControlNet + +The ControlNet model was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala. It provides a greater degree of control over text-to-image generation by conditioning the model on additional inputs such as edge maps, depth maps, segmentation maps, and keypoints for pose detection. + +The abstract from the paper is: + +*We present a neural network structure, ControlNet, to control pretrained large diffusion models to support additional input conditions. The ControlNet learns task-specific conditions in an end-to-end way, and the learning is robust even when the training dataset is small (< 50k). Moreover, training a ControlNet is as fast as fine-tuning a diffusion model, and the model can be trained on a personal devices. Alternatively, if powerful computation clusters are available, the model can scale to large amounts (millions to billions) of data. 
We report that large diffusion models like Stable Diffusion can be augmented with ControlNets to enable conditional inputs like edge maps, segmentation maps, keypoints, etc. This may enrich the methods to control large diffusion models and further facilitate related applications.* + +## ControlNetModel + +[[autodoc]] ControlNetModel + +## ControlNetOutput + +[[autodoc]] models.controlnet.ControlNetOutput + +## FlaxControlNetModel + +[[autodoc]] FlaxControlNetModel + +## FlaxControlNetOutput + +[[autodoc]] models.controlnet_flax.FlaxControlNetOutput \ No newline at end of file diff --git a/docs/source/en/api/models/overview.mdx new file mode 100644 index 000000000000..cc94861fba27 --- /dev/null +++ b/docs/source/en/api/models/overview.mdx @@ -0,0 +1,12 @@ +# Models + +🤗 Diffusers provides pretrained models for popular algorithms and modules to create custom diffusion systems. The primary function of models is to denoise an input sample as modeled by the distribution \\(p_{\theta}(x_{t-1}|x_{t})\\). + +All models are built from the base [`ModelMixin`] class which is a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html) providing basic functionality for saving and loading models, locally and from the Hugging Face Hub. + +## ModelMixin +[[autodoc]] ModelMixin + +## FlaxModelMixin + +[[autodoc]] FlaxModelMixin \ No newline at end of file diff --git a/docs/source/en/api/models/prior_transformer.mdx new file mode 100644 index 000000000000..1d2b799ed323 --- /dev/null +++ b/docs/source/en/api/models/prior_transformer.mdx @@ -0,0 +1,16 @@ +# Prior Transformer + +The Prior Transformer was originally introduced in [Hierarchical Text-Conditional Image Generation with CLIP Latents +](https://huggingface.co/papers/2204.06125) by Ramesh et al. It is used to predict CLIP image embeddings from CLIP text embeddings; image embeddings are predicted through a denoising diffusion process. + +The abstract from the paper is: + +*Contrastive models like CLIP have been shown to learn robust representations of images that capture both semantics and style. To leverage these representations for image generation, we propose a two-stage model: a prior that generates a CLIP image embedding given a text caption, and a decoder that generates an image conditioned on the image embedding. We show that explicitly generating image representations improves image diversity with minimal loss in photorealism and caption similarity. Our decoders conditioned on image representations can also produce variations of an image that preserve both its semantics and style, while varying the non-essential details absent from the image representation. Moreover, the joint embedding space of CLIP enables language-guided image manipulations in a zero-shot fashion.
 We use diffusion models for the decoder and experiment with both autoregressive and diffusion models for the prior, finding that the latter are computationally more efficient and produce higher-quality samples.* + +## PriorTransformer + +[[autodoc]] PriorTransformer + +## PriorTransformerOutput + +[[autodoc]] models.prior_transformer.PriorTransformerOutput \ No newline at end of file diff --git a/docs/source/en/api/models/transformer2d.mdx new file mode 100644 index 000000000000..4ad2b00b6f23 --- /dev/null +++ b/docs/source/en/api/models/transformer2d.mdx @@ -0,0 +1,29 @@ +# Transformer2D + +A Transformer model for image-like data from [CompVis](https://huggingface.co/CompVis) that is based on the [Vision Transformer](https://huggingface.co/papers/2010.11929) introduced by Dosovitskiy et al. The [`Transformer2DModel`] accepts discrete (classes of vector embeddings) or continuous (actual embeddings) inputs. + +When the input is **continuous**: + +1. Project the input and reshape it to `(batch_size, sequence_length, feature_dimension)`. +2. Apply the Transformer blocks in the standard way. +3. Reshape to image. + +When the input is **discrete**: + +<Tip> + +It is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised image don't contain a prediction for the masked pixel because the unnoised image cannot be masked. + +</Tip> + +1. Convert input (classes of latent pixels) to embeddings and apply positional embeddings. +2. Apply the Transformer blocks in the standard way. +3. Predict classes of unnoised image. + +## Transformer2DModel + +[[autodoc]] Transformer2DModel + +## Transformer2DModelOutput + +[[autodoc]] models.transformer_2d.Transformer2DModelOutput diff --git a/docs/source/en/api/models/transformer_temporal.mdx new file mode 100644 index 000000000000..d67cf717f92b --- /dev/null +++ b/docs/source/en/api/models/transformer_temporal.mdx @@ -0,0 +1,11 @@ +# Transformer Temporal + +A Transformer model for video-like data. + +## TransformerTemporalModel + +[[autodoc]] models.transformer_temporal.TransformerTemporalModel + +## TransformerTemporalModelOutput + +[[autodoc]] models.transformer_temporal.TransformerTemporalModelOutput \ No newline at end of file diff --git a/docs/source/en/api/models/unet.mdx new file mode 100644 index 000000000000..9a488a3231a6 --- /dev/null +++ b/docs/source/en/api/models/unet.mdx @@ -0,0 +1,13 @@ +# UNet1DModel + +The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al. for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on its number of dimensions and whether it is a conditional model or not. This is a 1D UNet model. + +The abstract from the paper is: + +*There is large consent that successful training of deep networks requires many thousand annotated training samples. In this paper, we present a network and training strategy that relies on the strong use of data augmentation to use the available annotated samples more efficiently. The architecture consists of a contracting path to capture context and a symmetric expanding path that enables precise localization.
 We show that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks. Using the same network trained on transmitted light microscopy images (phase contrast and DIC) we won the ISBI cell tracking challenge 2015 in these categories by a large margin. Moreover, the network is fast. Segmentation of a 512x512 image takes less than a second on a recent GPU. The full implementation (based on Caffe) and the trained networks are available at http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net.* + +## UNet1DModel +[[autodoc]] UNet1DModel + +## UNet1DOutput +[[autodoc]] models.unet_1d.UNet1DOutput \ No newline at end of file diff --git a/docs/source/en/api/models/unet2d-cond.mdx new file mode 100644 index 000000000000..a669b02a7fe8 --- /dev/null +++ b/docs/source/en/api/models/unet2d-cond.mdx @@ -0,0 +1,19 @@ +# UNet2DConditionModel + +The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al. for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on its number of dimensions and whether it is a conditional model or not. This is a 2D UNet conditional model. + +The abstract from the paper is: + +*There is large consent that successful training of deep networks requires many thousand annotated training samples. In this paper, we present a network and training strategy that relies on the strong use of data augmentation to use the available annotated samples more efficiently. The architecture consists of a contracting path to capture context and a symmetric expanding path that enables precise localization. We show that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks. Using the same network trained on transmitted light microscopy images (phase contrast and DIC) we won the ISBI cell tracking challenge 2015 in these categories by a large margin. Moreover, the network is fast. Segmentation of a 512x512 image takes less than a second on a recent GPU.
 The full implementation (based on Caffe) and the trained networks are available at http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net.* + +## UNet2DConditionModel +[[autodoc]] UNet2DConditionModel + +## UNet2DConditionOutput +[[autodoc]] models.unet_2d_condition.UNet2DConditionOutput + +## FlaxUNet2DConditionModel +[[autodoc]] models.unet_2d_condition_flax.FlaxUNet2DConditionModel + +## FlaxUNet2DConditionOutput +[[autodoc]] models.unet_2d_condition_flax.FlaxUNet2DConditionOutput \ No newline at end of file diff --git a/docs/source/en/api/models/unet2d.mdx new file mode 100644 index 000000000000..29e8163f646c --- /dev/null +++ b/docs/source/en/api/models/unet2d.mdx @@ -0,0 +1,13 @@ +# UNet2DModel + +The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al. for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on its number of dimensions and whether it is a conditional model or not. This is a 2D UNet model. + +The abstract from the paper is: + +*There is large consent that successful training of deep networks requires many thousand annotated training samples. In this paper, we present a network and training strategy that relies on the strong use of data augmentation to use the available annotated samples more efficiently. The architecture consists of a contracting path to capture context and a symmetric expanding path that enables precise localization. We show that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks. Using the same network trained on transmitted light microscopy images (phase contrast and DIC) we won the ISBI cell tracking challenge 2015 in these categories by a large margin. Moreover, the network is fast. Segmentation of a 512x512 image takes less than a second on a recent GPU. The full implementation (based on Caffe) and the trained networks are available at http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net.* + +## UNet2DModel +[[autodoc]] UNet2DModel + +## UNet2DOutput +[[autodoc]] models.unet_2d.UNet2DOutput \ No newline at end of file diff --git a/docs/source/en/api/models/unet3d-cond.mdx new file mode 100644 index 000000000000..83dbb514c8dd --- /dev/null +++ b/docs/source/en/api/models/unet3d-cond.mdx @@ -0,0 +1,13 @@ +# UNet3DConditionModel + +The [UNet](https://huggingface.co/papers/1505.04597) model was originally introduced by Ronneberger et al. for biomedical image segmentation, but it is also commonly used in 🤗 Diffusers because it outputs images that are the same size as the input. It is one of the most important components of a diffusion system because it facilitates the actual diffusion process. There are several variants of the UNet model in 🤗 Diffusers, depending on its number of dimensions and whether it is a conditional model or not. This is a 3D UNet conditional model. + +The abstract from the paper is: + +*There is large consent that successful training of deep networks requires many thousand annotated training samples.
In this paper, we present a network and training strategy that relies on the strong use of data augmentation to use the available annotated samples more efficiently. The architecture consists of a contracting path to capture context and a symmetric expanding path that enables precise localization. We show that such a network can be trained end-to-end from very few images and outperforms the prior best method (a sliding-window convolutional network) on the ISBI challenge for segmentation of neuronal structures in electron microscopic stacks. Using the same network trained on transmitted light microscopy images (phase contrast and DIC) we won the ISBI cell tracking challenge 2015 in these categories by a large margin. Moreover, the network is fast. Segmentation of a 512x512 image takes less than a second on a recent GPU. The full implementation (based on Caffe) and the trained networks are available at http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net.* + +## UNet3DConditionModel +[[autodoc]] UNet3DConditionModel + +## UNet3DConditionOutput +[[autodoc]] models.unet_3d_condition.UNet3DConditionOutput \ No newline at end of file diff --git a/docs/source/en/api/models/vq.mdx b/docs/source/en/api/models/vq.mdx new file mode 100644 index 000000000000..cdb6761468a8 --- /dev/null +++ b/docs/source/en/api/models/vq.mdx @@ -0,0 +1,15 @@ +# VQModel + +The VQ-VAE model was introduced in [Neural Discrete Representation Learning](https://huggingface.co/papers/1711.00937) by Aaron van den Oord, Oriol Vinyals and Koray Kavukcuoglu. The model is used in 🤗 Diffusers to decode latent representations into images. Unlike [`AutoencoderKL`], the [`VQModel`] works in a quantized latent space. + +The abstract from the paper is: + +*Learning useful representations without supervision remains a key challenge in machine learning. In this paper, we propose a simple yet powerful generative model that learns such discrete representations. Our model, the Vector Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways: the encoder network outputs discrete, rather than continuous, codes; and the prior is learnt rather than static. In order to learn a discrete latent representation, we incorporate ideas from vector quantisation (VQ). Using the VQ method allows the model to circumvent issues of "posterior collapse" -- where the latents are ignored when they are paired with a powerful autoregressive decoder -- typically observed in the VAE framework. Pairing these representations with an autoregressive prior, the model can generate high quality images, videos, and speech as well as doing high quality speaker conversion and unsupervised learning of phonemes, providing further evidence of the utility of the learnt representations.* + +## VQModel + +[[autodoc]] VQModel + +## VQEncoderOutput + +[[autodoc]] models.vq_model.VQEncoderOutput \ No newline at end of file diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index 71785431320f..d61281a53e7c 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -39,24 +39,24 @@ class AutoencoderKLOutput(BaseOutput): class AutoencoderKL(ModelMixin, ConfigMixin): - r"""Variational Autoencoder (VAE) model with KL loss from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma - and Max Welling. + r""" + A VAE model with KL loss for encoding images into latents and decoding latent representations into images. - This model inherits from [`ModelMixin`]. 
Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) + This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented + for all models (such as downloading or saving). Parameters: in_channels (int, *optional*, defaults to 3): Number of channels in the input image. out_channels (int, *optional*, defaults to 3): Number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(64,)`): Tuple of block output channels. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space. - sample_size (`int`, *optional*, defaults to `32`): TODO + sample_size (`int`, *optional*, defaults to `32`): Sample input size. scaling_factor (`float`, *optional*, defaults to 0.18215): The component-wise standard deviation of the trained latent space computed using the first batch of the training set. This is used to scale the latent space to have unit variance when training the diffusion @@ -131,15 +131,15 @@ def _set_gradient_checkpointing(self, module, value=False): def enable_tiling(self, use_tiling: bool = True): r""" Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful to save a large amount of memory and to allow - the processing of larger images. + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. """ self.use_tiling = use_tiling def disable_tiling(self): r""" - Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to - computing decoding in one step. + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. """ self.enable_tiling(False) @@ -152,7 +152,7 @@ def enable_slicing(self): def disable_slicing(self): r""" - Disable sliced VAE decoding. If `enable_slicing` was previously invoked, this method will go back to computing + Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing decoding in one step. """ self.use_slicing = False @@ -185,11 +185,15 @@ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" + Sets the attention processor to use to compute attention.
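The tiling and slicing switches documented above can be combined; a minimal sketch, where the checkpoint name is an illustrative assumption:

```py
# Sketch only: decoding large latents with bounded memory via the
# tiling/slicing toggles documented above. The checkpoint is illustrative.
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
vae.enable_tiling()   # encode/decode in overlapping tiles
vae.enable_slicing()  # additionally process the batch one sample at a time

latents = torch.randn(2, 4, 128, 128)  # latents sized for two 1024x1024 images
with torch.no_grad():
    images = vae.decode(latents).sample  # tiled decoding is used internally

vae.disable_tiling()  # back to single-step decoding
```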
+ Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. """ count = len(self.attn_processors.keys()) @@ -274,14 +278,21 @@ def blend_h(self, a, b, blend_extent): def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput: r"""Encode a batch of images using a tiled encoder. - Args: When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several - steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is: - different from non-tiled encoding due to each tile using a different encoder. To avoid tiling artifacts, the + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the - look of the output, but they should be much less noticeable. - x (`torch.FloatTensor`): Input batch of images. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`AutoencoderKLOutput`] instead of a plain tuple. + output, but they should be much less noticeable. + + Args: + x (`torch.FloatTensor`): Input batch of images. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. + + Returns: + [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`: + If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain + `tuple` is returned. """ overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) @@ -319,17 +330,18 @@ def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> Autoen return AutoencoderKLOutput(latent_dist=posterior) def tiled_decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: - r"""Decode a batch of images using a tiled decoder. + r""" + Decode a batch of images using a tiled decoder. Args: - When this option is enabled, the VAE will split the input tensor into tiles to compute decoding in several - steps. This is useful to keep memory use constant regardless of image size. The end result of tiled decoding is: - different from non-tiled decoding due to each tile using a different decoder. To avoid tiling artifacts, the - tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the - look of the output, but they should be much less noticeable. - z (`torch.FloatTensor`): Input batch of latent vectors. 
return_dict (`bool`, *optional*, defaults to - `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + z (`torch.FloatTensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. """ overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) diff --git a/src/diffusers/models/controlnet.py b/src/diffusers/models/controlnet.py index 8660c3f9a5d3..b0f566020079 100644 --- a/src/diffusers/models/controlnet.py +++ b/src/diffusers/models/controlnet.py @@ -37,6 +37,20 @@ @dataclass class ControlNetOutput(BaseOutput): + """ + The output of [`ControlNetModel`]. + + Args: + down_block_res_samples (`tuple[torch.Tensor]`): + A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should + be of shape `(batch_size, channel * resolution, height // resolution, width // resolution)`. Output can be + used to condition the original UNet's downsampling activations. + mid_block_res_sample (`torch.Tensor`): + The activation of the middle block (the lowest sample resolution). The tensor should be of shape + `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`. + Output can be used to condition the original UNet's middle block activation. + """ + down_block_res_samples: Tuple[torch.Tensor] mid_block_res_sample: torch.Tensor @@ -87,6 +101,58 @@ def forward(self, conditioning): class ControlNetModel(ModelMixin, ConfigMixin): + """ + A ControlNet model. + + Args: + in_channels (`int`, defaults to 4): + The number of channels in the input sample. + flip_sin_to_cos (`bool`, defaults to `True`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, defaults to 0): + The frequency shift to apply to the time embedding. + down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`): + block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, defaults to 2): + The number of layers per block. + downsample_padding (`int`, defaults to 1): + The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, defaults to 1): + The scale factor to use for the mid block. + act_fn (`str`, defaults to `"silu"`): + The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use for the normalization. If None, normalization and activation layers are skipped + in post-processing. + norm_eps (`float`, defaults to 1e-5): + The epsilon to use for the normalization. + cross_attention_dim (`int`, defaults to 1280): + The dimension of the cross attention features. + attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8): + The dimension of the attention heads. + use_linear_projection (`bool`, defaults to `False`): + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None,
Choose from None, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + num_class_embeds (`int`, *optional*, defaults to 0): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + upcast_attention (`bool`, defaults to `False`): + resnet_time_scale_shift (`str`, defaults to `"default"`): + Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`. + projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`): + The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when + `class_embed_type="projection"`. + controlnet_conditioning_channel_order (`str`, defaults to `"rgb"`): + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. + conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`): + The tuple of output channel for each block in the `conditioning_embedding` layer. + global_pool_conditions (`bool`, defaults to `False`): + """ + _supports_gradient_checkpointing = True @register_to_config @@ -283,12 +349,12 @@ def from_unet( load_weights_from_unet: bool = True, ): r""" - Instantiate Controlnet class from UNet2DConditionModel. + Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`]. Parameters: unet (`UNet2DConditionModel`): - UNet model which weights are copied to the ControlNet. Note that all configuration options are also - copied where applicable. + The UNet model weights to copy to the [`ControlNetModel`]. All configuration options are also copied + where applicable. """ controlnet = cls( in_channels=unet.config.in_channels, @@ -357,11 +423,15 @@ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" + Sets the attention processor to use to compute attention. + Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. """ count = len(self.attn_processors.keys()) @@ -397,15 +467,15 @@ def set_attention_slice(self, slice_size): r""" Enable sliced attention computation. - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. 
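A short sketch of the `from_unet` and `set_attention_slice` APIs documented above; the checkpoint name is an assumption:

```py
# Sketch only: creating a ControlNet from an existing UNet, as described by
# `ControlNetModel.from_unet` above. The checkpoint is illustrative.
from diffusers import ControlNetModel, UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"
)
controlnet = ControlNetModel.from_unet(unet)  # copies config and matching weights
controlnet.set_attention_slice("auto")        # trade a little speed for memory
```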
Args: slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `num_attention_heads // slice_size`. In this case, - `num_attention_heads` must be a multiple of `slice_size`. + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. """ sliceable_head_dims = [] @@ -476,6 +546,37 @@ def forward( guess_mode: bool = False, return_dict: bool = True, ) -> Union[ControlNetOutput, Tuple]: + """ + The [`ControlNetModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor. + timestep (`Union[torch.Tensor, float, int]`): + The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): + The encoder hidden states. + controlnet_cond (`torch.FloatTensor`): + The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. + conditioning_scale (`float`, defaults to `1.0`): + The scale factor for ControlNet outputs. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond (`torch.Tensor`, *optional*, defaults to `None`): + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`): + A kwargs dictionary that, if specified, is passed along to the `AttnProcessor`. + guess_mode (`bool`, defaults to `False`): + In this mode, the ControlNet encoder tries its best to recognize the content of the input even if + you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple. + + Returns: + [`~models.controlnet.ControlNetOutput`] **or** `tuple`: + If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is + returned where the first element is the sample tensor. + """ # check channel order channel_order = self.config.controlnet_conditioning_channel_order diff --git a/src/diffusers/models/controlnet_flax.py b/src/diffusers/models/controlnet_flax.py index cff451edcdc5..a826df48e41a 100644 --- a/src/diffusers/models/controlnet_flax.py +++ b/src/diffusers/models/controlnet_flax.py @@ -32,6 +32,14 @@ @flax.struct.dataclass class FlaxControlNetOutput(BaseOutput): + """ + The output of [`FlaxControlNetModel`]. + + Args: + down_block_res_samples (`jnp.ndarray`): + mid_block_res_sample (`jnp.ndarray`): + """ + down_block_res_samples: jnp.ndarray mid_block_res_sample: jnp.ndarray @@ -95,21 +103,17 @@ def __call__(self, conditioning): @flax_register_to_config class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin): r""" - Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN - [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized - training.
This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the - convolution size. We use a tiny network E(·) of four convolution layers with 4 × 4 kernels and 2 × 2 strides - (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full - model) to encode image-space conditions ... into feature maps ..." - - This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) - - Also, this model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) - subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to + A ControlNet model. + + This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for its generic methods + implemented for all models (such as downloading or saving). + + This model is also a Flax Linen [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module) + subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its general usage and behavior. - Finally, this model supports inherent JAX features such as: + Inherent JAX features such as the following are supported: + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) @@ -120,9 +124,8 @@ class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin): The size of the input sample. in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample. - down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): - The tuple of downsample blocks to use. The corresponding class names will be: "FlaxCrossAttnDownBlock2D", - "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D" + down_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`): + The tuple of downsample blocks to use. block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): The tuple of output channels for each block. layers_per_block (`int`, *optional*, defaults to 2): Whether to flip the sin to cos in the time embedding. freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. controlnet_conditioning_channel_order (`str`, *optional*, defaults to `rgb`): - The channel order of conditional image. Will convert it to `rgb` if it's `bgr` + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. conditioning_embedding_out_channels (`tuple`, *optional*, defaults to `(16, 32, 96, 256)`): - The tuple of output channel for each block in conditioning_embedding layer - - + The tuple of output channels for each block in the `conditioning_embedding` layer.
""" sample_size: int = 32 in_channels: int = 4 diff --git a/src/diffusers/models/modeling_flax_utils.py b/src/diffusers/models/modeling_flax_utils.py index 58c492a974a3..9a6e1b3bba3d 100644 --- a/src/diffusers/models/modeling_flax_utils.py +++ b/src/diffusers/models/modeling_flax_utils.py @@ -44,10 +44,12 @@ class FlaxModelMixin: r""" - Base class for all flax models. + Base class for all Flax models. - [`FlaxModelMixin`] takes care of storing the configuration of the models and handles methods for loading, - downloading and saving models. + [`FlaxModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and + saving models. + + - **config_name** ([`str`]) -- Filename to save a model to when calling [`~FlaxModelMixin.save_pretrained`]. """ config_name = CONFIG_NAME _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] @@ -89,15 +91,15 @@ def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None): Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not cast the `params` in place. - This method can be used on TPU to explicitly convert the model parameters to bfloat16 precision to do full + This method can be used on a TPU to explicitly convert the model parameters to bfloat16 precision to do full half-precision training or to save weights in bfloat16 for inference in order to save memory and improve speed. Arguments: params (`Union[Dict, FrozenDict]`): A `PyTree` of model parameters. mask (`Union[Dict, FrozenDict]`): - A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params - you want to cast, and should be `False` for those you want to skip. + A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True` + for params you want to cast, and `False` for those you want to skip. Examples: @@ -132,8 +134,8 @@ def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None): params (`Union[Dict, FrozenDict]`): A `PyTree` of model parameters. mask (`Union[Dict, FrozenDict]`): - A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params - you want to cast, and should be `False` for those you want to skip + A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True` + for params you want to cast, and `False` for those you want to skip. Examples: @@ -155,15 +157,15 @@ def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None): Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the `params` in place. - This method can be used on GPU to explicitly convert the model parameters to float16 precision to do full + This method can be used on a GPU to explicitly convert the model parameters to float16 precision to do full half-precision training or to save weights in float16 for inference in order to save memory and improve speed. Arguments: params (`Union[Dict, FrozenDict]`): A `PyTree` of model parameters. mask (`Union[Dict, FrozenDict]`): - A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params - you want to cast, and should be `False` for those you want to skip + A `PyTree` with same structure as the `params` tree. The leaves should be booleans. It should be `True` + for params you want to cast, and `False` for those you want to skip. 
Examples: @@ -201,71 +203,68 @@ def from_pretrained( cls, pretrained_model_name_or_path: Union[str, os.PathLike], dtype: jnp.dtype = jnp.float32, *model_args, **kwargs, ): r""" - Instantiate a pretrained flax model from a pre-trained model configuration. - - The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come - pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning - task. - - The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those - weights are discarded. + Instantiate a pretrained Flax model from a pretrained model configuration. Parameters: pretrained_model_name_or_path (`str` or `os.PathLike`): Can be either: - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids are namespaced under a user or organization name, like - `runwayml/stable-diffusion-v1-5`. - - A path to a *directory* containing model weights saved using [`~ModelMixin.save_pretrained`], - e.g., `./my_model_directory/`. + - A string, the *model id* (for example `runwayml/stable-diffusion-v1-5`) of a pretrained model + hosted on the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + using [`~FlaxModelMixin.save_pretrained`]. dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and `jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If - specified all the computation will be performed with the given `dtype`. + specified, all the computation will be performed with the given `dtype`. + + <Tip> + + This only specifies the dtype of the *computation* and does not influence the dtype of model + parameters. - **Note that this only specifies the dtype of the computation and does not influence the dtype of model - parameters.** + If you wish to change the dtype of the model parameters, see [`~FlaxModelMixin.to_fp16`] and + [`~FlaxModelMixin.to_bf16`]. + + </Tip> - If you wish to change the dtype of the model parameters, see [`~ModelMixin.to_fp16`] and - [`~ModelMixin.to_bf16`]. model_args (sequence of positional arguments, *optional*): - All remaining positional arguments will be passed to the underlying model's `__init__` method. + All remaining positional arguments are passed to the underlying model's `__init__` method. cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted.
proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. from_pt (`bool`, *optional*, defaults to `False`): Load the model weights from a PyTorch checkpoint save file. kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., - `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + Can be used to update the configuration object (after it is loaded) and initiate the model (for + example, `output_attentions=True`). Behaves differently depending on whether a `config` is provided or automatically loaded: - - If a configuration is provided with `config`, `**kwargs` will be directly passed to the - underlying model's `__init__` method (we assume all relevant updates to the configuration have - already been done) - - If a configuration is not provided, `kwargs` will be first passed to the configuration class - initialization function ([`~ConfigMixin.from_config`]). Each key of `kwargs` that corresponds to - a configuration attribute will be used to override said attribute with the supplied `kwargs` - value. Remaining keys that do not correspond to any configuration attribute will be passed to the - underlying model's `__init__` function. + - If a configuration is provided with `config`, `kwargs` are directly passed to the underlying + model's `__init__` method (we assume all relevant updates to the configuration have already been + done). + - If a configuration is not provided, `kwargs` are first passed to the configuration class + initialization function [`~ConfigMixin.from_config`]. Each key of the `kwargs` that corresponds + to a configuration attribute is used to override said attribute with the supplied `kwargs` value. + Remaining keys that do not correspond to any configuration attribute are passed to the underlying + model's `__init__` function. Examples: @@ -276,7 +275,16 @@ def from_pretrained( >>> model, params = FlaxUNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5") >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable). 
>>> model, params = FlaxUNet2DConditionModel.from_pretrained("./test/saved_model/") - ```""" + ``` + + If you get the error message below, you need to finetune the weights for your downstream task: + + ```bash + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` + """ config = kwargs.pop("config", None) cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) force_download = kwargs.pop("force_download", False) @@ -491,18 +499,18 @@ def save_pretrained( is_main_process: bool = True, ): """ - Save a model and its configuration file to a directory, so that it can be re-loaded using the - `[`~FlaxModelMixin.from_pretrained`]` class method + Save a model and its configuration file to a directory so that it can be reloaded using the + [`~FlaxModelMixin.from_pretrained`] class method. Arguments: save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. + Directory to save a model and its configuration file to. Will be created if it doesn't exist. params (`Union[Dict, FrozenDict]`): A `PyTree` of model parameters. is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful when in distributed training like - TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on - the main process to avoid race conditions. + Whether the process calling this is the main process or not. Useful during distributed training when you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. """ if os.path.isfile(save_directory): logger.error(f"Provided path ({save_directory}) should be a directory, not a file") diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index 9e9b5cde4675..cc8df3fe6d69 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -154,11 +154,10 @@ class ModelMixin(torch.nn.Module): r""" Base class for all models. - [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, downloading - and saving models. + [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and + saving models. - - **config_name** ([`str`]) -- A filename under which the model should be stored when calling - [`~models.ModelMixin.save_pretrained`]. + - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`]. """ config_name = CONFIG_NAME _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] @@ -190,18 +189,13 @@ def __getattr__(self, name: str) -> Any: def is_gradient_checkpointing(self) -> bool: """ Whether gradient checkpointing is activated for this model or not. - - Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint - activations".
""" return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules()) def enable_gradient_checkpointing(self): """ - Activates gradient checkpointing for the current model. - - Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint - activations". + Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or + *checkpoint activations* in other frameworks). """ if not self._supports_gradient_checkpointing: raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") @@ -209,10 +203,8 @@ def enable_gradient_checkpointing(self): def disable_gradient_checkpointing(self): """ - Deactivates gradient checkpointing for the current model. - - Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint - activations". + Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or + *checkpoint activations* in other frameworks). """ if self._supports_gradient_checkpointing: self.apply(partial(self._set_gradient_checkpointing, value=False)) @@ -236,13 +228,17 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None): r""" - Enable memory efficient attention as implemented in xformers. + Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). - When this option is enabled, you should observe lower GPU memory usage and a potential speed up at inference - time. Speed up at training time is not guaranteed. + When this option is enabled, you should observe lower GPU memory usage and a potential speed up during + inference. Speed up during training is not guaranteed. - Warning: When Memory Efficient Attention and Sliced attention are both enabled, the Memory Efficient Attention - is used. + + + ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes + precedent. + + Parameters: attention_op (`Callable`, *optional*): @@ -268,7 +264,7 @@ def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Call def disable_xformers_memory_efficient_attention(self): r""" - Disable memory efficient attention as implemented in xformers. + Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). """ self.set_use_memory_efficient_attention_xformers(False) @@ -281,24 +277,24 @@ def save_pretrained( variant: Optional[str] = None, ): """ - Save a model and its configuration file to a directory, so that it can be re-loaded using the - `[`~models.ModelMixin.from_pretrained`]` class method. + Save a model and its configuration file to a directory so that it can be reloaded using the + [`~models.ModelMixin.from_pretrained`] class method. Arguments: save_directory (`str` or `os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. + Directory to save a model and its configuration file to. Will be created if it doesn't exist. is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. Useful when in distributed training like - TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on - the main process to avoid race conditions. + Whether the process calling this is the main process or not. 
Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. save_function (`Callable`): - The function to use to save the state dictionary. Useful on distributed training like TPUs when one - need to replace `torch.save` by another method. Can be configured with the environment variable + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method. Can be configured with the environment variable `DIFFUSERS_SAVE_MODE`. safe_serialization (`bool`, *optional*, defaults to `False`): - Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. variant (`str`, *optional*): - If specified, weights are saved in the format pytorch_model..bin. + If specified, weights are saved in the format `pytorch_model..bin`. """ if safe_serialization and not is_safetensors_available(): raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") @@ -335,107 +331,108 @@ def save_pretrained( @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" - Instantiate a pretrained pytorch model from a pre-trained model configuration. + Instantiate a pretrained PyTorch model from a pretrained model configuration. - The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train - the model, you should first set it back in training mode with `model.train()`. - - The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come - pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning - task. - - The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those - weights are discarded. + The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To + train the model, set it back in training mode with `model.train()`. Parameters: pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): Can be either: - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids should have an organization name, like `google/ddpm-celebahq-256`. - - A path to a *directory* containing model weights saved using [`~ModelMixin.save_config`], e.g., - `./my_model_directory/`. + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`~ModelMixin.save_pretrained`]. cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. torch_dtype (`str` or `torch.dtype`, *optional*): - Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the dtype - will be automatically derived from the model's weights. + Override the default `torch.dtype` and load the model with another dtype. 
If `"auto"` is passed, the + dtype is automatically derived from the model's weights. force_download (`bool`, *optional*, defaults to `False`): Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. resume_download (`bool`, *optional*, defaults to `False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. + Whether or not to resume downloading the model weights and configuration files. If set to `False`, any + incompletely downloaded files are deleted. proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): + output_loading_info (`bool`, *optional*, defaults to `False`): Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. local_files_only(`bool`, *optional*, defaults to `False`): - Whether or not to only look at local files (i.e., do not try to download the model). + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. use_auth_token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated - when running `diffusers-cli login` (stored in `~/.huggingface`). + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any - identifier allowed by git. + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. from_flax (`bool`, *optional*, defaults to `False`): Load the model weights from a Flax checkpoint save file. subfolder (`str`, *optional*, defaults to `""`): - In case the relevant files are located inside a subfolder of the model repo (either remote in - huggingface.co or downloaded locally), you can specify the folder name here. - + The subfolder location of a model file within a larger model repository on the Hub or locally. mirror (`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): - A map that specifies where each submodule should go. It doesn't need to be refined to each - parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the + A map that specifies where each submodule should go. 
It doesn't need to be defined for each + parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the same device. - To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For + Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For more information about each option see [designing a device map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). max_memory (`Dict`, *optional*): - A dictionary device identifier to maximum memory. Will default to the maximum memory available for each - GPU and the available CPU RAM if unset. + A dictionary device identifier for the maximum memory. Will default to the maximum memory available for + each GPU and the available CPU RAM if unset. offload_folder (`str` or `os.PathLike`, *optional*): - If the `device_map` contains any value `"disk"`, the folder where we will offload weights. + The path to offload weights if `device_map` contains the value `"disk"`. offload_state_dict (`bool`, *optional*): - If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU - RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to - `True` when there is some disk offload. + If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if + the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` + when there is some disk offload. low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): - Speed up model loading by not initializing the weights and only loading the pre-trained weights. This - also tries to not use more than 1x model size in CPU memory (including peak memory) while loading the - model. This is only supported when torch version >= 1.9.0. If you are using an older version of torch, - setting this argument to `True` will raise an error. + Speed up model loading by only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. variant (`str`, *optional*): - If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. `variant` is - ignored when using `from_flax`. + Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. use_safetensors (`bool`, *optional*, defaults to `None`): - If set to `None`, the `safetensors` weights will be downloaded if they're available **and** if the - `safetensors` library is installed. If set to `True`, the model will be forcibly loaded from - `safetensors` weights. If set to `False`, loading will *not* use `safetensors`. + If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the + `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors` + weights. If set to `False`, `safetensors` weights are not loaded. - It is required to be logged in (`huggingface-cli login`) when you want to use private or [gated - models](https://huggingface.co/docs/hub/models-gated#gated-models).
+ To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log in with + `huggingface-cli login`. You can also activate the special + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + firewalled environment. - + Example: - Activate the special ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use - this method in a firewalled environment. + ```py + from diffusers import UNet2DConditionModel - + unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") + ``` + + If you get the error message below, you need to finetune the weights for your downstream task: + ```bash + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` """ cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) @@ -852,17 +849,27 @@ def dtype(self) -> torch.dtype: def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: """ - Get number of (optionally, trainable or non-embeddings) parameters in the module. + Get number of (trainable or non-embedding) parameters in the module. Args: only_trainable (`bool`, *optional*, defaults to `False`): - Whether or not to return only the number of trainable parameters - + Whether or not to return only the number of trainable parameters. exclude_embeddings (`bool`, *optional*, defaults to `False`): - Whether or not to return only the number of non-embeddings parameters + Whether or not to return only the number of non-embedding parameters. Returns: `int`: The number of parameters. + + Example: + + ```py + from diffusers import UNet2DConditionModel + + model_id = "runwayml/stable-diffusion-v1-5" + unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet") + unet.num_parameters(only_trainable=True) + 859520964 + ``` """ if exclude_embeddings: diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 58804f2672b9..47785a93e939 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -16,6 +16,8 @@ @dataclass class PriorTransformerOutput(BaseOutput): """ + The output of [`PriorTransformer`]. + Args: predicted_image_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): The predicted CLIP image embedding conditioned on the CLIP text embedding input. @@ -26,27 +28,20 @@ class PriorTransformer(ModelMixin, ConfigMixin): """ - The prior transformer from unCLIP is used to predict CLIP image embeddings from CLIP text embeddings. Note that the - transformer predicts the image embeddings through a denoising diffusion process. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) - - For more details, see the original paper: https://arxiv.org/abs/2204.06125 + A Prior Transformer model.
Parameters: num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention. attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head. num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use. - embedding_dim (`int`, *optional*, defaults to 768): The dimension of the CLIP embeddings. Note that CLIP - image embeddings and text embeddings are both the same dimension. - num_embeddings (`int`, *optional*, defaults to 77): The max number of clip embeddings allowed. I.e. the - length of the prompt after it has been tokenized. + embedding_dim (`int`, *optional*, defaults to 768): + The dimension of the CLIP embeddings. Image embeddings and text embeddings are both the same dimension. + num_embeddings (`int`, *optional*, defaults to 77): The max number of CLIP embeddings allowed (the + length of the prompt after it has been tokenized). additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the - projected hidden_states. The actual length of the used hidden_states is `num_embeddings + + projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings + additional_embeddings`. dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - """ @register_to_config @@ -133,11 +128,15 @@ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" + Sets the attention processor to use to compute attention. + Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. """ count = len(self.attn_processors.keys()) @@ -178,10 +177,12 @@ def forward( return_dict: bool = True, ): """ + The [`PriorTransformer`] forward method. + Args: hidden_states (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): - x_t, the currently predicted image embeddings. - timestep (`torch.long`): + The currently predicted image embeddings. + timestep (`torch.LongTensor`): Current denoising step. proj_embedding (`torch.FloatTensor` of shape `(batch_size, embedding_dim)`): Projected embedding vector the denoising process is conditioned on. @@ -190,13 +191,13 @@ def forward( attention_mask (`torch.BoolTensor` of shape `(batch_size, num_embeddings)`): Text mask for the text embeddings. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.prior_transformer.PriorTransformerOutput`] instead of a plain + Whether or not to return a [`~models.prior_transformer.PriorTransformerOutput`] instead of a plain tuple. 
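For orientation, a minimal sketch of the forward call documented above, using a tiny, randomly initialized configuration; every size and value here is illustrative (real checkpoints use the documented defaults such as `embedding_dim=768` and `num_embeddings=77`), not a recommended setup:

```py
import torch
from diffusers.models.prior_transformer import PriorTransformer

# Tiny config purely for illustration; the keyword arguments mirror the Parameters listed above.
prior = PriorTransformer(
    num_attention_heads=2,
    attention_head_dim=4,
    num_layers=2,
    embedding_dim=8,
    num_embeddings=7,
)

batch_size = 1
hidden_states = torch.randn(batch_size, 8)                 # currently predicted (noisy) image embedding
proj_embedding = torch.randn(batch_size, 8)                # embedding the denoising process is conditioned on
encoder_hidden_states = torch.randn(batch_size, 7, 8)      # text encoder hidden states
attention_mask = torch.ones(batch_size, 7, dtype=torch.bool)

out = prior(
    hidden_states,
    timestep=1,
    proj_embedding=proj_embedding,
    encoder_hidden_states=encoder_hidden_states,
    attention_mask=attention_mask,
)
print(out.predicted_image_embedding.shape)                 # (batch_size, embedding_dim)
```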
Returns: [`~models.prior_transformer.PriorTransformerOutput`] or `tuple`: - [`~models.prior_transformer.PriorTransformerOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. + If return_dict is True, a [`~models.prior_transformer.PriorTransformerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. """ batch_size = hidden_states.shape[0] diff --git a/src/diffusers/models/transformer_2d.py b/src/diffusers/models/transformer_2d.py index ec4cb371845f..83da16838ae2 100644 --- a/src/diffusers/models/transformer_2d.py +++ b/src/diffusers/models/transformer_2d.py @@ -29,10 +29,12 @@ @dataclass class Transformer2DModelOutput(BaseOutput): """ + The output of [`Transformer2DModel`]. + Args: sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or `(batch size, num_vector_embeds - 1, num_latent_pixels)` if [`Transformer2DModel`] is discrete): - Hidden states conditioned on `encoder_hidden_states` input. If discrete, returns probability distributions - for the unnoised latent pixels. + The hidden states output conditioned on the `encoder_hidden_states` input. If discrete, returns probability + distributions for the unnoised latent pixels. """ sample: torch.FloatTensor @@ -40,40 +42,30 @@ class Transformer2DModelOutput(BaseOutput): class Transformer2DModel(ModelMixin, ConfigMixin): """ - Transformer model for image-like data. Takes either discrete (classes of vector embeddings) or continuous (actual - embeddings) inputs. - - When input is continuous: First, project the input (aka embedding) and reshape to b, t, d. Then apply standard - transformer action. Finally, reshape to image. - - When input is discrete: First, input (classes of latent pixels) is converted to embeddings and has positional - embeddings applied, see `ImagePositionalEmbeddings`. Then apply standard transformer action. Finally, predict - classes of unnoised image. - - Note that it is assumed one of the input classes is the masked latent pixel. The predicted classes of the unnoised - image do not contain a prediction for the masked pixel as the unnoised image cannot be masked. + A 2D Transformer model for image-like data. Parameters: num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. in_channels (`int`, *optional*): - Pass if the input is continuous. The number of channels in the input and output. + The number of channels in the input and output (specify if the input is **continuous**). num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. - sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. - Note that this is fixed at training time as it is used for learning a number of position embeddings. See - `ImagePositionalEmbeddings`. + cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. + sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**). + This is fixed during training since it is used to learn a number of position embeddings. 
num_vector_embeds (`int`, *optional*): - Pass if the input is discrete. The number of classes of the vector embeddings of the latent pixels. + The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**). Includes the class for the masked latent pixel. - activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. - num_embeds_ada_norm ( `int`, *optional*): Pass if at least one of the norm_layers is `AdaLayerNorm`. - The number of diffusion steps used during training. Note that this is fixed at training time as it is used - to learn a number of embeddings that are added to the hidden states. During inference, you can denoise for - up to but not more than steps than `num_embeds_ada_norm`. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward. + num_embeds_ada_norm ( `int`, *optional*): + The number of diffusion steps used during training. Pass if at least one of the norm_layers is + `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are + added to the hidden states. + + During inference, you can denoise for up to but not more steps than `num_embeds_ada_norm`. attention_bias (`bool`, *optional*): - Configure if the TransformerBlocks' attention should contain a bias parameter. + Configure if the `TransformerBlocks` attention should contain a bias parameter. """ @register_to_config @@ -223,31 +215,34 @@ def forward( return_dict: bool = True, ): """ + The [`Transformer2DModel`] forward method. + Args: - hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. - When continuous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input - hidden_states + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + Input `hidden_states`. encoder_hidden_states ( `torch.FloatTensor` of shape `(batch size, sequence len, embed dims)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. timestep ( `torch.LongTensor`, *optional*): - Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. + Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`. class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*): - Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels - conditioning. - encoder_attention_mask ( `torch.Tensor`, *optional* ). - Cross-attention mask, applied to encoder_hidden_states. Two formats supported: - Mask `(batch, sequence_length)` True = keep, False = discard. Bias `(batch, 1, sequence_length)` 0 - = keep, -10000 = discard. - If ndim == 2: will be interpreted as a mask, then converted into a bias consistent with the format + Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in + `AdaLayerZeroNorm`. + encoder_attention_mask ( `torch.Tensor`, *optional*): + Cross-attention mask applied to `encoder_hidden_states`. Two formats supported: + + * Mask `(batch, sequence_length)` True = keep, False = discard. + * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard. 
+ + If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format above. This bias will be added to the cross-attention scores. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. Returns: - [`~models.transformer_2d.Transformer2DModelOutput`] or `tuple`: - [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. + If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a + `tuple` where the first element is the sample tensor. """ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension. # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward. diff --git a/src/diffusers/models/transformer_temporal.py b/src/diffusers/models/transformer_temporal.py index ece88b8db2d5..cfafdb055bcf 100644 --- a/src/diffusers/models/transformer_temporal.py +++ b/src/diffusers/models/transformer_temporal.py @@ -26,9 +26,11 @@ @dataclass class TransformerTemporalModelOutput(BaseOutput): """ + The output of [`TransformerTemporalModel`]. + Args: - sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`) - Hidden states conditioned on `encoder_hidden_states` input. + sample (`torch.FloatTensor` of shape `(batch_size x num_frames, num_channels, height, width)`): + The hidden states output conditioned on `encoder_hidden_states` input. """ sample: torch.FloatTensor @@ -36,24 +38,23 @@ class TransformerTemporalModelOutput(BaseOutput): class TransformerTemporalModel(ModelMixin, ConfigMixin): """ - Transformer model for video-like data. + A Transformer model for video-like data. Parameters: num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention. attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head. in_channels (`int`, *optional*): - Pass if the input is continuous. The number of channels in the input and output. + The number of channels in the input and output (specify if the input is **continuous**). num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use. dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. - cross_attention_dim (`int`, *optional*): The number of encoder_hidden_states dimensions to use. - sample_size (`int`, *optional*): Pass if the input is discrete. The width of the latent images. - Note that this is fixed at training time as it is used for learning a number of position embeddings. See - `ImagePositionalEmbeddings`. - activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use. + sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**). + This is fixed during training since it is used to learn a number of position embeddings. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward. attention_bias (`bool`, *optional*): - Configure if the TransformerBlocks' attention should contain a bias parameter. 
+ Configure if the `TransformerBlock` attention should contain a bias parameter. double_self_attention (`bool`, *optional*): - Configure if each TransformerBlock should contain two self-attention layers + Configure if each `TransformerBlock` should contain two self-attention layers. """ @register_to_config @@ -114,25 +115,27 @@ def forward( return_dict: bool = True, ): """ + The [`TransformerTemporal`] forward method. + Args: - hidden_states ( When discrete, `torch.LongTensor` of shape `(batch size, num latent pixels)`. - When continous, `torch.FloatTensor` of shape `(batch size, channel, height, width)`): Input - hidden_states + hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous): + Input hidden_states. encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*): Conditional embeddings for cross attention layer. If not given, cross-attention defaults to self-attention. timestep ( `torch.long`, *optional*): - Optional timestep to be applied as an embedding in AdaLayerNorm's. Used to indicate denoising step. + Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`. class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*): - Optional class labels to be applied as an embedding in AdaLayerZeroNorm. Used to indicate class labels - conditioning. + Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in + `AdaLayerZeroNorm`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. Returns: - [`~models.transformer_2d.TransformerTemporalModelOutput`] or `tuple`: - [`~models.transformer_2d.TransformerTemporalModelOutput`] if `return_dict` is True, otherwise a `tuple`. - When returning a tuple, the first element is the sample tensor. + [`~models.transformer_temporal.TransformerTemporalModelOutput`] or `tuple`: + If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is + returned, otherwise a `tuple` where the first element is the sample tensor. """ # 1. Input batch_frames, channel, height, width = hidden_states.shape diff --git a/src/diffusers/models/unet_1d.py b/src/diffusers/models/unet_1d.py index 34a1d2b5160e..9b617388f391 100644 --- a/src/diffusers/models/unet_1d.py +++ b/src/diffusers/models/unet_1d.py @@ -28,9 +28,11 @@ @dataclass class UNet1DOutput(BaseOutput): """ + The output of [`UNet1DModel`]. + Args: sample (`torch.FloatTensor` of shape `(batch_size, num_channels, sample_size)`): - Hidden states output. Output of last layer of model. + The hidden states output from the last layer of the model. """ sample: torch.FloatTensor @@ -38,10 +40,10 @@ class UNet1DOutput(BaseOutput): class UNet1DModel(ModelMixin, ConfigMixin): r""" - UNet1DModel is a 1D UNet model that takes in a noisy sample and a timestep and returns sample shaped output. + A 1D UNet model that takes a noisy sample and a timestep and returns a sample shaped output. - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) + This model inherits from [`ModelMixin`]. 
Check the superclass documentation for its generic methods implemented + for all models (such as downloading or saving). Parameters: sample_size (`int`, *optional*): Default length of sample. Should be adaptable at runtime. @@ -49,24 +51,24 @@ class UNet1DModel(ModelMixin, ConfigMixin): out_channels (`int`, *optional*, defaults to 2): Number of channels in the output. extra_in_channels (`int`, *optional*, defaults to 0): Number of additional channels to be added to the input of the first down block. Useful for cases where the - input data has more channels than what the model is initially designed for. + input data has more channels than what the model was initially designed for. time_embedding_type (`str`, *optional*, defaults to `"fourier"`): Type of time embedding to use. - freq_shift (`float`, *optional*, defaults to 0.0): Frequency shift for fourier time embedding. - flip_sin_to_cos (`bool`, *optional*, defaults to : - obj:`False`): Whether to flip sin to cos for fourier time embedding. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownBlock1D", "DownBlock1DNoSkip", "AttnDownBlock1D")`): Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("UpBlock1D", "UpBlock1DNoSkip", "AttnUpBlock1D")`): Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(32, 32, 64)`): Tuple of block output channels. - mid_block_type (`str`, *optional*, defaults to "UNetMidBlock1D"): block type for middle of UNet. - out_block_type (`str`, *optional*, defaults to `None`): optional output processing of UNet. - act_fn (`str`, *optional*, defaults to None): optional activation function in UNet blocks. - norm_num_groups (`int`, *optional*, defaults to 8): group norm member count in UNet blocks. - layers_per_block (`int`, *optional*, defaults to 1): added number of layers in a UNet block. - downsample_each_block (`int`, *optional*, defaults to False: - experimental feature for using a UNet without upsampling. + freq_shift (`float`, *optional*, defaults to 0.0): Frequency shift for Fourier time embedding. + flip_sin_to_cos (`bool`, *optional*, defaults to `False`): + Whether to flip sin to cos for Fourier time embedding. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock1D", "DownBlock1DNoSkip", "AttnDownBlock1D")`): + Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock1D", "UpBlock1DNoSkip", "AttnUpBlock1D")`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(32, 32, 64)`): + Tuple of block output channels. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock1D"`): Block type for middle of UNet. + out_block_type (`str`, *optional*, defaults to `None`): Optional output processing block of UNet. + act_fn (`str`, *optional*, defaults to `None`): Optional activation function in UNet blocks. + norm_num_groups (`int`, *optional*, defaults to 8): The number of groups for normalization. + layers_per_block (`int`, *optional*, defaults to 1): The number of layers per block. + downsample_each_block (`int`, *optional*, defaults to `False`): + Experimental feature for using a UNet without upsampling. """ @register_to_config @@ -197,15 +199,19 @@ def forward( return_dict: bool = True, ) -> Union[UNet1DOutput, Tuple]: r""" + The [`UNet1DModel`] forward method.
+ Args: - sample (`torch.FloatTensor`): `(batch_size, num_channels, sample_size)` noisy inputs tensor - timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch_size, num_channels, sample_size)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.unet_1d.UNet1DOutput`] instead of a plain tuple. Returns: - [`~models.unet_1d.UNet1DOutput`] or `tuple`: [`~models.unet_1d.UNet1DOutput`] if `return_dict` is True, - otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. + [`~models.unet_1d.UNet1DOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_1d.UNet1DOutput`] is returned, otherwise a `tuple` is + returned where the first element is the sample tensor. """ # 1. time diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 4a752fa94a99..7077aa889190 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -27,9 +27,11 @@ @dataclass class UNet2DOutput(BaseOutput): """ + The output of [`UNet2DModel`]. + Args: sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Hidden states output. Output of last layer of model. + The hidden states output from the last layer of the model. """ sample: torch.FloatTensor @@ -37,46 +39,45 @@ class UNet2DModel(ModelMixin, ConfigMixin): r""" - UNet2DModel is a 2D UNet model that takes in a noisy sample and a timestep and returns sample shaped output. + A 2D UNet model that takes a noisy sample and a timestep and returns a sample shaped output. - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) + This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented + for all models (such as downloading or saving). Parameters: sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): Height and width of input/output sample. Dimensions must be a multiple of `2 ** (len(block_out_channels) - 1)`. - in_channels (`int`, *optional*, defaults to 3): Number of channels in the input image. + in_channels (`int`, *optional*, defaults to 3): Number of channels in the input sample. out_channels (`int`, *optional*, defaults to 3): Number of channels in the output. center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. time_embedding_type (`str`, *optional*, defaults to `"positional"`): Type of time embedding to use. - freq_shift (`int`, *optional*, defaults to 0): Frequency shift for fourier time embedding. - flip_sin_to_cos (`bool`, *optional*, defaults to : - obj:`True`): Whether to flip sin to cos for fourier time embedding. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`): Tuple of downsample block - types. + freq_shift (`int`, *optional*, defaults to 0): Frequency shift for Fourier time embedding. + flip_sin_to_cos (`bool`, *optional*, defaults to `True`): + Whether to flip sin to cos for Fourier time embedding.
+ down_block_types (`Tuple[str]`, *optional*, defaults to `("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D")`): + Tuple of downsample block types. mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2D"`): - The mid block type. Choose from `UNetMidBlock2D` or `UnCLIPUNetMidBlock2D`. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`): Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(224, 448, 672, 896)`): Tuple of block output channels. + Block type for middle of UNet, it can be either `UNetMidBlock2D` or `UnCLIPUNetMidBlock2D`. + up_block_types (`Tuple[str]`, *optional*, defaults to `("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D")`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(224, 448, 672, 896)`): + Tuple of block output channels. layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block. mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block. downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution. act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension. - norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for the normalization. - norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for the normalization. + norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization. + norm_eps (`float`, *optional*, defaults to `1e-5`): The epsilon for normalization. resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config - for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. - class_embed_type (`str`, *optional*, defaults to None): + for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, `"timestep"`, or `"identity"`. - num_class_embeds (`int`, *optional*, defaults to None): - Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing - class conditioning with `class_embed_type` equal to `None`. + num_class_embeds (`int`, *optional*, defaults to `None`): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim` when performing class + conditioning with `class_embed_type` equal to `None`. """ @register_to_config @@ -224,17 +225,21 @@ def forward( return_dict: bool = True, ) -> Union[UNet2DOutput, Tuple]: r""" + The [`UNet2DModel`] forward method. + Args: - sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor - timestep (`torch.FloatTensor` or `float` or `int): (batch) timesteps + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. class_labels (`torch.FloatTensor`, *optional*, defaults to `None`): Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. 
return_dict (`bool`, *optional*, defaults to `True`): Whether or not to return a [`~models.unet_2d.UNet2DOutput`] instead of a plain tuple. Returns: - [`~models.unet_2d.UNet2DOutput`] or `tuple`: [`~models.unet_2d.UNet2DOutput`] if `return_dict` is True, - otherwise a `tuple`. When returning a tuple, the first element is the sample tensor. + [`~models.unet_2d.UNet2DOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_2d.UNet2DOutput`] is returned, otherwise a `tuple` is + returned where the first element is the sample tensor. """ # 0. center input if necessary if self.config.center_input_sample: diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 7bca5c336c57..868511ef6625 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -50,9 +50,11 @@ @dataclass class UNet2DConditionOutput(BaseOutput): """ + The output of [`UNet2DConditionModel`]. + Args: sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model. + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ sample: torch.FloatTensor @@ -60,17 +62,17 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" - UNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep - and returns sample shaped output. + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) + This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented + for all models (such as downloading or saving). Parameters: sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): Height and width of input/output sample. - in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample. - out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. + in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. flip_sin_to_cos (`bool`, *optional*, defaults to `False`): Whether to flip the sin to cos in the time embedding. @@ -78,9 +80,9 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): The tuple of downsample blocks to use. mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): - The mid block type. Choose from `UNetMidBlock2DCrossAttn` or `UNetMidBlock2DSimpleCrossAttn`, will skip the - mid block layer if `None`. - up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`): + Block type for middle of UNet, it can be either `UNetMidBlock2DCrossAttn` or + `UNetMidBlock2DSimpleCrossAttn`.
If `None`, the mid block layer is skipped. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): The tuple of upsample blocks to use. only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): Whether to include self-attention in the basic transformer blocks, see @@ -92,52 +94,52 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. - If `None`, it will skip the normalization and activation layers in post-processing + If `None`, normalization and activation layers are skipped in post-processing. norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): The dimension of the cross attention features. - encoder_hid_dim (`int`, *optional*, defaults to None): + encoder_hid_dim (`int`, *optional*, defaults to `None`): If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` dimension to `cross_attention_dim`. - encoder_hid_dim_type (`str`, *optional*, defaults to None): - If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. num_attention_heads (`int`, *optional*): The number of attention heads. If not defined, defaults to `attention_head_dim` resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config - for resnet blocks, see [`~models.resnet.ResnetBlock2D`]. Choose from `default` or `scale_shift`. - class_embed_type (`str`, *optional*, defaults to None): + for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. - addition_embed_type (`str`, *optional*, defaults to None): + addition_embed_type (`str`, *optional*, defaults to `None`): Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or "text". "text" will use the `TextTimeEmbedding` layer. - num_class_embeds (`int`, *optional*, defaults to None): + num_class_embeds (`int`, *optional*, defaults to `None`): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. - time_embedding_type (`str`, *optional*, default to `positional`): + time_embedding_type (`str`, *optional*, defaults to `positional`): The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
- time_embedding_dim (`int`, *optional*, default to `None`): + time_embedding_dim (`int`, *optional*, defaults to `None`): An optional override for the dimension of the projected time embedding. - time_embedding_act_fn (`str`, *optional*, default to `None`): - Optional activation function to use on the time embeddings only one time before they as passed to the rest - of the unet. Choose from `silu`, `mish`, `gelu`, and `swish`. - timestep_post_act (`str, *optional*, default to `None`): + time_embedding_act_fn (`str`, *optional*, defaults to `None`): + Optional activation function to use only once on the time embeddings before they are passed to the rest of + the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`. + timestep_post_act (`str`, *optional*, defaults to `None`): The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. - time_cond_proj_dim (`int`, *optional*, default to `None`): - The dimension of `cond_proj` layer in timestep embedding. + time_cond_proj_dim (`int`, *optional*, defaults to `None`): + The dimension of `cond_proj` layer in the timestep embedding. conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when - using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. + `class_embed_type="projection"`. Required when `class_embed_type="projection"`. class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time embeddings with the class embeddings. mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If - `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is None, the - `only_cross_attention` value will be used as the value for `mid_block_only_cross_attention`. Else, it will - default to `False`. + `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the + `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Defaults to `False` + otherwise. """ _supports_gradient_checkpointing = True @@ -551,11 +553,15 @@ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" + Sets the attention processor to use to compute attention. + Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. """ count = len(self.attn_processors.keys()) @@ -589,15 +595,15 @@ def set_attention_slice(self, slice_size): r""" Enable sliced attention computation.
- When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. Args: slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `num_attention_heads // slice_size`. In this case, - `num_attention_heads` must be a multiple of `slice_size`. + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. """ sliceable_head_dims = [] @@ -670,29 +676,28 @@ def forward( return_dict: bool = True, ) -> Union[UNet2DConditionOutput, Tuple]: r""" + The [`UNet2DConditionModel`] forward method. + Args: - sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor - timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps - encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. encoder_attention_mask (`torch.Tensor`): - (batch, sequence_length) cross-attention mask, applied to encoder_hidden_states. True = keep, False = - discard. Mask will be converted into a bias, which adds large negative values to attention scores - corresponding to "discard" tokens. + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - added_cond_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified includes additonal conditions that can be used for additonal time - embeddings or encoder hidden states projections. See the configurations `encoder_hid_dim_type` and - `addition_embed_type` for more information. + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. 
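For illustration, a minimal sketch of the forward call described by the Args above, reusing the Stable Diffusion v1-5 UNet from the earlier example; the checkpoint, latent size, and timestep value are placeholders:

```py
import torch
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")

sample = torch.randn(1, unet.config.in_channels, 64, 64)   # (batch, channel, height, width) noisy latents
encoder_hidden_states = torch.randn(1, 77, 768)            # (batch, sequence_length, feature_dim) text conditioning

with torch.no_grad():
    out = unet(sample, timestep=10, encoder_hidden_states=encoder_hidden_states)

print(out.sample.shape)  # same batch and spatial shape as the input sample
```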
Returns: [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. + If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). diff --git a/src/diffusers/models/unet_2d_condition_flax.py b/src/diffusers/models/unet_2d_condition_flax.py index 73f7e9263ff2..352b0b1b5e10 100644 --- a/src/diffusers/models/unet_2d_condition_flax.py +++ b/src/diffusers/models/unet_2d_condition_flax.py @@ -35,9 +35,11 @@ @flax.struct.dataclass class FlaxUNet2DConditionOutput(BaseOutput): """ + The output of [`FlaxUNet2DConditionModel`]. + Args: sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`): - Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model. + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ sample: jnp.ndarray @@ -46,17 +48,17 @@ @flax_register_to_config class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin): r""" - FlaxUNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a - timestep and returns sample shaped output. + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. - This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) + This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for its generic methods + implemented for all models (such as downloading or saving). - Also, this model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) - subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to + This model is also a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) + subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matters related to its general usage and behavior. - Finally, this model supports inherent JAX features such as: + Inherent JAX features such as the following are supported: - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) @@ -69,12 +71,10 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin): The number of channels in the input sample. out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): - The tuple of downsample blocks to use.
The corresponding class names will be: "FlaxCrossAttnDownBlock2D", - "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D" - up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`): - The tuple of upsample blocks to use. The corresponding class names will be: "FlaxUpBlock2D", - "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D" + down_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxCrossAttnDownBlock2D", "FlaxDownBlock2D")`): + The tuple of downsample blocks to use. + up_block_types (`Tuple[str]`, *optional*, defaults to `("FlaxUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D", "FlaxCrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): The tuple of output channels for each block. layers_per_block (`int`, *optional*, defaults to 2): @@ -91,8 +91,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin): Whether to flip the sin to cos in the time embedding. freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding. use_memory_efficient_attention (`bool`, *optional*, defaults to `False`): - enable memory efficient attention https://arxiv.org/abs/2112.05682 - + Enable memory efficient attention as described [here](https://arxiv.org/abs/2112.05682). """ sample_size: int = 32 diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py index aa6aa542b158..36dcaf21f827 100644 --- a/src/diffusers/models/unet_3d_condition.py +++ b/src/diffusers/models/unet_3d_condition.py @@ -43,9 +43,11 @@ @dataclass class UNet3DConditionOutput(BaseOutput): """ + The output of [`UNet3DConditionModel`]. + Args: sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`): - Hidden states conditioned on `encoder_hidden_states` input. Output of last layer of model. + The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ sample: torch.FloatTensor @@ -53,11 +55,11 @@ class UNet3DConditionOutput(BaseOutput): class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): r""" - UNet3DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a timestep - and returns sample shaped output. + A conditional 3D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). Parameters: sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): @@ -66,7 +68,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): The tuple of downsample blocks to use. 
- up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`): + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): The tuple of upsample blocks to use. block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`): The tuple of output channels for each block. @@ -75,7 +77,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. - If `None`, it will skip the normalization and activation layers in post-processing + If `None`, normalization and activation layers is skipped in post-processing. norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. @@ -291,15 +293,15 @@ def set_attention_slice(self, slice_size): r""" Enable sliced attention computation. - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. Args: slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `num_attention_heads // slice_size`. In this case, - `num_attention_heads` must be a multiple of `slice_size`. + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. """ sliceable_head_dims = [] @@ -355,11 +357,15 @@ def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[i # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_attn_processor def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" + Sets the attention processor to use to compute attention. + Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: + for **all** `Attention` layers. 
+ + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. """ count = len(self.attn_processors.keys()) @@ -408,21 +414,24 @@ def forward( return_dict: bool = True, ) -> Union[UNet3DConditionOutput, Tuple]: r""" + The [`UNet3DConditionModel`] forward method. + Args: - sample (`torch.FloatTensor`): (batch, num_frames, channel, height, width) noisy inputs tensor - timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps - encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet3DConditionOutput`] instead of a plain tuple. + Whether or not to return a [`~models.unet_3d_condition.UNet3DConditionOutput`] instead of a plain + tuple. cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. Returns: - [`~models.unet_2d_condition.UNet3DConditionOutput`] or `tuple`: - [`~models.unet_2d_condition.UNet3DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. + [`~models.unet_3d_condition.UNet3DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unet_3d_condition.UNet3DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layears). diff --git a/src/diffusers/models/vae.py b/src/diffusers/models/vae.py index b54e3964f183..edd516dd380a 100644 --- a/src/diffusers/models/vae.py +++ b/src/diffusers/models/vae.py @@ -30,7 +30,7 @@ class DecoderOutput(BaseOutput): Args: sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Decoded output sample of the model. Output of the last layer of the model. + The decoded output sample from the last layer of the model. """ sample: torch.FloatTensor diff --git a/src/diffusers/models/vae_flax.py b/src/diffusers/models/vae_flax.py index 9812954db76d..b8f5b1d0e399 100644 --- a/src/diffusers/models/vae_flax.py +++ b/src/diffusers/models/vae_flax.py @@ -36,9 +36,9 @@ class FlaxDecoderOutput(BaseOutput): Args: sample (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`): - Decoded output sample of the model. Output of the last layer of the model. - dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): - Parameters `dtype` + The decoded output sample from the last layer of the model. + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + The `dtype` of the parameters.
""" sample: jnp.ndarray @@ -720,40 +720,43 @@ def mode(self): @flax_register_to_config class FlaxAutoencoderKL(nn.Module, FlaxModelMixin, ConfigMixin): r""" - Flax Implementation of Variational Autoencoder (VAE) model with KL loss from the paper Auto-Encoding Variational - Bayes by Diederik P. Kingma and Max Welling. + Flax implementation of a VAE model with KL loss for decoding latent representations. + + This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for it's generic methods + implemented for all models (such as downloading or saving). This model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module) - subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to + subclass. Use it as a regular Flax Linen module and refer to the Flax documentation for all matter related to its general usage and behavior. - Finally, this model supports inherent JAX features such as: + Inherent JAX features such as the following are supported: + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) Parameters: - in_channels (:obj:`int`, *optional*, defaults to 3): - Input channels - out_channels (:obj:`int`, *optional*, defaults to 3): - Output channels - down_block_types (:obj:`Tuple[str]`, *optional*, defaults to `(DownEncoderBlock2D)`): - DownEncoder block type - up_block_types (:obj:`Tuple[str]`, *optional*, defaults to `(UpDecoderBlock2D)`): - UpDecoder block type - block_out_channels (:obj:`Tuple[str]`, *optional*, defaults to `(64,)`): - Tuple containing the number of output channels for each block - layers_per_block (:obj:`int`, *optional*, defaults to `2`): - Number of Resnet layer for each block - act_fn (:obj:`str`, *optional*, defaults to `silu`): - Activation function - latent_channels (:obj:`int`, *optional*, defaults to `4`): - Latent space channels - norm_num_groups (:obj:`int`, *optional*, defaults to `32`): - Norm num group - sample_size (:obj:`int`, *optional*, defaults to 32): - Sample input size + in_channels (`int`, *optional*, defaults to 3): + Number of channels in the input image. + out_channels (`int`, *optional*, defaults to 3): + Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to `(DownEncoderBlock2D)`): + Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to `(UpDecoderBlock2D)`): + Tuple of upsample block types. + block_out_channels (`Tuple[str]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. + layers_per_block (`int`, *optional*, defaults to `2`): + Number of ResNet layer for each block. + act_fn (`str`, *optional*, defaults to `silu`): + The activation function to use. + latent_channels (`int`, *optional*, defaults to `4`): + Number of channels in the latent space. + norm_num_groups (`int`, *optional*, defaults to `32`): + The number of groups for normalization. + sample_size (`int`, *optional*, defaults to 32): + Sample input size. scaling_factor (`float`, *optional*, defaults to 0.18215): The component-wise standard deviation of the trained latent space computed using the first batch of the training set. 
This is used to scale the latent space to have unit variance when training the diffusion @@ -761,8 +764,8 @@ class FlaxAutoencoderKL(nn.Module, FlaxModelMixin, ConfigMixin): diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. - dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32): - parameters `dtype` + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + The `dtype` of the parameters. """ in_channels: int = 3 out_channels: int = 3 diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 73158294ee6e..32f944dacac9 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -30,31 +30,31 @@ class VQEncoderOutput(BaseOutput): Args: latents (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Encoded output sample of the model. Output of the last layer of the model. + The encoded output sample from the last layer of the model. """ latents: torch.FloatTensor class VQModel(ModelMixin, ConfigMixin): - r"""VQ-VAE model from the paper Neural Discrete Representation Learning by Aaron van den Oord, Oriol Vinyals and Koray - Kavukcuoglu. + r""" + A VQ-VAE model for decoding latent representations. - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the model (such as downloading or saving, etc.) + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). Parameters: in_channels (int, *optional*, defaults to 3): Number of channels in the input image. out_channels (int, *optional*, defaults to 3): Number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to : - obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to : - obj:`(64,)`): Tuple of block output channels. + down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. latent_channels (`int`, *optional*, defaults to `3`): Number of channels in the latent space. - sample_size (`int`, *optional*, defaults to `32`): TODO + sample_size (`int`, *optional*, defaults to `32`): Sample input size. num_vq_embeddings (`int`, *optional*, defaults to `256`): Number of codebook vectors in the VQ-VAE. vq_embed_dim (`int`, *optional*): Hidden dim of codebook vectors in the VQ-VAE. scaling_factor (`float`, *optional*, defaults to `0.18215`): @@ -143,10 +143,17 @@ def decode( def forward(self, sample: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: r""" + The [`VQModel`] forward method. + Args: sample (`torch.FloatTensor`): Input sample. 
return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + Whether or not to return a [`models.vq_model.VQEncoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vq_model.VQEncoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vq_model.VQEncoderOutput`] is returned, otherwise a plain `tuple` + is returned. """ x = sample h = self.encode(x).latents diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 0dd2351e6076..dd5b7f77c1ce 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -153,17 +153,17 @@ def get_up_block( # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel with UNet2DConditionModel->UNetFlatConditionModel, nn.Conv2d->LinearMultiDim, Block2D->BlockFlat class UNetFlatConditionModel(ModelMixin, ConfigMixin): r""" - UNetFlatConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a - timestep and returns sample shaped output. + A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample + shaped output. - This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library - implements for all the models (such as downloading or saving, etc.) + This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented + for all models (such as downloading or saving). Parameters: sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`): Height and width of input/output sample. - in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample. - out_channels (`int`, *optional*, defaults to 4): The number of channels in the output. + in_channels (`int`, *optional*, defaults to 4): Number of channels in the input sample. + out_channels (`int`, *optional*, defaults to 4): Number of channels in the output. center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample. flip_sin_to_cos (`bool`, *optional*, defaults to `False`): Whether to flip the sin to cos in the time embedding. @@ -171,9 +171,9 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "CrossAttnDownBlockFlat", "DownBlockFlat")`): The tuple of downsample blocks to use. mid_block_type (`str`, *optional*, defaults to `"UNetMidBlockFlatCrossAttn"`): - The mid block type. Choose from `UNetMidBlockFlatCrossAttn` or `UNetMidBlockFlatSimpleCrossAttn`, will skip - the mid block layer if `None`. - up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat",)`): + Block type for middle of UNet, it can be either `UNetMidBlockFlatCrossAttn` or + `UNetMidBlockFlatSimpleCrossAttn`. If `None`, the mid block layer is skipped. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat", "CrossAttnUpBlockFlat")`): The tuple of upsample blocks to use. 
only_cross_attention(`bool` or `Tuple[bool]`, *optional*, default to `False`): Whether to include self-attention in the basic transformer blocks, see @@ -185,52 +185,52 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block. act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization. - If `None`, it will skip the normalization and activation layers in post-processing + If `None`, normalization and activation layers is skipped in post-processing. norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): The dimension of the cross attention features. - encoder_hid_dim (`int`, *optional*, defaults to None): + encoder_hid_dim (`int`, *optional*, defaults to `None`): If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` dimension to `cross_attention_dim`. - encoder_hid_dim_type (`str`, *optional*, defaults to None): - If given, the `encoder_hidden_states` and potentially other embeddings will be down-projected to text + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads. num_attention_heads (`int`, *optional*): The number of attention heads. If not defined, defaults to `attention_head_dim` resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config - for resnet blocks, see [`~models.resnet.ResnetBlockFlat`]. Choose from `default` or `scale_shift`. - class_embed_type (`str`, *optional*, defaults to None): + for ResNet blocks (see [`~models.resnet.ResnetBlockFlat`]). Choose from `default` or `scale_shift`. + class_embed_type (`str`, *optional*, defaults to `None`): The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`, `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. - addition_embed_type (`str`, *optional*, defaults to None): + addition_embed_type (`str`, *optional*, defaults to `None`): Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or "text". "text" will use the `TextTimeEmbedding` layer. - num_class_embeds (`int`, *optional*, defaults to None): + num_class_embeds (`int`, *optional*, defaults to `None`): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. - time_embedding_type (`str`, *optional*, default to `positional`): + time_embedding_type (`str`, *optional*, defaults to `positional`): The type of position embedding to use for timesteps. Choose from `positional` or `fourier`. - time_embedding_dim (`int`, *optional*, default to `None`): + time_embedding_dim (`int`, *optional*, defaults to `None`): An optional override for the dimension of the projected time embedding. - time_embedding_act_fn (`str`, *optional*, default to `None`): - Optional activation function to use on the time embeddings only one time before they as passed to the rest - of the unet. 
Choose from `silu`, `mish`, `gelu`, and `swish`. - timestep_post_act (`str, *optional*, default to `None`): + time_embedding_act_fn (`str`, *optional*, defaults to `None`): + Optional activation function to use only once on the time embeddings before they are passed to the rest of + the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`. + timestep_post_act (`str`, *optional*, defaults to `None`): The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`. - time_cond_proj_dim (`int`, *optional*, default to `None`): - The dimension of `cond_proj` layer in timestep embedding. + time_cond_proj_dim (`int`, *optional*, defaults to `None`): + The dimension of `cond_proj` layer in the timestep embedding. conv_in_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_in` layer. conv_out_kernel (`int`, *optional*, default to `3`): The kernel size of `conv_out` layer. projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when - using the "projection" `class_embed_type`. Required when using the "projection" `class_embed_type`. + `class_embed_type="projection"`. Required when `class_embed_type="projection"`. class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time embeddings with the class embeddings. mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`): Whether to use cross attention with the mid block when using the `UNetMidBlockFlatSimpleCrossAttn`. If - `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is None, the - `only_cross_attention` value will be used as the value for `mid_block_only_cross_attention`. Else, it will - default to `False`. + `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the + `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False` + otherwise. """ _supports_gradient_checkpointing = True @@ -656,11 +656,15 @@ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): r""" + Sets the attention processor to use to compute attention. + Parameters: - `processor (`dict` of `AttentionProcessor` or `AttentionProcessor`): + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): The instantiated processor class or a dictionary of processor classes that will be set as the processor - of **all** `Attention` layers. - In case `processor` is a dict, the key needs to define the path to the corresponding cross attention processor. This is strongly recommended when setting trainable attention processors.: + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. """ count = len(self.attn_processors.keys()) @@ -694,15 +698,15 @@ def set_attention_slice(self, slice_size): r""" Enable sliced attention computation. - When this option is enabled, the attention module will split the input tensor in slices, to compute attention - in several steps. This is useful to save some memory in exchange for a small speed decrease. + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. 
This is useful for saving some memory in exchange for a small decrease in speed. Args: slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): - When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If - `"max"`, maximum amount of memory will be saved by running only one slice at a time. If a number is - provided, uses as many slices as `num_attention_heads // slice_size`. In this case, - `num_attention_heads` must be a multiple of `slice_size`. + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. """ sliceable_head_dims = [] @@ -775,29 +779,28 @@ def forward( return_dict: bool = True, ) -> Union[UNet2DConditionOutput, Tuple]: r""" + The [`UNetFlatConditionModel`] forward method. + Args: - sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor - timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps - encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. encoder_attention_mask (`torch.Tensor`): - (batch, sequence_length) cross-attention mask, applied to encoder_hidden_states. True = keep, False = - discard. Mask will be converted into a bias, which adds large negative values to attention scores - corresponding to "discard" tokens. + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple. + Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. cross_attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). - added_cond_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified includes additonal conditions that can be used for additonal time - embeddings or encoder hidden states projections. See the configurations `encoder_hid_dim_type` and - `addition_embed_type` for more information. + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. Returns: [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: - [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When - returning a tuple, the first element is the sample tensor. 
+ If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. """ # By default samples have to be AT least a multiple of the overall upsampling factor. # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). From 5439e917cacc885c0ac39dda1b8af12258e6e16d Mon Sep 17 00:00:00 2001 From: Aisuko Date: Sat, 1 Jul 2023 16:07:59 +1000 Subject: [PATCH 166/199] fix/docs: Fix the broken doc links (#3897) * fix/docs: Fix the broken doc links Signed-off-by: GitHub * Update docs/source/en/using-diffusers/write_own_pipeline.mdx Co-authored-by: Pedro Cuenca --------- Signed-off-by: GitHub Co-authored-by: Pedro Cuenca --- docs/source/en/using-diffusers/write_own_pipeline.mdx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/en/using-diffusers/write_own_pipeline.mdx b/docs/source/en/using-diffusers/write_own_pipeline.mdx index c7e257f4fa36..ca5ea38b4ad2 100644 --- a/docs/source/en/using-diffusers/write_own_pipeline.mdx +++ b/docs/source/en/using-diffusers/write_own_pipeline.mdx @@ -286,5 +286,5 @@ This is really what 🧨 Diffusers is designed for: to make it intuitive and eas For your next steps, feel free to: -* Learn how to [build and contribute a pipeline](using-diffusers/#contribute_pipeline) to 🧨 Diffusers. We can't wait and see what you'll come up with! -* Explore [existing pipelines](./api/pipelines/overview) in the library, and see if you can deconstruct and build a pipeline from scratch using the models and schedulers separately. +* Learn how to [build and contribute a pipeline](contribute_pipeline) to 🧨 Diffusers. We can't wait to see what you'll come up with! +* Explore [existing pipelines](../api/pipelines/overview) in the library, and see if you can deconstruct and build a pipeline from scratch using the models and schedulers separately.
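For reference, the sliced-attention behaviour documented in the hunks above can be exercised directly on a model. The following is a minimal sketch, not part of any patch in this series; the checkpoint name and the `"auto"` slice size are only examples:

```py
import torch

from diffusers import UNet2DConditionModel

# Example checkpoint only; any UNet2DConditionModel checkpoint works the same way.
unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet", torch_dtype=torch.float16
)

# "auto" halves the input to the attention heads so attention is computed in two steps,
# "max" runs a single slice at a time for the largest memory saving, and an integer
# must evenly divide the model's sliceable head dimensions.
unet.set_attention_slice("auto")
```

Many pipelines expose the same switch through an `enable_attention_slicing` convenience method, so calling the model method directly is mainly useful when working with a bare UNet.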
From 62825064bf512dedb96db56fa0f077c18a168de8 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Sun, 2 Jul 2023 13:19:27 +0200 Subject: [PATCH 167/199] Add video img2img (#3900) * Add image to image video * Improve * better naming * make fix copies * add docs * finish tests * trigger tests * make style * correct * finish * Fix more * make style * finish --- .../source/en/api/pipelines/text_to_video.mdx | 63 ++ src/diffusers/__init__.py | 1 + src/diffusers/models/autoencoder_kl.py | 7 +- src/diffusers/pipelines/__init__.py | 2 +- .../text_to_video_synthesis/__init__.py | 3 +- .../pipeline_text_to_video_synth.py | 3 + .../pipeline_text_to_video_synth_img2img.py | 770 ++++++++++++++++++ .../dummy_torch_and_transformers_objects.py | 15 + tests/pipelines/test_pipelines_common.py | 4 +- .../text_to_video/test_video_to_video.py | 195 +++++ 10 files changed, 1058 insertions(+), 5 deletions(-) create mode 100644 src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py create mode 100644 tests/pipelines/text_to_video/test_video_to_video.py diff --git a/docs/source/en/api/pipelines/text_to_video.mdx b/docs/source/en/api/pipelines/text_to_video.mdx index 82b2f19ce1b2..75868d7dd6ea 100644 --- a/docs/source/en/api/pipelines/text_to_video.mdx +++ b/docs/source/en/api/pipelines/text_to_video.mdx @@ -37,9 +37,12 @@ Resources: | Pipeline | Tasks | Demo |---|---|:---:| | [TextToVideoSDPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py) | *Text-to-Video Generation* | [🤗 Spaces](https://huggingface.co/spaces/damo-vilab/modelscope-text-to-video-synthesis) +| [VideoToVideoSDPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py) | *Text-Guided Video-to-Video Generation* | [(TODO)🤗 Spaces]() ## Usage example +### `text-to-video-ms-1.7b` + Let's start by generating a short video with the default length of 16 frames (2s at 8 fps): ```python @@ -119,12 +122,72 @@ Here are some sample outputs: +### `cerspense/zeroscope_v2_576w` & `cerspense/zeroscope_v2_XL` + +Zeroscope models are watermark-free and have been trained on specific sizes such as `576x320` and `1024x576`. +One should first generate a video using the lower resolution checkpoint [`cerspense/zeroscope_v2_576w`](https://huggingface.co/cerspense/zeroscope_v2_576w) with [`TextToVideoSDPipeline`], +which can then be upscaled using [`VideoToVideoSDPipeline`] and [`cerspense/zeroscope_v2_XL`](https://huggingface.co/cerspense/zeroscope_v2_XL).
+ + +```py +import torch +from diffusers import DiffusionPipeline +from diffusers.utils import export_to_video + +pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16) +pipe.enable_model_cpu_offload() + +# memory optimization +pipe.enable_vae_slicing() + +prompt = "Darth Vader surfing a wave" +video_frames = pipe(prompt, num_frames=24).frames +video_path = export_to_video(video_frames) +video_path +``` + +Now the video can be upscaled: + +```py +from PIL import Image +from diffusers import DPMSolverMultistepScheduler + +pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16) +pipe.vae.enable_slicing() +pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) +pipe.enable_model_cpu_offload() + +video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames] + +video_frames = pipe(prompt, video=video, strength=0.6).frames +video_path = export_to_video(video_frames) +video_path +``` + +Here are some sample outputs:
+ Darth vader surfing in waves.
+ ## Available checkpoints * [damo-vilab/text-to-video-ms-1.7b](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/) * [damo-vilab/text-to-video-ms-1.7b-legacy](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b-legacy) +* [cerspense/zeroscope_v2_576w](https://huggingface.co/cerspense/zeroscope_v2_576w) +* [cerspense/zeroscope_v2_XL](https://huggingface.co/cerspense/zeroscope_v2_XL) ## TextToVideoSDPipeline [[autodoc]] TextToVideoSDPipeline - all - __call__ + +## VideoToVideoSDPipeline +[[autodoc]] VideoToVideoSDPipeline + - all + - __call__ diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 02907075345e..764f9204dffb 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -173,6 +173,7 @@ VersatileDiffusionImageVariationPipeline, VersatileDiffusionPipeline, VersatileDiffusionTextToImagePipeline, + VideoToVideoSDPipeline, VQDiffusionPipeline, ) diff --git a/src/diffusers/models/autoencoder_kl.py b/src/diffusers/models/autoencoder_kl.py index d61281a53e7c..ddb9bde0ee0a 100644 --- a/src/diffusers/models/autoencoder_kl.py +++ b/src/diffusers/models/autoencoder_kl.py @@ -229,7 +229,12 @@ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderK if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size): return self.tiled_encode(x, return_dict=return_dict) - h = self.encoder(x) + if self.use_slicing and x.shape[0] > 1: + encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)] + h = torch.cat(encoded_slices) + else: + h = self.encoder(x) + moments = self.quant_conv(h) posterior = DiagonalGaussianDistribution(moments) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b1650240848a..ca57756c6aa4 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -89,7 +89,7 @@ StableUnCLIPPipeline, ) from .stable_diffusion_safe import StableDiffusionPipelineSafe - from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline + from .text_to_video_synthesis import TextToVideoSDPipeline, TextToVideoZeroPipeline, VideoToVideoSDPipeline from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline from .unidiffuser import ImageTextPipelineOutput, UniDiffuserModel, UniDiffuserPipeline, UniDiffuserTextDecoder from .versatile_diffusion import ( diff --git a/src/diffusers/pipelines/text_to_video_synthesis/__init__.py b/src/diffusers/pipelines/text_to_video_synthesis/__init__.py index 165a1a0f0d98..d70c1c2ea2a8 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/__init__.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/__init__.py @@ -28,5 +28,6 @@ class TextToVideoSDPipelineOutput(BaseOutput): except OptionalDependencyNotAvailable: from ...utils.dummy_torch_and_transformers_objects import * # noqa F403 else: - from .pipeline_text_to_video_synth import TextToVideoSDPipeline # noqa: F401 + from .pipeline_text_to_video_synth import TextToVideoSDPipeline + from .pipeline_text_to_video_synth_img2img import VideoToVideoSDPipeline # noqa: F401 from .pipeline_text_to_video_zero import TextToVideoZeroPipeline diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 8bf4bafa4fe5..e30f183808a5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ 
b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -672,6 +672,9 @@ def __call__( if callback is not None and i % callback_steps == 0: callback(i, t, latents) + if output_type == "latent": + return TextToVideoSDPipelineOutput(frames=latents) + video_tensor = self.decode_latents(latents) if output_type == "pt": diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py new file mode 100644 index 000000000000..ce5109a58213 --- /dev/null +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -0,0 +1,770 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +from transformers import CLIPTextModel, CLIPTokenizer + +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet3DConditionModel +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import TextToVideoSDPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler + >>> from diffusers.utils import export_to_video + + >>> pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16) + >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + >>> pipe.to("cuda") + + >>> prompt = "spiderman running in the desert" + >>> video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames + >>> # safe low-res video + >>> video_path = export_to_video(video_frames, output_video_path="./video_576_spiderman.mp4") + + >>> # let's offload the text-to-image model + >>> pipe.to("cpu") + + >>> # and load the image-to-image model + >>> pipe = DiffusionPipeline.from_pretrained( + ... "cerspense/zeroscope_v2_XL", torch_dtype=torch.float16, revision="refs/pr/15" + ... 
) + >>> pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) + >>> pipe.enable_model_cpu_offload() + + >>> # The VAE consumes A LOT of memory, let's make sure we run it in sliced mode + >>> pipe.vae.enable_slicing() + + >>> # now let's upscale it + >>> video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames] + + >>> # and denoise it + >>> video_frames = pipe(prompt, video=video, strength=0.6).frames + >>> video_path = export_to_video(video_frames, output_video_path="./video_1024_spiderman.mp4") + >>> video_path + ``` +""" + + +def tensor2vid(video: torch.Tensor, mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) -> List[np.ndarray]: + # This code is copied from https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78 + # reshape to ncfhw + mean = torch.tensor(mean, device=video.device).reshape(1, -1, 1, 1, 1) + std = torch.tensor(std, device=video.device).reshape(1, -1, 1, 1, 1) + # unnormalize back to [0,1] + video = video.mul_(std).add_(mean) + video.clamp_(0, 1) + # prepare the final outputs + i, c, f, h, w = video.shape + images = video.permute(2, 3, 0, 4, 1).reshape( + f, h, i * w, c + ) # 1st (frames, h, batch_size, w, c) 2nd (frames, h, batch_size * w, c) + images = images.unbind(dim=0) # prepare a list of indvidual (consecutive frames) + images = [(image.cpu().numpy() * 255).astype("uint8") for image in images] # f h w c + return images + + +def preprocess_video(video): + supported_formats = (np.ndarray, torch.Tensor, PIL.Image.Image) + + if isinstance(video, supported_formats): + video = [video] + elif not (isinstance(video, list) and all(isinstance(i, supported_formats) for i in video)): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in video]}. Currently, we only support {', '.join(supported_formats)}" + ) + + if isinstance(video[0], PIL.Image.Image): + video = [np.array(frame) for frame in video] + + if isinstance(video[0], np.ndarray): + video = np.concatenate(video, axis=0) if video[0].ndim == 5 else np.stack(video, axis=0) + + if video.dtype == np.uint8: + video = np.array(video).astype(np.float32) / 255.0 + + if video.ndim == 4: + video = video[None, ...] + + video = torch.from_numpy(video.transpose(0, 4, 1, 2, 3)) + + elif isinstance(video[0], torch.Tensor): + video = torch.cat(video, axis=0) if video[0].ndim == 5 else torch.stack(video, axis=0) + + # don't need any preprocess if the video is latents + channel = video.shape[1] + if channel == 4: + return video + + # move channels before num_frames + video = video.permute(0, 2, 1, 3, 4) + + # normalize video + video = 2.0 * video - 1.0 + + return video + + +class VideoToVideoSDPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): + r""" + Pipeline for text-to-video generation. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Same as Stable Diffusion 2. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
+ unet ([`UNet3DConditionModel`]): Conditional U-Net architecture to denoise the encoded video latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet3DConditionModel, + scheduler: KarrasDiffusionSchedulers, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. + + When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in + several steps. This is useful to save a large amount of memory and to allow the processing of larger images. + """ + self.vae.enable_tiling() + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded + to GPU only when their specific submodule has its `forward` method called. Note that offloading happens on a + submodule basis. Memory savings are higher than with `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.vae, self.unet]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + ) + prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents + def decode_latents(self, latents): + latents = 1 / self.vae.config.scaling_factor * latents + + batch_size, channels, num_frames, height, width = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width) + + image = self.vae.decode(latents).sample + video = ( + image[None, :] + .reshape( + ( + batch_size, + num_frames, + -1, + ) + + image.shape[2:] + ) + .permute(0, 2, 1, 3, 4) + ) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + video = video.float() + return video + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.check_inputs
+    def check_inputs(
+        self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None
+    ):
+        if strength < 0 or strength > 1:
+            raise ValueError(f"The value of strength should be in [0.0, 1.0] but is {strength}")
+
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
+                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
+            )
+
+        if prompt_embeds is not None and negative_prompt_embeds is not None:
+            if prompt_embeds.shape != negative_prompt_embeds.shape:
+                raise ValueError(
+                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
+                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
+                    f" {negative_prompt_embeds.shape}."
+                )
+
+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, video, timestep, batch_size, dtype, device, generator=None):
+        video = video.to(device=device, dtype=dtype)
+
+        # change from (b, c, f, h, w) -> (b * f, c, w, h)
+        bsz, channel, frames, width, height = video.shape
+        video = video.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height)
+
+        if video.shape[1] == 4:
+            init_latents = video
+        else:
+            if isinstance(generator, list) and len(generator) != batch_size:
+                raise ValueError(
+                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+                )
+
+            elif isinstance(generator, list):
+                init_latents = [
+                    self.vae.encode(video[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size)
+                ]
+                init_latents = torch.cat(init_latents, dim=0)
+            else:
+                init_latents = self.vae.encode(video).latent_dist.sample(generator)
+
+            init_latents = self.vae.config.scaling_factor * init_latents
+
+            if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
+                raise ValueError(
+                    f"Cannot duplicate `video` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
+                )
+            else:
+                init_latents = torch.cat([init_latents], dim=0)
+
+            shape = init_latents.shape
+            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+
+            # get latents
+            init_latents = self.scheduler.add_noise(init_latents, noise, timestep)
+            latents = init_latents
+
+        latents = latents[None, :].reshape((bsz, frames, latents.shape[1]) + latents.shape[2:]).permute(0, 2, 1, 3, 4)
+
+        return latents
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        video: Union[List[np.ndarray], torch.FloatTensor] = None,
+        strength: float = 0.6,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 15.0,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        prompt_embeds: Optional[torch.FloatTensor] = None,
+        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: int = 1,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`
+                instead.
+            video: (`List[np.ndarray]` or `torch.FloatTensor`):
+                `video` frames or tensor representing a video batch, that will be used as the starting point for the
+                process. Can also accept video latents as `video`; if latents are passed directly, they will not be
+                encoded again.
+            strength (`float`, *optional*, defaults to 0.6):
+                Conceptually, indicates how much to transform the reference `video`. Must be between 0 and 1. `video`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `video`.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to higher quality videos at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 15.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages the model to generate videos that are closely linked to the text `prompt`,
+                usually at the expense of lower video quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the video generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`. Latents should be of shape
+                `(batch_size, num_channel, num_frames, height, width)`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.TextToVideoSDPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py).
+
+        Examples:
+
+        Returns:
+            [`~pipelines.stable_diffusion.TextToVideoSDPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.TextToVideoSDPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is a list with the generated frames.
+        """
+        # 0. Default height and width to unet
+        num_images_per_prompt = 1
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds)
+
+        # 2.
Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds = self._encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + + # 4. Preprocess video + video = preprocess_video(video) + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 5. Prepare latent variables + latents = self.prepare_latents(video, latent_timestep, batch_size, prompt_embeds.dtype, device, generator) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + ).sample + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # reshape latents + bsz, channel, frames, width, height = latents.shape + latents = latents.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample + + # reshape latents back + latents = latents[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + if output_type == "latent": + return TextToVideoSDPipelineOutput(frames=latents) + + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + + video_tensor = self.decode_latents(latents) + + if output_type == "pt": + video = video_tensor + else: + video = tensor2vid(video_tensor) + + # Offload last model to CPU + 
if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (video,) + + return TextToVideoSDPipelineOutput(frames=video) diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 3f0b17d879e5..0dbc8f1f6f99 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -782,6 +782,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class VideoToVideoSDPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class VQDiffusionPipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 008a8a2e6367..9fb3e167facc 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -652,11 +652,11 @@ def _test_xformers_attention_forwardGenerator_pass( pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(torch_device) - output_without_offload = pipe(**inputs)[0] + output_without_offload = pipe(**inputs)[0].cpu() pipe.enable_xformers_memory_efficient_attention() inputs = self.get_dummy_inputs(torch_device) - output_with_offload = pipe(**inputs)[0] + output_with_offload = pipe(**inputs)[0].cpu() if test_max_difference: max_diff = np.abs(output_with_offload - output_without_offload).max() diff --git a/tests/pipelines/text_to_video/test_video_to_video.py b/tests/pipelines/text_to_video/test_video_to_video.py new file mode 100644 index 000000000000..41e213c43dea --- /dev/null +++ b/tests/pipelines/text_to_video/test_video_to_video.py @@ -0,0 +1,195 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import random +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DDIMScheduler, + UNet3DConditionModel, + VideoToVideoSDPipeline, +) +from diffusers.utils import floats_tensor, is_xformers_available, skip_mps +from diffusers.utils.testing_utils import enable_full_determinism, slow, torch_device + +from ..pipeline_params import ( + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) +from ..test_pipelines_common import PipelineTesterMixin + + +enable_full_determinism() + + +@skip_mps +class VideoToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = VideoToVideoSDPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS.union({"video"}) - {"image", "width", "height"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS.union({"video"}) - {"image"} + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + test_attention_slicing = False + + # No `output_type`. + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "return_dict", + "callback", + "callback_steps", + ] + ) + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet3DConditionModel( + block_out_channels=(32, 64, 64, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D"), + up_block_types=("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"), + cross_attention_dim=32, + attention_head_dim=4, + ) + scheduler = DDIMScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + clip_sample=False, + set_alpha_to_one=False, + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + sample_size=128, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=512, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + } + return components + + def get_dummy_inputs(self, device, seed=0): + # 3 frames + video = floats_tensor((1, 3, 3, 32, 32), rng=random.Random(seed)).to(device) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "video": video, + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "pt", + } + return inputs + + def test_text_to_video_default_case(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = VideoToVideoSDPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = 
self.get_dummy_inputs(device) + inputs["output_type"] = "np" + frames = sd_pipe(**inputs).frames + image_slice = frames[0][-3:, -3:, -1] + + assert frames[0].shape == (32, 32, 3) + expected_slice = np.array([106, 117, 113, 174, 137, 112, 148, 151, 131]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @unittest.skipIf( + torch_device != "cuda" or not is_xformers_available(), + reason="XFormers attention is only available with CUDA and `xformers` installed", + ) + def test_xformers_attention_forwardGenerator_pass(self): + self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=5e-3) + + # (todo): sayakpaul + @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") + def test_inference_batch_consistent(self): + pass + + # (todo): sayakpaul + @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.") + def test_inference_batch_single_identical(self): + pass + + @unittest.skip(reason="`num_images_per_prompt` argument is not supported for this pipeline.") + def test_num_images_per_prompt(self): + pass + + def test_progress_bar(self): + return super().test_progress_bar() + + +@slow +@skip_mps +class VideoToVideoSDPipelineSlowTests(unittest.TestCase): + def test_two_step_model(self): + pipe = VideoToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16) + pipe.enable_model_cpu_offload() + + # 10 frames + generator = torch.Generator(device="cpu").manual_seed(0) + video = torch.randn((1, 10, 3, 1024, 576), generator=generator) + video = video.to("cuda") + + prompt = "Spiderman is surfing" + + video_frames = pipe(prompt, video=video, generator=generator, num_inference_steps=3, output_type="pt").frames + + expected_array = np.array([-1.0458984, -1.1279297, -0.9663086, -0.91503906, -0.75097656]) + assert np.abs(video_frames.cpu().numpy()[0, 0, 0, 0, -5:] - expected_array).sum() < 1e-2 From f911287cc94218b3d65d97c233ed34d5b729c8c5 Mon Sep 17 00:00:00 2001 From: Aisuko Date: Mon, 3 Jul 2023 20:28:05 +1000 Subject: [PATCH 168/199] fix/doc-code: Updating to the latest version parameters (#3924) fix/doc-code: update to use the new parameter Signed-off-by: GitHub --- docs/source/en/tutorials/basic_training.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/tutorials/basic_training.mdx b/docs/source/en/tutorials/basic_training.mdx index c8f5c7fac780..2cf9128f3dd4 100644 --- a/docs/source/en/tutorials/basic_training.mdx +++ b/docs/source/en/tutorials/basic_training.mdx @@ -313,7 +313,7 @@ Now you can wrap all these components together in a training loop with 🤗 Acce ... mixed_precision=config.mixed_precision, ... gradient_accumulation_steps=config.gradient_accumulation_steps, ... log_with="tensorboard", -... logging_dir=os.path.join(config.output_dir, "logs"), +... project_dir=os.path.join(config.output_dir, "logs"), ... ) ... if accelerator.is_main_process: ... 
if config.push_to_hub: From b298484fd0303ba48f1fdceff3100d7068f62b79 Mon Sep 17 00:00:00 2001 From: Aisuko Date: Mon, 3 Jul 2023 20:28:42 +1000 Subject: [PATCH 169/199] fix/doc: no import torch issue (#3923) Ffix/doc: no import torch issue Signed-off-by: GitHub --- docs/source/en/stable_diffusion.mdx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/en/stable_diffusion.mdx b/docs/source/en/stable_diffusion.mdx index 78fa848421d8..7684052c313a 100644 --- a/docs/source/en/stable_diffusion.mdx +++ b/docs/source/en/stable_diffusion.mdx @@ -52,6 +52,8 @@ pipeline = pipeline.to("cuda") To make sure you can use the same image and improve on it, use a [`Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed for [reproducibility](./using-diffusers/reproducibility): ```python +import torch + generator = torch.Generator("cuda").manual_seed(0) ``` From 2e8668f0af98032fe402068c641783469e78b2f1 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 3 Jul 2023 15:10:07 +0200 Subject: [PATCH 170/199] Correct controlnet out of list error (#3928) * Correct controlnet out of list error * Apply suggestions from code review * correct tests * correct tests * fix * test all * Apply suggestions from code review * test all * test all * Apply suggestions from code review * Apply suggestions from code review * fix more tests * Fix more * Apply suggestions from code review * finish * Apply suggestions from code review * Update src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py * finish --- .../controlnet/pipeline_controlnet.py | 4 +- .../controlnet/pipeline_controlnet_img2img.py | 4 +- .../controlnet/pipeline_controlnet_inpaint.py | 4 +- ...eline_stable_diffusion_instruct_pix2pix.py | 2 +- .../schedulers/scheduling_deis_multistep.py | 15 +++++- .../scheduling_dpmsolver_multistep.py | 7 +-- .../scheduling_dpmsolver_singlestep.py | 7 +-- .../schedulers/scheduling_unipc_multistep.py | 14 ++++++ .../altdiffusion/test_alt_diffusion.py | 6 ++- tests/pipelines/controlnet/test_controlnet.py | 14 ++++-- .../controlnet/test_controlnet_img2img.py | 14 ++++-- .../controlnet/test_controlnet_inpaint.py | 14 ++++-- .../stable_diffusion/test_stable_diffusion.py | 6 ++- .../test_stable_diffusion_image_variation.py | 4 +- .../test_stable_diffusion_img2img.py | 6 ++- .../test_stable_diffusion_inpaint.py | 6 ++- ...st_stable_diffusion_instruction_pix2pix.py | 4 +- .../test_stable_diffusion_model_editing.py | 6 ++- .../test_stable_diffusion_pix2pix_zero.py | 6 ++- .../test_stable_diffusion.py | 6 ++- ...test_stable_diffusion_attend_and_excite.py | 4 +- .../test_stable_diffusion_depth.py | 6 ++- .../test_stable_diffusion_inpaint.py | 6 ++- .../test_stable_diffusion_latent_upscale.py | 49 ++++++++++++++++++- .../stable_unclip/test_stable_unclip.py | 11 ++++- .../test_stable_unclip_img2img.py | 5 +- tests/pipelines/test_pipelines_common.py | 46 +++++++++++++++++ 27 files changed, 225 insertions(+), 51 deletions(-) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py index dddfc3591b66..c266e8b20e74 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -947,9 +947,9 @@ def __call__( # 7.1 Create tensor stating which controlnets to keep controlnet_keep = [] - for i in range(num_inference_steps): + for i in range(len(timesteps)): keeps = [ - 1.0 - float(i / num_inference_steps < s or (i + 1) / num_inference_steps > 
e) + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) for s, e in zip(control_guidance_start, control_guidance_end) ] controlnet_keep.append(keeps[0] if len(keeps) == 1 else keeps) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py index c7a0db96e8c0..fd013c4974f1 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_img2img.py @@ -1040,9 +1040,9 @@ def __call__( # 7.1 Create tensor stating which controlnets to keep controlnet_keep = [] - for i in range(num_inference_steps): + for i in range(len(timesteps)): keeps = [ - 1.0 - float(i / num_inference_steps < s or (i + 1) / num_inference_steps > e) + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) for s, e in zip(control_guidance_start, control_guidance_end) ] controlnet_keep.append(keeps[0] if len(keeps) == 1 else keeps) diff --git a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py index bfaaaae49401..7de3f1dd9d88 100644 --- a/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py +++ b/src/diffusers/pipelines/controlnet/pipeline_controlnet_inpaint.py @@ -1275,9 +1275,9 @@ def __call__( # 7.1 Create tensor stating which controlnets to keep controlnet_keep = [] - for i in range(num_inference_steps): + for i in range(len(timesteps)): keeps = [ - 1.0 - float(i / num_inference_steps < s or (i + 1) / num_inference_steps > e) + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) for s, e in zip(control_guidance_start, control_guidance_end) ] controlnet_keep.append(keeps[0] if len(keeps) == 1 else keeps) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index 25102ae7cf4a..367e401d57f8 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -374,7 +374,7 @@ def __call__( # predicted_original_sample instead of the noise_pred. So we need to compute the # predicted_original_sample here if we are using a karras style scheduler. if scheduler_is_in_sigma_space: - step_index = (self.scheduler.timesteps == t).nonzero().item() + step_index = (self.scheduler.timesteps == t).nonzero()[0].item() sigma = self.scheduler.sigmas[step_index] noise_pred = latent_model_input - sigma * noise_pred diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index 8ea001a882d0..56c362018c18 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -103,7 +103,10 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): lower_order_final (`bool`, default `True`): whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. We empirically find this trick can stabilize the sampling of DEIS for steps < 15, especially for steps <= 10. - + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the + noise schedule during the sampling process. 
If True, the sigmas will be determined according to a sequence + of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -125,6 +128,7 @@ def __init__( algorithm_type: str = "deis", solver_type: str = "logrho", lower_order_final: bool = True, + use_karras_sigmas: Optional[bool] = False, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -188,6 +192,15 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic .astype(np.int64) ) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + if self.config.use_karras_sigmas: + log_sigmas = np.log(sigmas) + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + timesteps = np.flip(timesteps).copy().astype(np.int64) + + self.sigmas = torch.from_numpy(sigmas) + # when num_inference_steps == num_train_timesteps, we can end up with # duplicates in timesteps. _, unique_indices = np.unique(timesteps, return_index=True) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index e72b1bdc23b5..d7c29d5488a5 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -203,7 +203,6 @@ def __init__( self.timesteps = torch.from_numpy(timesteps) self.model_outputs = [None] * solver_order self.lower_order_nums = 0 - self.use_karras_sigmas = use_karras_sigmas def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None): """ @@ -225,13 +224,15 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc .astype(np.int64) ) - if self.use_karras_sigmas: - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + if self.config.use_karras_sigmas: log_sigmas = np.log(sigmas) sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() timesteps = np.flip(timesteps).copy().astype(np.int64) + self.sigmas = torch.from_numpy(sigmas) + # when num_inference_steps == num_train_timesteps, we can end up with # duplicates in timesteps. 
_, unique_indices = np.unique(timesteps, return_index=True) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 7fa8eabb5a15..721dd5e5bb85 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -202,7 +202,6 @@ def __init__( self.model_outputs = [None] * solver_order self.sample = None self.order_list = self.get_order_list(num_train_timesteps) - self.use_karras_sigmas = use_karras_sigmas def get_order_list(self, num_inference_steps: int) -> List[int]: """ @@ -259,13 +258,15 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic .astype(np.int64) ) - if self.use_karras_sigmas: - sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + if self.config.use_karras_sigmas: log_sigmas = np.log(sigmas) sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() timesteps = np.flip(timesteps).copy().astype(np.int64) + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps).to(device) self.model_outputs = [None] * self.config.solver_order self.sample = None diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 2cce68f7d962..7233258a4766 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -117,6 +117,10 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): by disable the corrector at the first few steps (e.g., disable_corrector=[0]) solver_p (`SchedulerMixin`, default `None`): can be any other scheduler. If specified, the algorithm will become solver_p + UniC. + use_karras_sigmas (`bool`, *optional*, defaults to `False`): + This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the + noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence + of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -140,6 +144,7 @@ def __init__( lower_order_final: bool = True, disable_corrector: List[int] = [], solver_p: SchedulerMixin = None, + use_karras_sigmas: Optional[bool] = False, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -201,6 +206,15 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic .astype(np.int64) ) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) + if self.config.use_karras_sigmas: + log_sigmas = np.log(sigmas) + sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=num_inference_steps) + timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]).round() + timesteps = np.flip(timesteps).copy().astype(np.int64) + + self.sigmas = torch.from_numpy(sigmas) + # when num_inference_steps == num_train_timesteps, we can end up with # duplicates in timesteps. 
_, unique_indices = np.unique(timesteps, return_index=True) diff --git a/tests/pipelines/altdiffusion/test_alt_diffusion.py b/tests/pipelines/altdiffusion/test_alt_diffusion.py index 1344d33a2552..3f16964fd567 100644 --- a/tests/pipelines/altdiffusion/test_alt_diffusion.py +++ b/tests/pipelines/altdiffusion/test_alt_diffusion.py @@ -29,13 +29,15 @@ from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() -class AltDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class AltDiffusionPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = AltDiffusionPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS diff --git a/tests/pipelines/controlnet/test_controlnet.py b/tests/pipelines/controlnet/test_controlnet.py index 906e1e7ee66f..a548983c3841 100644 --- a/tests/pipelines/controlnet/test_controlnet.py +++ b/tests/pipelines/controlnet/test_controlnet.py @@ -46,7 +46,11 @@ TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS, ) -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import ( + PipelineKarrasSchedulerTesterMixin, + PipelineLatentTesterMixin, + PipelineTesterMixin, +) enable_full_determinism() @@ -97,7 +101,9 @@ def _test_stable_diffusion_compile(in_queue, out_queue, timeout): out_queue.join() -class ControlNetPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class ControlNetPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionControlNetPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS @@ -207,7 +213,9 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) -class StableDiffusionMultiControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): +class StableDiffusionMultiControlNetPipelineFastTests( + PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionControlNetPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS diff --git a/tests/pipelines/controlnet/test_controlnet_img2img.py b/tests/pipelines/controlnet/test_controlnet_img2img.py index bc0e96b2f92b..c46593f03e5e 100644 --- a/tests/pipelines/controlnet/test_controlnet_img2img.py +++ b/tests/pipelines/controlnet/test_controlnet_img2img.py @@ -42,13 +42,19 @@ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS, ) -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import ( + PipelineKarrasSchedulerTesterMixin, + PipelineLatentTesterMixin, + PipelineTesterMixin, +) enable_full_determinism() -class ControlNetImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class ControlNetImg2ImgPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): 
pipeline_class = StableDiffusionControlNetImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS @@ -161,7 +167,9 @@ def test_inference_batch_single_identical(self): self._test_inference_batch_single_identical(expected_max_diff=2e-3) -class StableDiffusionMultiControlNetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): +class StableDiffusionMultiControlNetPipelineFastTests( + PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionControlNetImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS diff --git a/tests/pipelines/controlnet/test_controlnet_inpaint.py b/tests/pipelines/controlnet/test_controlnet_inpaint.py index 81647d968b6b..cf423f4c49d0 100644 --- a/tests/pipelines/controlnet/test_controlnet_inpaint.py +++ b/tests/pipelines/controlnet/test_controlnet_inpaint.py @@ -42,13 +42,19 @@ TEXT_GUIDED_IMAGE_INPAINTING_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, ) -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import ( + PipelineKarrasSchedulerTesterMixin, + PipelineLatentTesterMixin, + PipelineTesterMixin, +) enable_full_determinism() -class ControlNetInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class ControlNetInpaintPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionControlNetInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS @@ -237,7 +243,9 @@ def get_dummy_components(self): return components -class MultiControlNetInpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): +class MultiControlNetInpaintPipelineFastTests( + PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionControlNetInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 93abe7ae58bc..7daf3fcda4a2 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -50,7 +50,7 @@ from ...models.test_lora_layers import create_unet_lora_layers from ...models.test_models_unet_2d_condition import create_lora_layers from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() @@ -88,7 +88,9 @@ def _test_stable_diffusion_compile(in_queue, out_queue, timeout): out_queue.join() -class StableDiffusionPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusionPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS diff --git 
a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py index e16478f06112..580c78675a92 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_image_variation.py @@ -33,14 +33,14 @@ from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import IMAGE_VARIATION_BATCH_PARAMS, IMAGE_VARIATION_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() class StableDiffusionImageVariationPipelineFastTests( - PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase ): pipeline_class = StableDiffusionImageVariationPipeline params = IMAGE_VARIATION_PARAMS diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py index eefbc83ce9d7..d1f7a49467e6 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py @@ -46,7 +46,7 @@ TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS, ) -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() @@ -84,7 +84,9 @@ def _test_img2img_compile(in_queue, out_queue, timeout): out_queue.join() -class StableDiffusionImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusionImg2ImgPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py index f761f245883f..e7b084acb280 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py @@ -42,7 +42,7 @@ from ...models.test_models_unet_2d_condition import create_lora_layers from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() @@ -82,7 +82,9 @@ def _test_inpaint_compile(in_queue, out_queue, timeout): out_queue.join() -class StableDiffusionInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusionInpaintPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = 
TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py index 691427b1c6eb..513e11c105d5 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_instruction_pix2pix.py @@ -40,14 +40,14 @@ TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS, ) -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() class StableDiffusionInstructPix2PixPipelineFastTests( - PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase ): pipeline_class = StableDiffusionInstructPix2PixPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "cross_attention_kwargs"} diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py index f47a70c4ece8..81d1baed5df6 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_model_editing.py @@ -32,14 +32,16 @@ from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, skip_mps from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() @skip_mps -class StableDiffusionModelEditingPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusionModelEditingPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionModelEditingPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py index 6f41d2c43c8e..1b17f8b31be9 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py @@ -41,7 +41,11 @@ TEXT_GUIDED_IMAGE_VARIATION_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, ) -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference +from ..test_pipelines_common import ( + PipelineLatentTesterMixin, + PipelineTesterMixin, + assert_mean_pixel_difference, +) enable_full_determinism() diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py index 33cc7f638ec2..a26abfa50096 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion.py @@ -36,13 +36,15 @@ from diffusers.utils.testing_utils import CaptureLogger, enable_full_determinism, require_torch_gpu from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, 
TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() -class StableDiffusion2PipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusion2PipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py index 304ddacd2c36..b4d49c92425c 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_attend_and_excite.py @@ -30,7 +30,7 @@ from diffusers.utils.testing_utils import require_torch_gpu from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin torch.backends.cuda.matmul.allow_tf32 = False @@ -38,7 +38,7 @@ @skip_mps class StableDiffusionAttendAndExcitePipelineFastTests( - PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase ): pipeline_class = StableDiffusionAttendAndExcitePipeline test_attention_slicing = False diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py index f393967c7de4..fe2cf73da096 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_depth.py @@ -57,14 +57,16 @@ TEXT_GUIDED_IMAGE_VARIATION_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, ) -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() @skip_mps -class StableDiffusionDepth2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusionDepth2ImgPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionDepth2ImgPipeline test_save_load_optional_components = False params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py index 37c254f367f3..68a4b5132375 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_inpaint.py @@ -27,13 +27,15 @@ from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow from ..pipeline_params import TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS, TEXT_GUIDED_IMAGE_INPAINTING_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from 
..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() -class StableDiffusion2InpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableDiffusion2InpaintPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionInpaintPipeline params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py index b94aaca4258a..5e1d610efcaf 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_latent_upscale.py @@ -21,6 +21,7 @@ import torch from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer +import diffusers from diffusers import ( AutoencoderKL, EulerDiscreteScheduler, @@ -28,17 +29,25 @@ StableDiffusionPipeline, UNet2DConditionModel, ) +from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin +from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin enable_full_determinism() -class StableDiffusionLatentUpscalePipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +def check_same_shape(tensor_list): + shapes = [tensor.shape for tensor in tensor_list] + return all(shape == shapes[0] for shape in shapes[1:]) + + +class StableDiffusionLatentUpscalePipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableDiffusionLatentUpscalePipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - { "height", @@ -185,6 +194,42 @@ def test_save_load_local(self): def test_save_load_optional_components(self): super().test_save_load_optional_components(expected_max_difference=3e-3) + def test_karras_schedulers_shape(self): + skip_schedulers = [ + "DDIMScheduler", + "DDPMScheduler", + "PNDMScheduler", + "HeunDiscreteScheduler", + "EulerAncestralDiscreteScheduler", + "KDPM2DiscreteScheduler", + "KDPM2AncestralDiscreteScheduler", + "DPMSolverSDEScheduler", + ] + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + + # make sure that PNDM does not need warm-up + pipe.scheduler.register_to_config(skip_prk_steps=True) + + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = 2 + + outputs = [] + for scheduler_enum in KarrasDiffusionSchedulers: + if scheduler_enum.name in skip_schedulers: + # no sigma schedulers are not supported + # no schedulers + continue + + scheduler_cls = getattr(diffusers, scheduler_enum.name) + pipe.scheduler = scheduler_cls.from_config(pipe.scheduler.config) + output = pipe(**inputs)[0] + outputs.append(output) + + assert check_same_shape(outputs) + @require_torch_gpu @slow diff --git 
a/tests/pipelines/stable_unclip/test_stable_unclip.py b/tests/pipelines/stable_unclip/test_stable_unclip.py index 4bbbad757edf..8d5edda16904 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip.py @@ -16,13 +16,20 @@ from diffusers.utils.testing_utils import enable_full_determinism, load_numpy, require_torch_gpu, slow, torch_device from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS -from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference +from ..test_pipelines_common import ( + PipelineKarrasSchedulerTesterMixin, + PipelineLatentTesterMixin, + PipelineTesterMixin, + assert_mean_pixel_difference, +) enable_full_determinism() -class StableUnCLIPPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableUnCLIPPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableUnCLIPPipeline params = TEXT_TO_IMAGE_PARAMS batch_params = TEXT_TO_IMAGE_BATCH_PARAMS diff --git a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py index 741343066133..52581eb574e0 100644 --- a/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py +++ b/tests/pipelines/stable_unclip/test_stable_unclip_img2img.py @@ -30,6 +30,7 @@ from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS from ..test_pipelines_common import ( + PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, assert_mean_pixel_difference, @@ -39,7 +40,9 @@ enable_full_determinism() -class StableUnCLIPImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): +class StableUnCLIPImg2ImgPipelineFastTests( + PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase +): pipeline_class = StableUnCLIPImg2ImgPipeline params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 9fb3e167facc..52dd4afd6b21 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -14,6 +14,7 @@ import diffusers from diffusers import DiffusionPipeline from diffusers.image_processor import VaeImageProcessor +from diffusers.schedulers import KarrasDiffusionSchedulers from diffusers.utils import logging from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available from diffusers.utils.testing_utils import require_torch, torch_device @@ -26,6 +27,11 @@ def to_np(tensor): return tensor +def check_same_shape(tensor_list): + shapes = [tensor.shape for tensor in tensor_list] + return all(shape == shapes[0] for shape in shapes[1:]) + + class PipelineLatentTesterMixin: """ This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes. @@ -155,6 +161,46 @@ def test_latents_input(self): self.assertLess(max_diff, 1e-4, "passing latents as image input generate different result from passing image") +@require_torch +class PipelineKarrasSchedulerTesterMixin: + """ + This mixin is designed to be used with unittest.TestCase classes. 
+ It provides a set of common tests for each PyTorch pipeline that makes use of KarrasDiffusionSchedulers + equivalence of dict and tuple outputs, etc. + """ + + def test_karras_schedulers_shape(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + + # make sure that PNDM does not need warm-up + pipe.scheduler.register_to_config(skip_prk_steps=True) + + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + inputs = self.get_dummy_inputs(torch_device) + inputs["num_inference_steps"] = 2 + + if "strength" in inputs: + inputs["num_inference_steps"] = 4 + inputs["strength"] = 0.5 + + outputs = [] + for scheduler_enum in KarrasDiffusionSchedulers: + if "KDPM2" in scheduler_enum.name: + inputs["num_inference_steps"] = 5 + + scheduler_cls = getattr(diffusers, scheduler_enum.name) + pipe.scheduler = scheduler_cls.from_config(pipe.scheduler.config) + output = pipe(**inputs)[0] + outputs.append(output) + + if "KDPM2" in scheduler_enum.name: + inputs["num_inference_steps"] = 2 + + assert check_same_shape(outputs) + + @require_torch class PipelineTesterMixin: """ From 572d8e2002236e61045d9d80505b92d45cfdcbca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20Mauricio=20Repetto=20Ferrero?= Date: Mon, 3 Jul 2023 10:55:45 -0500 Subject: [PATCH 171/199] Adding better way to define multiple concepts and also validation capabilities. (#3807) * - Added validation parameters - Changed some parameter descriptions to better explain their use. - Fixed a few typos. - Added concept_list parameter for better management of multiple subjects - changed logic for image validation * - Fixed bad logic for class data root directories * Defaulting validation_steps to None for an easier logic * Fixed multiple validation prompts * Fixed bug on validation negative prompt * Changed validation logic for tracker. * Added uuid for validation image labeling * Fix error when comparing validation prompts and validation negative prompts * Improved error message when negative prompts for validation are more than the number of prompts * - Changed image tracking number from epoch to global_step - Added Typing for functions * Added some validations more when using concept_list parameter and the regular ones. * Fixed error message * Added more validations for validation parameters * Improved messaging for errors * Fixed validation error for parameters with default values * - Added train step to image name for validation - reformatted code * - Added train step to image's name for validation - reformatted code * Updated README.md file. * reverted back original script of train_dreambooth.py * reverted back original script of train_dreambooth.py * left one blank line at the eof * reverted back setup.py * reverted back setup.py * added same logic for when parameters for prior preservation are used without enabling the flag while using concept_list parameter. * Ran black formatter. 
* fixed a few strings * fixed import sort with isort and removed fstrings without placeholder * fixed import order with ruff (since with isort wasn't ok) --------- Co-authored-by: Patrick von Platen --- .../multi_subject_dreambooth/README.md | 47 ++ .../train_multi_subject_dreambooth.py | 410 +++++++++++++++--- 2 files changed, 404 insertions(+), 53 deletions(-) diff --git a/examples/research_projects/multi_subject_dreambooth/README.md b/examples/research_projects/multi_subject_dreambooth/README.md index cf7dd31d0797..d1a7705cfebb 100644 --- a/examples/research_projects/multi_subject_dreambooth/README.md +++ b/examples/research_projects/multi_subject_dreambooth/README.md @@ -86,6 +86,53 @@ This example shows training for 2 subjects, but please note that the model can b Note also that in this script, `sks` and `t@y` were used as tokens to learn the new subjects ([this thread](https://github.com/XavierXiao/Dreambooth-Stable-Diffusion/issues/71) inspired the use of `t@y` as our second identifier). However, there may be better rare tokens to experiment with, and results also seemed to be good when more intuitive words are used. +**Important**: New parameters are added to the script, making possible to validate the progress of the training by +generating images at specified steps. Taking also into account that a comma separated list in a text field for a prompt +it's never a good idea (simply because it is very common in prompts to have them as part of a regular text) we +introduce the `concept_list` parameter: allowing to specify a json-like file where you can define the different +configuration for each subject that you want to train. + +An example of how to generate the file: +```python +import json + +# here we are using parameters for prior-preservation and validation as well. +concepts_list = [ + { + "instance_prompt": "drawing of a t@y meme", + "class_prompt": "drawing of a meme", + "instance_data_dir": "/some_folder/meme_toy", + "class_data_dir": "/data/meme", + "validation_prompt": "drawing of a t@y meme about football in Uruguay", + "validation_negative_prompt": "black and white" + }, + { + "instance_prompt": "drawing of a sks sir", + "class_prompt": "drawing of a sir", + "instance_data_dir": "/some_other_folder/sir_sks", + "class_data_dir": "/data/sir", + "validation_prompt": "drawing of a sks sir with the Uruguayan sun in his chest", + "validation_negative_prompt": "an old man", + "validation_guidance_scale": 20, + "validation_number_images": 3, + "validation_inference_steps": 10 + } +] + +with open("concepts_list.json", "w") as f: + json.dump(concepts_list, f, indent=4) +``` +And then just point to the file when executing the script: + +```bash +# exports... +accelerate launch train_multi_subject_dreambooth.py \ +# more parameters... +--concepts_list="concepts_list.json" +``` + +You can use the helper from the script to get a better sense of each parameter. + ### Inference Once you have trained a model using above command, the inference can be done simply using the `StableDiffusionPipeline`. Make sure to include the `identifier`(e.g. sks in above example) in your prompt. 
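The README hunk above shows how to author and pass `concepts_list.json`. As a quick sanity check before launching training, the file can be validated with a few lines of Python. This is a minimal, hypothetical sketch and is not part of the training script; it only assumes the per-concept keys shown in the README example, such as `instance_prompt` and `instance_data_dir`, with the class and validation keys treated as optional.

```python
import json
from pathlib import Path

# Hypothetical pre-flight check for concepts_list.json (not part of the patch):
# every concept needs an instance prompt and an existing instance data directory.
REQUIRED_KEYS = ("instance_prompt", "instance_data_dir")

with open("concepts_list.json") as f:
    concepts_list = json.load(f)

for i, concept in enumerate(concepts_list):
    missing = [key for key in REQUIRED_KEYS if key not in concept]
    if missing:
        raise ValueError(f"Concept {i} is missing required keys: {missing}")
    if not Path(concept["instance_data_dir"]).is_dir():
        raise ValueError(f"Concept {i}: instance_data_dir '{concept['instance_data_dir']}' does not exist")

print(f"concepts_list.json looks valid: {len(concepts_list)} concept(s) found.")
```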
diff --git a/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py b/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py index f24c6057fd8c..c75a0a9acc64 100644 --- a/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py +++ b/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py @@ -1,13 +1,18 @@ import argparse import hashlib import itertools +import json import logging import math -import os +import uuid import warnings +from os import environ, listdir, makedirs +from os.path import basename, join from pathlib import Path +from typing import List import datasets +import numpy as np import torch import torch.nn.functional as F import torch.utils.checkpoint @@ -17,24 +22,140 @@ from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import create_repo, upload_folder from PIL import Image +from torch import dtype +from torch.nn import Module from torch.utils.data import Dataset from torchvision import transforms from tqdm.auto import tqdm from transformers import AutoTokenizer, PretrainedConfig import diffusers -from diffusers import AutoencoderKL, DDPMScheduler, DiffusionPipeline, UNet2DConditionModel +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + DiffusionPipeline, + DPMSolverMultistepScheduler, + UNet2DConditionModel, +) from diffusers.optimization import get_scheduler -from diffusers.utils import check_min_version +from diffusers.utils import check_min_version, is_wandb_available from diffusers.utils.import_utils import is_xformers_available +if is_wandb_available(): + import wandb + # Will error if the minimal version of diffusers is not installed. Remove at your own risks. check_min_version("0.13.0.dev0") logger = get_logger(__name__) +def log_validation_images_to_tracker( + images: List[np.array], label: str, validation_prompt: str, accelerator: Accelerator, epoch: int +): + logger.info(f"Logging images to tracker for validation prompt: {validation_prompt}.") + + for tracker in accelerator.trackers: + if tracker.name == "tensorboard": + np_images = np.stack([np.asarray(img) for img in images]) + tracker.writer.add_images("validation", np_images, epoch, dataformats="NHWC") + if tracker.name == "wandb": + tracker.log( + { + "validation": [ + wandb.Image(image, caption=f"{label}_{epoch}_{i}: {validation_prompt}") + for i, image in enumerate(images) + ] + } + ) + + +# TODO: Add `prompt_embeds` and `negative_prompt_embeds` parameters to the function when `pre_compute_text_embeddings` +# argument is implemented. +def generate_validation_images( + text_encoder: Module, + tokenizer: Module, + unet: Module, + vae: Module, + arguments: argparse.Namespace, + accelerator: Accelerator, + weight_dtype: dtype, +): + logger.info("Running validation images.") + + pipeline_args = {} + + if text_encoder is not None: + pipeline_args["text_encoder"] = accelerator.unwrap_model(text_encoder) + + if vae is not None: + pipeline_args["vae"] = vae + + # create pipeline (note: unet and vae are loaded again in float32) + pipeline = DiffusionPipeline.from_pretrained( + arguments.pretrained_model_name_or_path, + tokenizer=tokenizer, + unet=accelerator.unwrap_model(unet), + revision=arguments.revision, + torch_dtype=weight_dtype, + **pipeline_args, + ) + + # We train on the simplified learning objective. 
If we were previously predicting a variance, we need the + # scheduler to ignore it + scheduler_args = {} + + if "variance_type" in pipeline.scheduler.config: + variance_type = pipeline.scheduler.config.variance_type + + if variance_type in ["learned", "learned_range"]: + variance_type = "fixed_small" + + scheduler_args["variance_type"] = variance_type + + pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, **scheduler_args) + pipeline = pipeline.to(accelerator.device) + pipeline.set_progress_bar_config(disable=True) + + generator = ( + None if arguments.seed is None else torch.Generator(device=accelerator.device).manual_seed(arguments.seed) + ) + + images_sets = [] + for vp, nvi, vnp, vis, vgs in zip( + arguments.validation_prompt, + arguments.validation_number_images, + arguments.validation_negative_prompt, + arguments.validation_inference_steps, + arguments.validation_guidance_scale, + ): + images = [] + if vp is not None: + logger.info( + f"Generating {nvi} images with prompt: '{vp}', negative prompt: '{vnp}', inference steps: {vis}, " + f"guidance scale: {vgs}." + ) + + pipeline_args = {"prompt": vp, "negative_prompt": vnp, "num_inference_steps": vis, "guidance_scale": vgs} + + # run inference + # TODO: it would be good to measure whether it's faster to run inference on all images at once, one at a + # time or in small batches + for _ in range(nvi): + with torch.autocast("cuda"): + image = pipeline(**pipeline_args, num_images_per_prompt=1, generator=generator).images[0] + images.append(image) + + images_sets.append(images) + + del pipeline + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return images_sets + + def import_model_class_from_model_name_or_path(pretrained_model_name_or_path: str, revision: str): text_encoder_config = PretrainedConfig.from_pretrained( pretrained_model_name_or_path, @@ -81,7 +202,7 @@ def parse_args(input_args=None): "--instance_data_dir", type=str, default=None, - required=True, + required=False, help="A folder containing the training data of instance images.", ) parser.add_argument( @@ -95,7 +216,7 @@ def parse_args(input_args=None): "--instance_prompt", type=str, default=None, - required=True, + required=False, help="The prompt with identifier specifying the instance", ) parser.add_argument( @@ -272,6 +393,52 @@ def parse_args(input_args=None): ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.' ), ) + parser.add_argument( + "--validation_steps", + type=int, + default=None, + help=( + "Run validation every X steps. Validation consists of running the prompt(s) `validation_prompt` " + "multiple times (`validation_number_images`) and logging the images." + ), + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="A prompt that is used during validation to verify that the model is learning. You can use commas to " + "define multiple negative prompts. This parameter can be defined also within the file given by " + "`concepts_list` parameter in the respective subject.", + ) + parser.add_argument( + "--validation_number_images", + type=int, + default=4, + help="Number of images that should be generated during validation with the validation parameters given. This " + "can be defined within the file given by `concepts_list` parameter in the respective subject.", + ) + parser.add_argument( + "--validation_negative_prompt", + type=str, + default=None, + help="A negative prompt that is used during validation to verify that the model is learning. 
You can use commas" + " to define multiple negative prompts, each one corresponding to a validation prompt. This parameter can " + "be defined also within the file given by `concepts_list` parameter in the respective subject.", + ) + parser.add_argument( + "--validation_inference_steps", + type=int, + default=25, + help="Number of inference steps (denoising steps) to run during validation. This can be defined within the " + "file given by `concepts_list` parameter in the respective subject.", + ) + parser.add_argument( + "--validation_guidance_scale", + type=float, + default=7.5, + help="To control how much the image generation process follows the text prompt. This can be defined within the " + "file given by `concepts_list` parameter in the respective subject.", + ) parser.add_argument( "--mixed_precision", type=str, @@ -297,27 +464,80 @@ def parse_args(input_args=None): parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) + parser.add_argument( + "--set_grads_to_none", + action="store_true", + help=( + "Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain" + " behaviors, so disable this argument if it causes any problems. More info:" + " https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html" + ), + ) + parser.add_argument( + "--concepts_list", + type=str, + default=None, + help="Path to json file containing a list of multiple concepts, will overwrite parameters like instance_prompt," + " class_prompt, etc.", + ) - if input_args is not None: + if input_args: args = parser.parse_args(input_args) else: args = parser.parse_args() - env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) + if not args.concepts_list and (not args.instance_data_dir or not args.instance_prompt): + raise ValueError( + "You must specify either instance parameters (data directory, prompt, etc.) or use " + "the `concept_list` parameter and specify them within the file." + ) + + if args.concepts_list: + if args.instance_prompt: + raise ValueError("If you are using `concepts_list` parameter, define the instance prompt within the file.") + if args.instance_data_dir: + raise ValueError( + "If you are using `concepts_list` parameter, define the instance data directory within the file." + ) + if args.validation_steps and (args.validation_prompt or args.validation_negative_prompt): + raise ValueError( + "If you are using `concepts_list` parameter, define validation parameters for " + "each subject within the file:\n - `validation_prompt`." + "\n - `validation_negative_prompt`.\n - `validation_guidance_scale`." + "\n - `validation_number_images`.\n - `validation_prompt`." + "\n - `validation_inference_steps`.\nThe `validation_steps` parameter is the only one " + "that needs to be defined outside the file." 
+ ) + + env_local_rank = int(environ.get("LOCAL_RANK", -1)) if env_local_rank != -1 and env_local_rank != args.local_rank: args.local_rank = env_local_rank if args.with_prior_preservation: - if args.class_data_dir is None: - raise ValueError("You must specify a data directory for class images.") - if args.class_prompt is None: - raise ValueError("You must specify prompt for class images.") + if not args.concepts_list: + if not args.class_data_dir: + raise ValueError("You must specify a data directory for class images.") + if not args.class_prompt: + raise ValueError("You must specify prompt for class images.") + else: + if args.class_data_dir: + raise ValueError( + "If you are using `concepts_list` parameter, define the class data directory within the file." + ) + if args.class_prompt: + raise ValueError( + "If you are using `concepts_list` parameter, define the class prompt within the file." + ) else: # logger is not available yet - if args.class_data_dir is not None: - warnings.warn("You need not use --class_data_dir without --with_prior_preservation.") - if args.class_prompt is not None: - warnings.warn("You need not use --class_prompt without --with_prior_preservation.") + if not args.class_data_dir: + warnings.warn( + "Ignoring `class_data_dir` parameter, you need to use it together with `with_prior_preservation`." + ) + if not args.class_prompt: + warnings.warn( + "Ignoring `class_prompt` parameter, you need to use it together with `with_prior_preservation`." + ) return args @@ -325,7 +545,7 @@ def parse_args(input_args=None): class DreamBoothDataset(Dataset): """ A dataset to prepare the instance and class images with the prompts for fine-tuning the model. - It pre-processes the images and the tokenizes prompts. + It pre-processes the images and then tokenizes prompts. """ def __init__( @@ -346,7 +566,7 @@ def __init__( self.instance_images_path = [] self.num_instance_images = [] self.instance_prompt = [] - self.class_data_root = [] + self.class_data_root = [] if class_data_root is not None else None self.class_images_path = [] self.num_class_images = [] self.class_prompt = [] @@ -371,8 +591,6 @@ def __init__( self._length -= self.num_instance_images[i] self._length += self.num_class_images[i] self.class_prompt.append(class_prompt[i]) - else: - self.class_data_root = None self.image_transforms = transforms.Compose( [ @@ -446,7 +664,7 @@ def collate_fn(num_instances, examples, with_prior_preservation=False): class PromptDataset(Dataset): - "A simple dataset to prepare the prompts to generate class images on multiple GPUs." + """A simple dataset to prepare the prompts to generate class images on multiple GPUs.""" def __init__(self, prompt, num_samples): self.prompt = prompt @@ -474,6 +692,10 @@ def main(args): project_config=accelerator_project_config, ) + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate # This will be enabled soon in accelerate. For now, we don't allow gradient accumulation when training two models. # TODO (patil-suraj): Remove this check when gradient accumulation with two models is enabled in accelerate. @@ -483,23 +705,84 @@ def main(args): "Please set gradient_accumulation_steps to 1. This feature will be supported in the future." 
) - # Parse instance and class inputs, and double check that lengths match - instance_data_dir = args.instance_data_dir.split(",") - instance_prompt = args.instance_prompt.split(",") - assert all( - x == len(instance_data_dir) for x in [len(instance_data_dir), len(instance_prompt)] - ), "Instance data dir and prompt inputs are not of the same length." + instance_data_dir = [] + instance_prompt = [] + class_data_dir = [] if args.with_prior_preservation else None + class_prompt = [] if args.with_prior_preservation else None + if args.concepts_list: + with open(args.concepts_list, "r") as f: + concepts_list = json.load(f) + + if args.validation_steps: + args.validation_prompt = [] + args.validation_number_images = [] + args.validation_negative_prompt = [] + args.validation_inference_steps = [] + args.validation_guidance_scale = [] + + for concept in concepts_list: + instance_data_dir.append(concept["instance_data_dir"]) + instance_prompt.append(concept["instance_prompt"]) + + if args.with_prior_preservation: + try: + class_data_dir.append(concept["class_data_dir"]) + class_prompt.append(concept["class_prompt"]) + except KeyError: + raise KeyError( + "`class_data_dir` or `class_prompt` not found in concepts_list while using " + "`with_prior_preservation`." + ) + else: + if "class_data_dir" in concept: + warnings.warn( + "Ignoring `class_data_dir` key, to use it you need to enable `with_prior_preservation`." + ) + if "class_prompt" in concept: + warnings.warn( + "Ignoring `class_prompt` key, to use it you need to enable `with_prior_preservation`." + ) - if args.with_prior_preservation: - class_data_dir = args.class_data_dir.split(",") - class_prompt = args.class_prompt.split(",") - assert all( - x == len(instance_data_dir) - for x in [len(instance_data_dir), len(instance_prompt), len(class_data_dir), len(class_prompt)] - ), "Instance & class data dir or prompt inputs are not of the same length." + if args.validation_steps: + args.validation_prompt.append(concept.get("validation_prompt", None)) + args.validation_number_images.append(concept.get("validation_number_images", 4)) + args.validation_negative_prompt.append(concept.get("validation_negative_prompt", None)) + args.validation_inference_steps.append(concept.get("validation_inference_steps", 25)) + args.validation_guidance_scale.append(concept.get("validation_guidance_scale", 7.5)) else: - class_data_dir = args.class_data_dir - class_prompt = args.class_prompt + # Parse instance and class inputs, and double check that lengths match + instance_data_dir = args.instance_data_dir.split(",") + instance_prompt = args.instance_prompt.split(",") + assert all( + x == len(instance_data_dir) for x in [len(instance_data_dir), len(instance_prompt)] + ), "Instance data dir and prompt inputs are not of the same length." + + if args.with_prior_preservation: + class_data_dir = args.class_data_dir.split(",") + class_prompt = args.class_prompt.split(",") + assert all( + x == len(instance_data_dir) + for x in [len(instance_data_dir), len(instance_prompt), len(class_data_dir), len(class_prompt)] + ), "Instance & class data dir or prompt inputs are not of the same length." 
+ + if args.validation_steps: + validation_prompts = args.validation_prompt.split(",") + num_of_validation_prompts = len(validation_prompts) + args.validation_prompt = validation_prompts + args.validation_number_images = [args.validation_number_images] * num_of_validation_prompts + + negative_validation_prompts = [None] * num_of_validation_prompts + if args.validation_negative_prompt: + negative_validation_prompts = args.validation_negative_prompt.split(",") + while len(negative_validation_prompts) < num_of_validation_prompts: + negative_validation_prompts.append(None) + args.validation_negative_prompt = negative_validation_prompts + + assert num_of_validation_prompts == len( + negative_validation_prompts + ), "The length of negative prompts for validation is greater than the number of validation prompts." + args.validation_inference_steps = [args.validation_inference_steps] * num_of_validation_prompts + args.validation_guidance_scale = [args.validation_guidance_scale] * num_of_validation_prompts # Make one log on every process with the configuration for debugging. logging.basicConfig( @@ -559,21 +842,24 @@ def main(args): ): images = pipeline(example["prompt"]).images - for i, image in enumerate(images): + for ii, image in enumerate(images): hash_image = hashlib.sha1(image.tobytes()).hexdigest() image_filename = ( - class_images_dir / f"{example['index'][i] + cur_class_images}-{hash_image}.jpg" + class_images_dir / f"{example['index'][ii] + cur_class_images}-{hash_image}.jpg" ) image.save(image_filename) + # Clean up the memory deleting one-time-use variables. del pipeline + del sample_dataloader + del sample_dataset if torch.cuda.is_available(): torch.cuda.empty_cache() # Handle the repository creation if accelerator.is_main_process: if args.output_dir is not None: - os.makedirs(args.output_dir, exist_ok=True) + makedirs(args.output_dir, exist_ok=True) if args.push_to_hub: repo_id = create_repo( @@ -581,6 +867,7 @@ def main(args): ).repo_id # Load the tokenizer + tokenizer = None if args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, revision=args.revision, use_fast=False) elif args.pretrained_model_name_or_path: @@ -658,7 +945,7 @@ def main(args): train_dataset = DreamBoothDataset( instance_data_root=instance_data_dir, instance_prompt=instance_prompt, - class_data_root=class_data_dir if args.with_prior_preservation else None, + class_data_root=class_data_dir, class_prompt=class_prompt, tokenizer=tokenizer, size=args.resolution, @@ -720,7 +1007,7 @@ def main(args): args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # We need to initialize the trackers we use, and also store our configuration. - # The trackers initializes automatically on the main process. + # The trackers initialize automatically on the main process. 
if accelerator.is_main_process: accelerator.init_trackers("dreambooth", config=vars(args)) @@ -741,10 +1028,10 @@ def main(args): # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint != "latest": - path = os.path.basename(args.resume_from_checkpoint) + path = basename(args.resume_from_checkpoint) else: # Get the mos recent checkpoint - dirs = os.listdir(args.output_dir) + dirs = listdir(args.output_dir) dirs = [d for d in dirs if d.startswith("checkpoint")] dirs = sorted(dirs, key=lambda x: int(x.split("-")[1])) path = dirs[-1] if len(dirs) > 0 else None @@ -756,7 +1043,7 @@ def main(args): args.resume_from_checkpoint = None else: accelerator.print(f"Resuming from checkpoint {path}") - accelerator.load_state(os.path.join(args.output_dir, path)) + accelerator.load_state(join(args.output_dir, path)) global_step = int(path.split("-")[1]) resume_global_step = global_step * args.gradient_accumulation_steps @@ -787,24 +1074,26 @@ def main(args): noise = torch.randn_like(latents) bsz = latents.shape[0] # Sample a random timestep for each image - timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device) - timesteps = timesteps.long() + time_steps = torch.randint( + 0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device + ) + time_steps = time_steps.long() # Add noise to the latents according to the noise magnitude at each timestep # (this is the forward diffusion process) - noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) + noisy_latents = noise_scheduler.add_noise(latents, noise, time_steps) # Get the text embedding for conditioning encoder_hidden_states = text_encoder(batch["input_ids"])[0] # Predict the noise residual - model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + model_pred = unet(noisy_latents, time_steps, encoder_hidden_states).sample # Get the target for loss depending on the prediction type if noise_scheduler.config.prediction_type == "epsilon": target = noise elif noise_scheduler.config.prediction_type == "v_prediction": - target = noise_scheduler.get_velocity(latents, noise, timesteps) + target = noise_scheduler.get_velocity(latents, noise, time_steps) else: raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}") @@ -834,19 +1123,34 @@ def main(args): accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) optimizer.step() lr_scheduler.step() - optimizer.zero_grad() + optimizer.zero_grad(set_to_none=args.set_grads_to_none) # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: progress_bar.update(1) global_step += 1 - if global_step % args.checkpointing_steps == 0: - if accelerator.is_main_process: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + if accelerator.is_main_process: + if global_step % args.checkpointing_steps == 0: + save_path = join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) logger.info(f"Saved state to {save_path}") + if ( + args.validation_steps + and any(args.validation_prompt) + and global_step % args.validation_steps == 0 + ): + images_set = generate_validation_images( + text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype + ) + for images, validation_prompt in zip(images_set, args.validation_prompt): + if len(images) > 0: + label = str(uuid.uuid1())[:8] # generate an id for different set of images + 
log_validation_images_to_tracker( + images, label, validation_prompt, accelerator, global_step + ) + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} progress_bar.set_postfix(**logs) accelerator.log(logs, step=global_step) @@ -854,7 +1158,7 @@ def main(args): if global_step >= args.max_train_steps: break - # Create the pipeline using using the trained modules and save it. + # Create the pipeline using the trained modules and save it. accelerator.wait_for_everyone() if accelerator.is_main_process: pipeline = DiffusionPipeline.from_pretrained( From b8a5dda56ebc60313cabc9494a4cf5b0ecd847c4 Mon Sep 17 00:00:00 2001 From: estelleafl Date: Mon, 3 Jul 2023 19:15:46 +0300 Subject: [PATCH 172/199] [ldm3d] Update code to be functional with the new checkpoints (#3875) * fixed typo * updated doc to be consistent in naming * make style/quality * preprocessing for 4 channels and not 6 * make style * test for 4c * make style/quality * fixed test on cpu --------- Co-authored-by: Aflalo Co-authored-by: Aflalo Co-authored-by: Aflalo --- src/diffusers/image_processor.py | 21 ++++++++---- .../test_stable_diffusion_ldm3d.py | 32 +++++++++++++++---- 2 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/diffusers/image_processor.py b/src/diffusers/image_processor.py index 2a433ee14d98..6ccf9b465ebd 100644 --- a/src/diffusers/image_processor.py +++ b/src/diffusers/image_processor.py @@ -312,12 +312,17 @@ def numpy_to_depth(self, images): """ if images.ndim == 3: images = images[None, ...] - images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - raise Exception("Not supported") + images_depth = images[:, :, :, 3:] + if images.shape[-1] == 6: + images_depth = (images_depth * 255).round().astype("uint8") + pil_images = [ + Image.fromarray(self.rgblike_to_depthmap(image_depth), mode="I;16") for image_depth in images_depth + ] + elif images.shape[-1] == 4: + images_depth = (images_depth * 65535.0).astype(np.uint16) + pil_images = [Image.fromarray(image_depth, mode="I;16") for image_depth in images_depth] else: - pil_images = [Image.fromarray(self.rgblike_to_depthmap(image[:, :, 3:]), mode="I;16") for image in images] + raise Exception("Not supported") return pil_images @@ -349,7 +354,11 @@ def postprocess( image = self.pt_to_numpy(image) if output_type == "np": - return image[:, :, :, :3], np.stack([self.rgblike_to_depthmap(im[:, :, 3:]) for im in image], axis=0) + if image.shape[-1] == 6: + image_depth = np.stack([self.rgblike_to_depthmap(im[:, :, 3:]) for im in image], axis=0) + else: + image_depth = image[:, :, :, 3:] + return image[:, :, :, :3], image_depth if output_type == "pil": return self.numpy_to_pil(image), self.numpy_to_depth(image) diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py index 933e4307a41b..e2164e8117ad 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_ldm3d.py @@ -130,9 +130,9 @@ def test_stable_diffusion_ddim(self): assert depth.shape == (1, 64, 64) expected_slice_rgb = np.array( - [0.37301102, 0.7023895, 0.7418312, 0.5163375, 0.5825485, 0.60929704, 0.4188174, 0.48407027, 0.46555096] + [0.37338176, 0.70247, 0.74203193, 0.51643604, 0.58256793, 0.60932136, 0.4181095, 0.48355877, 0.46535262] ) - expected_slice_depth = np.array([103.4673, 85.81202, 87.84926]) + expected_slice_depth = np.array([103.46727, 
85.812004, 87.849236]) assert np.abs(image_slice_rgb.flatten() - expected_slice_rgb).max() < 1e-2 assert np.abs(image_slice_depth.flatten() - expected_slice_depth).max() < 1e-2 @@ -280,10 +280,30 @@ def test_ldm3d(self): output = ldm3d_pipe(**inputs) rgb, depth = output.rgb, output.depth - expected_rgb_mean = 0.54461557 - expected_rgb_std = 0.2806707 - expected_depth_mean = 143.64595 - expected_depth_std = 83.491776 + expected_rgb_mean = 0.495586 + expected_rgb_std = 0.33795515 + expected_depth_mean = 112.48518 + expected_depth_std = 98.489746 + assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3 + assert np.abs(expected_rgb_std - rgb.std()) < 1e-3 + assert np.abs(expected_depth_mean - depth.mean()) < 1e-3 + assert np.abs(expected_depth_std - depth.std()) < 1e-3 + + def test_ldm3d_v2(self): + ldm3d_pipe = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c").to(torch_device) + ldm3d_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + output = ldm3d_pipe(**inputs) + rgb, depth = output.rgb, output.depth + + expected_rgb_mean = 0.4194127 + expected_rgb_std = 0.35375586 + expected_depth_mean = 0.5638502 + expected_depth_std = 0.34686103 + + assert rgb.shape == (1, 512, 512, 3) + assert depth.shape == (1, 512, 512, 1) assert np.abs(expected_rgb_mean - rgb.mean()) < 1e-3 assert np.abs(expected_rgb_std - rgb.std()) < 1e-3 assert np.abs(expected_depth_mean - depth.mean()) < 1e-3 From 332d2bbea35d9f29d6f6097725aacaca8aff309b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 3 Jul 2023 18:17:34 +0200 Subject: [PATCH 173/199] Improve memory text to video (#3930) * Improve memory text to video * Apply suggestions from code review * add test * Apply suggestions from code review Co-authored-by: Pedro Cuenca * finish test setup --------- Co-authored-by: Pedro Cuenca --- src/diffusers/models/attention.py | 25 +++++++++++- src/diffusers/models/unet_3d_condition.py | 40 +++++++++++++++++++ .../pipeline_text_to_video_synth.py | 3 ++ .../pipeline_text_to_video_synth_img2img.py | 3 ++ tests/models/test_models_unet_3d_condition.py | 18 +++++++++ 5 files changed, 88 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 8805257ebe9a..6b05bf35e87f 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -119,6 +119,15 @@ def __init__( self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = 0 + + def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int): + # Sets chunk feed-forward + self._chunk_size = chunk_size + self._chunk_dim = dim + def forward( self, hidden_states: torch.FloatTensor, @@ -141,6 +150,7 @@ def forward( norm_hidden_states = self.norm1(hidden_states) cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} + attn_output = self.attn1( norm_hidden_states, encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, @@ -171,7 +181,20 @@ def forward( if self.use_ada_layer_norm_zero: norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] - ff_output = self.ff(norm_hidden_states) + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0: + raise 
ValueError( + f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." + ) + + num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size + ff_output = torch.cat( + [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)], + dim=self._chunk_dim, + ) + else: + ff_output = self.ff(norm_hidden_states) if self.use_ada_layer_norm_zero: ff_output = gate_mlp.unsqueeze(1) * ff_output diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py index 36dcaf21f827..9bc89c571c52 100644 --- a/src/diffusers/models/unet_3d_condition.py +++ b/src/diffusers/models/unet_3d_condition.py @@ -389,6 +389,46 @@ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): for name, module in self.named_children(): fn_recursive_attn_processor(name, module, processor) + def enable_forward_chunking(self, chunk_size=None, dim=0): + """ + Sets the attention processor to use [feed forward + chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers). + + Parameters: + chunk_size (`int`, *optional*): + The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually + over each tensor of dim=`dim`. + dim (`int`, *optional*, defaults to `0`): + The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch) + or dim=1 (sequence length). + """ + if dim not in [0, 1]: + raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}") + + # By default chunk size is 1 + chunk_size = chunk_size or 1 + + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, chunk_size, dim) + + def disable_forward_chunking(self): + def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int): + if hasattr(module, "set_chunk_feed_forward"): + module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim) + + for child in module.children(): + fn_recursive_feed_forward(child, chunk_size, dim) + + for module in self.children(): + fn_recursive_feed_forward(module, None, 0) + # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor def set_default_attn_processor(self): """ diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index e30f183808a5..680a524732e9 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -634,6 +634,9 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 6.1 Chunk feed-forward computation to save memory + self.unet.enable_forward_chunking(chunk_size=1, dim=1) + # 7. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index ce5109a58213..1b6cd9c2b392 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -709,6 +709,9 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + # 6.1 Chunk feed-forward computation to save memory + self.unet.enable_forward_chunking(chunk_size=1, dim=1) + # 7. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: diff --git a/tests/models/test_models_unet_3d_condition.py b/tests/models/test_models_unet_3d_condition.py index 3f29d0a41e18..72a33854bdcd 100644 --- a/tests/models/test_models_unet_3d_condition.py +++ b/tests/models/test_models_unet_3d_condition.py @@ -399,5 +399,23 @@ def test_lora_xformers_on_off(self): assert (sample - on_sample).abs().max() < 1e-4 assert (sample - off_sample).abs().max() < 1e-4 + def test_feed_forward_chunking(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + init_dict["norm_num_groups"] = 32 + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict)[0] + + model.enable_forward_chunking() + with torch.no_grad(): + output_2 = model(**inputs_dict)[0] + + self.assertEqual(output.shape, output_2.shape, "Shape doesn't match") + assert np.abs(output.cpu() - output_2.cpu()).max() < 1e-2 + # (todo: sayakpaul) implement SLOW tests. 
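A minimal standalone sketch of the idea behind the chunked feed-forward added in the patch above (it uses a made-up two-layer feed-forward, not the diffusers `FeedForward` module): because the feed-forward acts on each position independently, splitting the input along one dimension, applying the feed-forward per chunk, and concatenating the pieces gives the same result as the unchunked call, while only one chunk of intermediate activations is materialized at a time.

```python
import torch
import torch.nn as nn

# Toy position-wise feed-forward; stands in for the transformer block's FF layer.
ff = nn.Sequential(nn.Linear(32, 128), nn.GELU(), nn.Linear(128, 32))

hidden_states = torch.randn(2, 16, 32)  # (batch, num_frames, channels)
chunk_size, dim = 1, 1  # chunk over the frame dimension, as in enable_forward_chunking(chunk_size=1, dim=1)

# Unchunked reference output.
full = ff(hidden_states)

# Chunked version, mirroring the torch.cat-over-chunks logic added to BasicTransformerBlock.
num_chunks = hidden_states.shape[dim] // chunk_size
chunked = torch.cat([ff(chunk) for chunk in hidden_states.chunk(num_chunks, dim=dim)], dim=dim)

assert torch.allclose(full, chunked, atol=1e-6)
```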
From 4e898560cefb86525a65d32a662d6b3d6b2b0b82 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 3 Jul 2023 23:12:41 +0200 Subject: [PATCH 174/199] revert automatic chunking (#3934) * revert automatic chunking * Apply suggestions from code review * revert automatic chunking --- .../source/en/api/pipelines/text_to_video.mdx | 28 ++++++++++++++++++- .../pipeline_text_to_video_synth.py | 3 -- .../pipeline_text_to_video_synth_img2img.py | 3 -- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/docs/source/en/api/pipelines/text_to_video.mdx b/docs/source/en/api/pipelines/text_to_video.mdx index 75868d7dd6ea..583d461ea948 100644 --- a/docs/source/en/api/pipelines/text_to_video.mdx +++ b/docs/source/en/api/pipelines/text_to_video.mdx @@ -138,6 +138,7 @@ pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dt pipe.enable_model_cpu_offload() # memory optimization +pipe.unet.enable_forward_chunking(chunk_size=1, dim=1) pipe.enable_vae_slicing() prompt = "Darth Vader surfing a wave" @@ -150,10 +151,13 @@ Now the video can be upscaled: ```py pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16) -pipe.vae.enable_slicing() pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config) pipe.enable_model_cpu_offload() +# memory optimization +pipe.unet.enable_forward_chunking(chunk_size=1, dim=1) +pipe.enable_vae_slicing() + video = [Image.fromarray(frame).resize((1024, 576)) for frame in video_frames] video_frames = pipe(prompt, video=video, strength=0.6).frames @@ -175,6 +179,28 @@ Here are some sample outputs: +### Memory optimizations + +Text-guided video generation with [`~TextToVideoSDPipeline`] and [`~VideoToVideoSDPipeline`] is very memory intensive both +when denoising with [`~UNet3DConditionModel`] and when decoding with [`~AutoencoderKL`]. It is possible though to reduce +memory usage at the cost of increased runtime to achieve the exact same result. To do so, it is recommended to enable +**forward chunking** and **vae slicing**: + +Forward chunking via [`~UNet3DConditionModel.enable_forward_chunking`]is explained in [this blog post](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers) and +allows to significantly reduce the required memory for the unet. You can chunk the feed forward layer over the `num_frames` +dimension by doing: + +```py +pipe.unet.enable_forward_chunking(chunk_size=1, dim=1) +``` + +Vae slicing via [`~TextToVideoSDPipeline.enable_vae_slicing`] and [`~VideoToVideoSDPipeline.enable_vae_slicing`] also +gives significant memory savings since the two pipelines decode all image frames at once. + +```py +pipe.enable_vae_slicing() +``` + ## Available checkpoints * [damo-vilab/text-to-video-ms-1.7b](https://huggingface.co/damo-vilab/text-to-video-ms-1.7b/) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index 680a524732e9..e30f183808a5 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -634,9 +634,6 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 6.1 Chunk feed-forward computation to save memory - self.unet.enable_forward_chunking(chunk_size=1, dim=1) - # 7. 
Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index 1b6cd9c2b392..ce5109a58213 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -709,9 +709,6 @@ def __call__( # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - # 6.1 Chunk feed-forward computation to save memory - self.unet.enable_forward_chunking(chunk_size=1, dim=1) - # 7. Denoising loop num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order with self.progress_bar(total=num_inference_steps) as progress_bar: From 1997614aa9525ef0f49858ac409540fdf2f02e9d Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Mon, 3 Jul 2023 18:49:49 -0700 Subject: [PATCH 175/199] avoid upcasting by assigning dtype to noise tensor (#3713) * avoid upcasting by assigning dtype to noise tensor * make style * Update train_unconditional.py * Update train_unconditional.py * make style * add unit test for pickle * revert change --------- Co-authored-by: root Co-authored-by: Patrick von Platen Co-authored-by: Prathik Rao --- .../unconditional_image_generation/train_unconditional.py | 4 +++- .../unconditional_image_generation/train_unconditional.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py index a42187fadea1..12ff40bbd680 100644 --- a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py +++ b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py @@ -568,7 +568,9 @@ def transform_images(examples): clean_images = batch["input"] # Sample noise that we'll add to the images - noise = torch.randn(clean_images.shape).to(clean_images.device) + noise = torch.randn( + clean_images.shape, dtype=(torch.float32 if args.mixed_precision == "no" else torch.float16) + ).to(clean_images.device) bsz = clean_images.shape[0] # Sample a random timestep for each image timesteps = torch.randint( diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index d6e4b17ba889..e10e6d302457 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -557,7 +557,9 @@ def transform_images(examples): clean_images = batch["input"] # Sample noise that we'll add to the images - noise = torch.randn(clean_images.shape).to(clean_images.device) + noise = torch.randn( + clean_images.shape, dtype=(torch.float32 if args.mixed_precision == "no" else torch.float16) + ).to(clean_images.device) bsz = clean_images.shape[0] # Sample a random timestep for each image timesteps = torch.randint( From 2837d490799e33aee66be50697d06d2adb61d0aa Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 4 Jul 2023 14:00:43 +0200 Subject: [PATCH 176/199] Fix failing np tests (#3942) * Fix 
failing np tests * Apply suggestions from code review * Update tests/pipelines/test_pipelines_common.py --- tests/pipelines/test_pipelines_common.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/pipelines/test_pipelines_common.py b/tests/pipelines/test_pipelines_common.py index 52dd4afd6b21..e97bdb352b22 100644 --- a/tests/pipelines/test_pipelines_common.py +++ b/tests/pipelines/test_pipelines_common.py @@ -698,11 +698,13 @@ def _test_xformers_attention_forwardGenerator_pass( pipe.set_progress_bar_config(disable=None) inputs = self.get_dummy_inputs(torch_device) - output_without_offload = pipe(**inputs)[0].cpu() + output_without_offload = pipe(**inputs)[0] + output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload pipe.enable_xformers_memory_efficient_attention() inputs = self.get_dummy_inputs(torch_device) - output_with_offload = pipe(**inputs)[0].cpu() + output_with_offload = pipe(**inputs)[0] + output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload if test_max_difference: max_diff = np.abs(output_with_offload - output_without_offload).max() From 07c9a08e67feb4d05ba22ba6b1a2d39468d68225 Mon Sep 17 00:00:00 2001 From: Pedro Cuenca Date: Wed, 5 Jul 2023 15:49:30 +0200 Subject: [PATCH 177/199] Add `timestep_spacing` and `steps_offset` to schedulers (#3947) * Add timestep_spacing to DDPM, LMSDiscrete, PNDM. * Remove spurious line. * More easy schedulers. * Add `linspace` to DDIM * Noise sigma for `trailing`. * Add timestep_spacing to DEISMultistepScheduler. Not sure the range is the way it was intended. * Fix: remove line used to debug. * Support timestep_spacing in DPMSolverMultistep, DPMSolverSDE, UniPC * Fix: convert to numpy. * Use sched. defaults when instantiating from_config For params not present in the original configuration. This makes it possible to switch pipeline schedulers even if they use different timestep_spacing (or any other param). * Apply suggestions from code review Co-authored-by: Patrick von Platen * Missing args in DPMSolverMultistep * Test: default args not in config * Style * Fix scheduler name in test * Remove duplicated entries * Add test for solver_type This test currently fails in main. When switching from DEIS to UniPC, solver_type is "logrho" (the default value from DEIS), which gets translated to "bh1" by UniPC. This is different to the default value for UniPC: "bh2". This is where the translation happens: https://github.com/huggingface/diffusers/blob/36d22d0709dc19776e3016fb3392d0f5578b0ab2/src/diffusers/schedulers/scheduling_unipc_multistep.py#L171 * UniPC: use same default for solver_type Fixes a bug when switching from UniPC from another scheduler (i.e., DEIS) that uses a different solver type. The solver is now the same as if we had instantiated the scheduler directly. * do not save use default values * fix more * fix all * fix schedulers * fix more * finish for real * finish for real * flaky tests * Update tests/pipelines/stable_diffusion/test_stable_diffusion_pix2pix_zero.py * Default steps_offset to 0. 
* Add missing docstrings * Apply suggestions from code review --------- Co-authored-by: Patrick von Platen --- src/diffusers/configuration_utils.py | 16 +++++- src/diffusers/schedulers/scheduling_ddim.py | 11 +++- .../schedulers/scheduling_ddim_parallel.py | 11 +++- src/diffusers/schedulers/scheduling_ddpm.py | 37 +++++++++++-- .../schedulers/scheduling_ddpm_parallel.py | 37 +++++++++++-- .../schedulers/scheduling_deis_multistep.py | 39 +++++++++++--- .../scheduling_dpmsolver_multistep.py | 38 ++++++++++--- .../schedulers/scheduling_dpmsolver_sde.py | 40 ++++++++++++-- .../scheduling_euler_ancestral_discrete.py | 44 +++++++++++++-- .../schedulers/scheduling_euler_discrete.py | 43 +++++++++++++-- .../schedulers/scheduling_heun_discrete.py | 40 ++++++++++++-- .../scheduling_k_dpm_2_ancestral_discrete.py | 40 ++++++++++++-- .../schedulers/scheduling_k_dpm_2_discrete.py | 40 ++++++++++++-- .../schedulers/scheduling_lms_discrete.py | 42 +++++++++++++-- src/diffusers/schedulers/scheduling_pndm.py | 33 +++++++++--- .../schedulers/scheduling_unipc_multistep.py | 41 +++++++++++--- tests/others/test_config.py | 53 +++++++++++++++++++ .../test_stable_diffusion_panorama.py | 2 +- tests/schedulers/test_scheduler_euler.py | 4 +- .../test_scheduler_euler_ancestral.py | 4 +- tests/schedulers/test_scheduler_lms.py | 2 +- tests/schedulers/test_scheduler_unipc.py | 10 ++-- tests/schedulers/test_schedulers.py | 48 ++++++++++++++++- 23 files changed, 598 insertions(+), 77 deletions(-) diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index 1a030e467134..202905db52c6 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -423,6 +423,10 @@ def _get_init_keys(cls): @classmethod def extract_init_dict(cls, config_dict, **kwargs): + # Skip keys that were not present in the original config, so default __init__ values were used + used_defaults = config_dict.get("_use_default_values", []) + config_dict = {k: v for k, v in config_dict.items() if k not in used_defaults and k != "_use_default_values"} + # 0. 
Copy origin config dict original_dict = dict(config_dict.items()) @@ -544,8 +548,9 @@ def to_json_saveable(value): return value config_dict = {k: to_json_saveable(v) for k, v in config_dict.items()} - # Don't save "_ignore_files" + # Don't save "_ignore_files" or "_use_default_values" config_dict.pop("_ignore_files", None) + config_dict.pop("_use_default_values", None) return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" @@ -599,6 +604,11 @@ def inner_init(self, *args, **kwargs): if k not in ignore and k not in new_kwargs } ) + + # Take note of the parameters that were not present in the loaded config + if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0: + new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs) + new_kwargs = {**config_init_kwargs, **new_kwargs} getattr(self, "register_to_config")(**new_kwargs) init(self, *args, **init_kwargs) @@ -643,6 +653,10 @@ def init(self, *args, **kwargs): name = fields[i].name new_kwargs[name] = arg + # Take note of the parameters that were not present in the loaded config + if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0: + new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs) + getattr(self, "register_to_config")(**new_kwargs) original_init(self, *args, **kwargs) diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index bab6f8acea03..99602d14038b 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -302,8 +302,15 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.num_inference_steps = num_inference_steps - # "leading" and "trailing" corresponds to annotation of Table 1. of https://arxiv.org/abs/2305.08891 - if self.config.timestep_spacing == "leading": + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": step_ratio = self.config.num_train_timesteps // self.num_inference_steps # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 22b7d8ec97dc..8875aa73208b 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -321,8 +321,15 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic self.num_inference_steps = num_inference_steps - # "leading" and "trailing" corresponds to annotation of Table 1. of https://arxiv.org/abs/2305.08891 - if self.config.timestep_spacing == "leading": + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": step_ratio = self.config.num_train_timesteps // self.num_inference_steps # creates integer timesteps by multiplying by ratio # casting to int to avoid issues when num_inference_step is power of 3 diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index 5d24766d68c7..ddf27d409d88 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -114,6 +114,13 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin): (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. sample_max_value (`float`, default `1.0`): the threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, default `"leading"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -134,6 +141,8 @@ def __init__( dynamic_thresholding_ratio: float = 0.995, clip_sample_range: float = 1.0, sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -228,11 +237,33 @@ def set_timesteps( ) self.num_inference_steps = num_inference_steps - - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.custom_timesteps = False + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) + self.timesteps = torch.from_numpy(timesteps).to(device) def _get_variance(self, t, predicted_variance=None, variance_type=None): diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index 2719d90b9314..e4d858efde8f 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -116,6 +116,13 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin): (https://arxiv.org/abs/2205.11487). Valid only when `thresholding=True`. sample_max_value (`float`, default `1.0`): the threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, default `"leading"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -138,6 +145,8 @@ def __init__( dynamic_thresholding_ratio: float = 0.995, clip_sample_range: float = 1.0, sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -234,11 +243,33 @@ def set_timesteps( ) self.num_inference_steps = num_inference_steps - - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) self.custom_timesteps = False + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) + self.timesteps = torch.from_numpy(timesteps).to(device) # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index 56c362018c18..c504fb19231a 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -107,6 +107,13 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin): This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. + timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -129,6 +136,8 @@ def __init__( solver_type: str = "logrho", lower_order_final: bool = True, use_karras_sigmas: Optional[bool] = False, + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -185,12 +194,30 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic device (`str` or `torch.device`, optional): the device to which the timesteps should be moved to. If `None`, the timesteps are not moved. """ - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) - .round()[::-1][:-1] - .copy() - .astype(np.int64) - ) + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.arange(self.config.num_train_timesteps, 0, -step_ratio).round().copy().astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) if self.config.use_karras_sigmas: diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index d7c29d5488a5..528b7b838b1c 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -134,6 +134,13 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin): guided-diffusion (https://github.com/openai/guided-diffusion) predicts both mean and variance of the Gaussian distribution in the model's output. DPM-Solver only needs the "mean" output because it is based on diffusion ODEs. + timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -158,6 +165,8 @@ def __init__( use_karras_sigmas: Optional[bool] = False, lambda_min_clipped: float = -float("inf"), variance_type: Optional[str] = None, + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -217,12 +226,29 @@ def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torc # Clipping the minimum of all lambda(t) for numerical stability. # This is critical for cosine (squaredcos_cap_v2) noise schedule. clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped) - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1 - clipped_idx, num_inference_steps + 1) - .round()[::-1][:-1] - .copy() - .astype(np.int64) - ) + last_timestep = ((self.config.num_train_timesteps - clipped_idx).numpy()).item() + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, last_timestep - 1, num_inference_steps + 1).round()[::-1][:-1].copy().astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = last_timestep // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.arange(last_timestep, 0, -step_ratio).round().copy().astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) if self.config.use_karras_sigmas: diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index ae9229981152..da8b71788b75 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -133,6 +133,13 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin): of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. noise_sampler_seed (`int`, *optional*, defaults to `None`): The random seed to use for the noise sampler. If `None`, a random seed will be generated. + timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -149,6 +156,8 @@ def __init__( prediction_type: str = "epsilon", use_karras_sigmas: Optional[bool] = False, noise_sampler_seed: Optional[int] = None, + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -187,6 +196,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): pos = 0 return indices[pos].item() + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + def scale_model_input( self, sample: torch.FloatTensor, @@ -226,7 +243,25 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(float) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) @@ -242,9 +277,6 @@ def set_timesteps( sigmas = torch.from_numpy(sigmas).to(device=device) self.sigmas = torch.cat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]]) - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - timesteps = torch.from_numpy(timesteps) second_order_timesteps = torch.from_numpy(second_order_timesteps) timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)]) diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 6b08e9bfc207..6b8c2f1a8a28 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -99,7 +99,13 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf) - + timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -114,6 +120,8 @@ def __init__( beta_schedule: str = "linear", trained_betas: Optional[Union[np.ndarray, List[float]]] = None, prediction_type: str = "epsilon", + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -137,15 +145,20 @@ def __init__( sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = torch.from_numpy(sigmas) - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - # setable values self.num_inference_steps = None timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() self.timesteps = torch.from_numpy(timesteps) self.is_scale_input_called = False + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] ) -> torch.FloatTensor: @@ -179,7 +192,28 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[ + ::-1 + ].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(float) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32) diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index 7237128cbf07..fc52c50ebc7f 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -107,6 +107,13 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin): This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. + timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. 
""" _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -123,6 +130,8 @@ def __init__( prediction_type: str = "epsilon", interpolation_type: str = "linear", use_karras_sigmas: Optional[bool] = False, + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -146,9 +155,6 @@ def __init__( sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = torch.from_numpy(sigmas) - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - # setable values self.num_inference_steps = None timesteps = np.linspace(0, num_train_timesteps - 1, num_train_timesteps, dtype=float)[::-1].copy() @@ -156,6 +162,14 @@ def __init__( self.is_scale_input_called = False self.use_karras_sigmas = use_karras_sigmas + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] ) -> torch.FloatTensor: @@ -191,7 +205,28 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[ + ::-1 + ].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(float) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) + sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 100e2012ea20..28f29067a544 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -78,6 +78,13 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. 
+ timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -93,6 +100,8 @@ def __init__( trained_betas: Optional[Union[np.ndarray, List[float]]] = None, prediction_type: str = "epsilon", use_karras_sigmas: Optional[bool] = False, + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -128,6 +137,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): pos = 0 return indices[pos].item() + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + def scale_model_input( self, sample: torch.FloatTensor, @@ -166,7 +183,25 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(float) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) @@ -180,9 +215,6 @@ def set_timesteps( sigmas = torch.from_numpy(sigmas).to(device=device) self.sigmas = torch.cat([sigmas[:1], sigmas[1:-1].repeat_interleave(2), sigmas[-1:]]) - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - timesteps = torch.from_numpy(timesteps) timesteps = torch.cat([timesteps[:1], timesteps[1:].repeat_interleave(2)]) diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index 2fa0431e1292..d4a35ab82502 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -78,6 +78,13 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin): prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf) + timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -92,6 +99,8 @@ def __init__( beta_schedule: str = "linear", trained_betas: Optional[Union[np.ndarray, List[float]]] = None, prediction_type: str = "epsilon", + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -127,6 +136,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): pos = 0 return indices[pos].item() + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + def scale_model_input( self, sample: torch.FloatTensor, @@ -169,7 +186,25 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. 
of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(float) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) self.log_sigmas = torch.from_numpy(np.log(sigmas)).to(device) @@ -197,9 +232,6 @@ def set_timesteps( self.sigmas_up = torch.cat([sigmas_up[:1], sigmas_up[1:].repeat_interleave(2), sigmas_up[-1:]]) self.sigmas_down = torch.cat([sigmas_down[:1], sigmas_down[1:].repeat_interleave(2), sigmas_down[-1:]]) - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - if str(device).startswith("mps"): # mps does not support float64 timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index bb80c4a54bfe..39079fde10d2 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -77,6 +77,13 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin): prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf) + timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. 
""" _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -91,6 +98,8 @@ def __init__( beta_schedule: str = "linear", trained_betas: Optional[Union[np.ndarray, List[float]]] = None, prediction_type: str = "epsilon", + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -126,6 +135,14 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): pos = 0 return indices[pos].item() + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + def scale_model_input( self, sample: torch.FloatTensor, @@ -168,7 +185,25 @@ def set_timesteps( num_train_timesteps = num_train_timesteps or self.config.num_train_timesteps - timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(num_train_timesteps, 0, -step_ratio)).round().copy().astype(float) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) self.log_sigmas = torch.from_numpy(np.log(sigmas)).to(device) @@ -185,9 +220,6 @@ def set_timesteps( [sigmas_interpol[:1], sigmas_interpol[1:].repeat_interleave(2), sigmas_interpol[-1:]] ) - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - if str(device).startswith("mps"): # mps does not support float64 timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index 0656475c3093..1256660b843c 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -102,6 +102,13 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin): prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf) + timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. 
+ steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -117,6 +124,8 @@ def __init__( trained_betas: Optional[Union[np.ndarray, List[float]]] = None, use_karras_sigmas: Optional[bool] = False, prediction_type: str = "epsilon", + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -140,9 +149,6 @@ def __init__( sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32) self.sigmas = torch.from_numpy(sigmas) - # standard deviation of the initial noise distribution - self.init_noise_sigma = self.sigmas.max() - # setable values self.num_inference_steps = None self.use_karras_sigmas = use_karras_sigmas @@ -150,6 +156,14 @@ def __init__( self.derivatives = [] self.is_scale_input_called = False + @property + def init_noise_sigma(self): + # standard deviation of the initial noise distribution + if self.config.timestep_spacing in ["linspace", "trailing"]: + return self.sigmas.max() + + return (self.sigmas.max() ** 2 + 1) ** 0.5 + def scale_model_input( self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] ) -> torch.FloatTensor: @@ -205,7 +219,27 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic """ self.num_inference_steps = num_inference_steps - timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[::-1].copy() + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps, dtype=float)[ + ::-1 + ].copy() + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(float) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(self.config.num_train_timesteps, 0, -step_ratio)).round().copy().astype(float) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) log_sigmas = np.log(sigmas) diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index 01c02a21bbfc..70ee1301129c 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -85,11 +85,13 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin): prediction_type (`str`, default `epsilon`, optional): prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf) + timestep_spacing (`str`, default `"leading"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. steps_offset (`int`, default `0`): an offset added to the inference steps. You can use a combination of `offset=1` and `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in stable diffusion. - """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -106,6 +108,7 @@ def __init__( skip_prk_steps: bool = False, set_alpha_to_one: bool = False, prediction_type: str = "epsilon", + timestep_spacing: str = "leading", steps_offset: int = 0, ): if trained_betas is not None: @@ -159,11 +162,29 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic """ self.num_inference_steps = num_inference_steps - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round() - self._timesteps += self.config.steps_offset + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + self._timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps).round().astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + self._timesteps = (np.arange(0, num_inference_steps) * step_ratio).round() + self._timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + self._timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio))[::-1].astype( + np.int64 + ) + self._timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'linspace', 'leading' or 'trailing'." 
+ ) if self.config.skip_prk_steps: # for some models like stable diffusion the prk steps can/should be skipped to diff --git a/src/diffusers/schedulers/scheduling_unipc_multistep.py b/src/diffusers/schedulers/scheduling_unipc_multistep.py index 7233258a4766..3caa01a58562 100644 --- a/src/diffusers/schedulers/scheduling_unipc_multistep.py +++ b/src/diffusers/schedulers/scheduling_unipc_multistep.py @@ -121,6 +121,13 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin): This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the noise schedule during the sampling process. If True, the sigmas will be determined according to a sequence of noise levels {σi} as defined in Equation (5) of the paper https://arxiv.org/pdf/2206.00364.pdf. + timestep_spacing (`str`, default `"linspace"`): + The way the timesteps should be scaled. Refer to Table 2. of [Common Diffusion Noise Schedules and Sample + Steps are Flawed](https://arxiv.org/abs/2305.08891) for more information. + steps_offset (`int`, default `0`): + an offset added to the inference steps. You can use a combination of `offset=1` and + `set_alpha_to_one=False`, to make the last step use step 0 for the previous alpha product, as done in + stable diffusion. """ _compatibles = [e.name for e in KarrasDiffusionSchedulers] @@ -145,6 +152,8 @@ def __init__( disable_corrector: List[int] = [], solver_p: SchedulerMixin = None, use_karras_sigmas: Optional[bool] = False, + timestep_spacing: str = "linspace", + steps_offset: int = 0, ): if trained_betas is not None: self.betas = torch.tensor(trained_betas, dtype=torch.float32) @@ -173,7 +182,7 @@ def __init__( if solver_type not in ["bh1", "bh2"]: if solver_type in ["midpoint", "heun", "logrho"]: - self.register_to_config(solver_type="bh1") + self.register_to_config(solver_type="bh2") else: raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}") @@ -199,12 +208,30 @@ def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.devic device (`str` or `torch.device`, optional): the device to which the timesteps should be moved to. If `None`, the timesteps are not moved. """ - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) - .round()[::-1][:-1] - .copy() - .astype(np.int64) - ) + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps + 1) + .round()[::-1][:-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // (num_inference_steps + 1) + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps + 1) * step_ratio).round()[::-1][:-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.arange(self.config.num_train_timesteps, 0, -step_ratio).round().copy().astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. 
Please make sure to choose one of 'linspace', 'leading' or 'trailing'." + ) sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5) if self.config.use_karras_sigmas: diff --git a/tests/others/test_config.py b/tests/others/test_config.py index a29190c199ca..d1f8a6e054d4 100644 --- a/tests/others/test_config.py +++ b/tests/others/test_config.py @@ -75,6 +75,22 @@ def __init__( pass +class SampleObject4(ConfigMixin): + config_name = "config.json" + + @register_to_config + def __init__( + self, + a=2, + b=5, + c=(2, 5), + d="for diffusion", + e=[1, 5], + f=[5, 4], + ): + pass + + class ConfigTester(unittest.TestCase): def test_load_not_from_mixin(self): with self.assertRaises(ValueError): @@ -137,6 +153,7 @@ def test_save_load(self): assert config.pop("c") == (2, 5) # instantiated as tuple assert new_config.pop("c") == [2, 5] # saved & loaded as list because of json + config.pop("_use_default_values") assert config == new_config def test_load_ddim_from_pndm(self): @@ -233,3 +250,39 @@ def test_load_dpmsolver(self): assert dpm.__class__ == DPMSolverMultistepScheduler # no warning should be thrown assert cap_logger.out == "" + + def test_use_default_values(self): + # let's first save a config that should be in the form + # a=2, + # b=5, + # c=(2, 5), + # d="for diffusion", + # e=[1, 3], + + config = SampleObject() + + config_dict = {k: v for k, v in config.config.items() if not k.startswith("_")} + + # make sure that default config has all keys in `_use_default_values` + assert set(config_dict.keys()) == config.config._use_default_values + + with tempfile.TemporaryDirectory() as tmpdirname: + config.save_config(tmpdirname) + + # now loading it with SampleObject2 should put f into `_use_default_values` + config = SampleObject2.from_config(tmpdirname) + + assert "f" in config._use_default_values + assert config.f == [1, 3] + + # now loading the config, should **NOT** use [1, 3] for `f`, but the default [1, 4] value + # **BECAUSE** it is part of `config._use_default_values` + new_config = SampleObject4.from_config(config.config) + assert new_config.f == [5, 4] + + config.config._use_default_values.pop() + new_config_2 = SampleObject4.from_config(config.config) + assert new_config_2.f == [1, 3] + + # Nevertheless "e" should still be correctly loaded to [1, 3] from SampleObject2 instead of defaulting to [1, 5] + assert new_config_2.e == [1, 3] diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py index 32541c980a15..080bd0091f4f 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion_panorama.py @@ -186,7 +186,7 @@ def test_stable_diffusion_panorama_euler(self): assert image.shape == (1, 64, 64, 3) - expected_slice = np.array([0.4886, 0.5586, 0.4476, 0.5053, 0.6013, 0.4737, 0.5538, 0.5100, 0.4927]) + expected_slice = np.array([0.4024, 0.6510, 0.4901, 0.5378, 0.5813, 0.5622, 0.4795, 0.4467, 0.4952]) assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/tests/schedulers/test_scheduler_euler.py b/tests/schedulers/test_scheduler_euler.py index aa46ef31885a..0c3b065161db 100644 --- a/tests/schedulers/test_scheduler_euler.py +++ b/tests/schedulers/test_scheduler_euler.py @@ -101,7 +101,7 @@ def test_full_loop_device(self): generator = torch.manual_seed(0) model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma + sample = self.dummy_sample_deter * 
scheduler.init_noise_sigma + sample = self.dummy_sample_deter *
scheduler.init_noise_sigma.cpu() sample = sample.to(torch_device) for t in scheduler.timesteps: @@ -128,7 +128,7 @@ def test_full_loop_device_karras_sigmas(self): generator = torch.manual_seed(0) model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma + sample = self.dummy_sample_deter * scheduler.init_noise_sigma.cpu() sample = sample.to(torch_device) for t in scheduler.timesteps: diff --git a/tests/schedulers/test_scheduler_euler_ancestral.py b/tests/schedulers/test_scheduler_euler_ancestral.py index 5fa36be6bc64..9866bd12d6af 100644 --- a/tests/schedulers/test_scheduler_euler_ancestral.py +++ b/tests/schedulers/test_scheduler_euler_ancestral.py @@ -47,7 +47,7 @@ def test_full_loop_no_noise(self): generator = torch.manual_seed(0) model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma + sample = self.dummy_sample_deter * scheduler.init_noise_sigma.cpu() sample = sample.to(torch_device) for i, t in enumerate(scheduler.timesteps): @@ -100,7 +100,7 @@ def test_full_loop_device(self): generator = torch.manual_seed(0) model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma + sample = self.dummy_sample_deter * scheduler.init_noise_sigma.cpu() sample = sample.to(torch_device) for t in scheduler.timesteps: diff --git a/tests/schedulers/test_scheduler_lms.py b/tests/schedulers/test_scheduler_lms.py index 2682886a788d..1e0a8212354d 100644 --- a/tests/schedulers/test_scheduler_lms.py +++ b/tests/schedulers/test_scheduler_lms.py @@ -97,7 +97,7 @@ def test_full_loop_device(self): scheduler.set_timesteps(self.num_inference_steps, device=torch_device) model = self.dummy_model() - sample = self.dummy_sample_deter * scheduler.init_noise_sigma + sample = self.dummy_sample_deter * scheduler.init_noise_sigma.cpu() sample = sample.to(torch_device) for i, t in enumerate(scheduler.timesteps): diff --git a/tests/schedulers/test_scheduler_unipc.py b/tests/schedulers/test_scheduler_unipc.py index 62cffc67388c..171ee85be1d3 100644 --- a/tests/schedulers/test_scheduler_unipc.py +++ b/tests/schedulers/test_scheduler_unipc.py @@ -23,7 +23,7 @@ def get_scheduler_config(self, **kwargs): "beta_end": 0.02, "beta_schedule": "linear", "solver_order": 2, - "solver_type": "bh1", + "solver_type": "bh2", } config.update(**kwargs) @@ -144,7 +144,7 @@ def test_switch(self): sample = self.full_loop(scheduler=scheduler) result_mean = torch.mean(torch.abs(sample)) - assert abs(result_mean.item() - 0.2521) < 1e-3 + assert abs(result_mean.item() - 0.2464) < 1e-3 scheduler = DPMSolverSinglestepScheduler.from_config(scheduler.config) scheduler = DEISMultistepScheduler.from_config(scheduler.config) @@ -154,7 +154,7 @@ def test_switch(self): sample = self.full_loop(scheduler=scheduler) result_mean = torch.mean(torch.abs(sample)) - assert abs(result_mean.item() - 0.2521) < 1e-3 + assert abs(result_mean.item() - 0.2464) < 1e-3 def test_timesteps(self): for timesteps in [25, 50, 100, 999, 1000]: @@ -206,13 +206,13 @@ def test_full_loop_no_noise(self): sample = self.full_loop() result_mean = torch.mean(torch.abs(sample)) - assert abs(result_mean.item() - 0.2521) < 1e-3 + assert abs(result_mean.item() - 0.2464) < 1e-3 def test_full_loop_with_v_prediction(self): sample = self.full_loop(prediction_type="v_prediction") result_mean = torch.mean(torch.abs(sample)) - assert abs(result_mean.item() - 0.1096) < 1e-3 + assert abs(result_mean.item() - 0.1014) < 1e-3 def test_fp16_support(self): scheduler_class = self.scheduler_classes[0] diff --git 
a/tests/schedulers/test_schedulers.py b/tests/schedulers/test_schedulers.py index a2d065f388bd..d1ae333c0cd2 100755 --- a/tests/schedulers/test_schedulers.py +++ b/tests/schedulers/test_schedulers.py @@ -24,10 +24,14 @@ import diffusers from diffusers import ( + DDIMScheduler, + DEISMultistepScheduler, + DiffusionPipeline, EulerAncestralDiscreteScheduler, EulerDiscreteScheduler, IPNDMScheduler, LMSDiscreteScheduler, + UniPCMultistepScheduler, VQDiffusionScheduler, logging, ) @@ -202,6 +206,44 @@ def test_save_load_from_different_config_comp_schedulers(self): assert cap_logger_2.out == "{'f'} was not found in config. Values will be initialized to default values.\n" assert cap_logger_3.out == "{'f'} was not found in config. Values will be initialized to default values.\n" + def test_default_arguments_not_in_config(self): + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-pipe", torch_dtype=torch.float16 + ) + assert pipe.scheduler.__class__ == DDIMScheduler + + # Default for DDIMScheduler + assert pipe.scheduler.config.timestep_spacing == "leading" + + # Switch to a different one, verify we use the default for that class + pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config) + assert pipe.scheduler.config.timestep_spacing == "linspace" + + # Override with kwargs + pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") + assert pipe.scheduler.config.timestep_spacing == "trailing" + + # Verify overridden kwargs stick + pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) + assert pipe.scheduler.config.timestep_spacing == "trailing" + + # And stick + pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config) + assert pipe.scheduler.config.timestep_spacing == "trailing" + + def test_default_solver_type_after_switch(self): + pipe = DiffusionPipeline.from_pretrained( + "hf-internal-testing/tiny-stable-diffusion-pipe", torch_dtype=torch.float16 + ) + assert pipe.scheduler.__class__ == DDIMScheduler + + pipe.scheduler = DEISMultistepScheduler.from_config(pipe.scheduler.config) + assert pipe.scheduler.config.solver_type == "logrho" + + # Switch to UniPC, verify the solver is the default + pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + assert pipe.scheduler.config.solver_type == "bh2" + class SchedulerCommonTest(unittest.TestCase): scheduler_classes = () @@ -414,7 +456,11 @@ def test_from_pretrained(self): scheduler.save_pretrained(tmpdirname) new_scheduler = scheduler_class.from_pretrained(tmpdirname) - assert scheduler.config == new_scheduler.config + # `_use_default_values` should not exist for just saved & loaded scheduler + scheduler_config = dict(scheduler.config) + del scheduler_config["_use_default_values"] + + assert scheduler_config == new_scheduler.config def test_step_shape(self): kwargs = dict(self.forward_default_kwargs) From aed7499a8d81de78bb1692d7a0745d3890618b0e Mon Sep 17 00:00:00 2001 From: dg845 <58458699+dg845@users.noreply.github.com> Date: Wed, 5 Jul 2023 10:33:58 -0700 Subject: [PATCH 178/199] Add Consistency Models Pipeline (#3492) * initial commit * Improve consistency models sampling implementation. * Add CMStochasticIterativeScheduler, which implements the multi-step sampler (stochastic_iterative_sampler) in the original code, and make further improvements to sampling. 
* Add Unet blocks for consistency models * Add conversion script for Unet * Fix bug in new unet blocks * Fix attention weight loading * Make design improvements to ConsistencyModelPipeline and CMStochasticIterativeScheduler and add initial version of tests. * make style * Make small random test UNet class conditional and set resnet_time_scale_shift to 'scale_shift' to better match consistency model checkpoints. * Add support for converting a test UNet and non-class-conditional UNets to the consistency models conversion script. * make style * Change num_class_embeds to 1000 to better match the original consistency models implementation. * Add support for distillation in pipeline_consistency_models.py. * Improve consistency model tests: - Get small testing checkpoints from hub - Modify tests to take into account "distillation" parameter of ConsistencyModelPipeline - Add onestep, multistep tests for distillation and distillation + class conditional - Add expected image slices for onestep tests * make style * Improve ConsistencyModelPipeline: - Add initial support for class-conditional generation - Fix initial sigma for onestep generation - Fix some sigma shape issues * make style * Improve ConsistencyModelPipeline: - add latents __call__ argument and prepare_latents method - add check_inputs method - add initial docstrings for ConsistencyModelPipeline.__call__ * make style * Fix bug when randomly generating class labels for class-conditional generation. * Switch CMStochasticIterativeScheduler to configuring a sigma schedule and make related changes to the pipeline and tests. * Remove some unused code and make style. * Fix small bug in CMStochasticIterativeScheduler. * Add expected slices for multistep sampling tests and make them pass. * Work on consistency model fast tests: - in pipeline, call self.scheduler.scale_model_input before denoising - get expected slices for Euler and Heun scheduler tests - make Euler test pass - mark Heun test as expected fail because it doesn't support prediction_type "sample" yet - remove DPM and Euler Ancestral tests because they don't support use_karras_sigmas * make style * Refactor conversion script to make it easier to add more model architectures to convert in the future. * Work on ConsistencyModelPipeline tests: - Fix device bug when handling class labels in ConsistencyModelPipeline.__call__ - Add slow tests for onestep and multistep sampling and make them pass - Refactor fast tests - Refactor ConsistencyModelPipeline.__init__ * make style * Remove the add_noise and add_noise_to_input methods from CMStochasticIterativeScheduler for now. * Run python utils/check_copies.py --fix_and_overwrite python utils/check_dummies.py --fix_and_overwrite to make dummy objects for new pipeline and scheduler. * Make fast tests from PipelineTesterMixin pass. * make style * Refactor consistency models pipeline and scheduler: - Remove support for Karras schedulers (only support CMStochasticIterativeScheduler) - Move sigma manipulation, input scaling, denoising from pipeline to scheduler - Make corresponding changes to tests and ensure they pass * make style * Add docstrings and further refactor pipeline and scheduler. * make style * Add initial version of the consistency models documentation. * Refactor custom timesteps logic following DDPMScheduler/IFPipeline and temporarily add torch 2.0 SDPA kernel selection logic for debugging. * make style * Convert current slow tests to use fp16 and flash attention. * make style * Add slow tests for normal attention on cuda device. 
* make style * Fix attention weights loading * Update consistency model fast tests for new test checkpoints with attention fix. * make style * apply suggestions * Add add_noise method to CMStochasticIterativeScheduler (copied from EulerDiscreteScheduler). * Conversion script now outputs pipeline instead of UNet and add support for LSUN-256 models and different schedulers. * When both timesteps and num_inference_steps are supplied, raise warning instead of error (timesteps take precedence). * make style * Add remaining diffusers model checkpoints for models in the original consistency model release and update usage example. * apply suggestions from review * make style * fix attention naming * Add tests for CMStochasticIterativeScheduler. * make style * Make CMStochasticIterativeScheduler tests pass. * make style * Override test_step_shape in CMStochasticIterativeSchedulerTest instead of modifying it in SchedulerCommonTest. * make style * rename some models * Improve API * rename some models * Remove duplicated block * Add docstring and make torch compile work * More fixes * Fixes * Apply suggestions from code review * Apply suggestions from code review * add more docstring * update consistency conversion script --------- Co-authored-by: ayushmangal Co-authored-by: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Co-authored-by: Patrick von Platen --- docs/source/en/_toctree.yml | 4 + .../en/api/pipelines/consistency_models.mdx | 87 ++++ .../schedulers/cm_stochastic_iterative.mdx | 11 + scripts/convert_consistency_to_diffusers.py | 313 +++++++++++++++ src/diffusers/__init__.py | 2 + src/diffusers/models/unet_2d.py | 8 + src/diffusers/models/unet_2d_blocks.py | 74 +++- src/diffusers/pipelines/__init__.py | 1 + .../pipelines/consistency_models/__init__.py | 1 + .../pipeline_consistency_models.py | 337 ++++++++++++++++ src/diffusers/schedulers/__init__.py | 1 + .../scheduling_consistency_models.py | 380 ++++++++++++++++++ src/diffusers/utils/dummy_pt_objects.py | 30 ++ .../pipelines/consistency_models/__init__.py | 0 .../test_consistency_models.py | 288 +++++++++++++ .../test_scheduler_consistency_model.py | 150 +++++++ tests/schedulers/test_schedulers.py | 36 +- 17 files changed, 1710 insertions(+), 13 deletions(-) create mode 100644 docs/source/en/api/pipelines/consistency_models.mdx create mode 100644 docs/source/en/api/schedulers/cm_stochastic_iterative.mdx create mode 100644 scripts/convert_consistency_to_diffusers.py create mode 100644 src/diffusers/pipelines/consistency_models/__init__.py create mode 100644 src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py create mode 100644 src/diffusers/schedulers/scheduling_consistency_models.py create mode 100644 tests/pipelines/consistency_models/__init__.py create mode 100644 tests/pipelines/consistency_models/test_consistency_models.py create mode 100644 tests/schedulers/test_scheduler_consistency_model.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 72808df049c9..db9e72a4ea20 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -184,6 +184,8 @@ title: Audio Diffusion - local: api/pipelines/audioldm title: AudioLDM + - local: api/pipelines/consistency_models + title: Consistency Models - local: api/pipelines/controlnet title: ControlNet - local: api/pipelines/cycle_diffusion @@ -274,6 +276,8 @@ - sections: - local: api/schedulers/overview title: Overview + - local: api/schedulers/cm_stochastic_iterative + title: Consistency Model Multistep Scheduler - local: 
api/schedulers/ddim title: DDIM - local: api/schedulers/ddim_inverse diff --git a/docs/source/en/api/pipelines/consistency_models.mdx b/docs/source/en/api/pipelines/consistency_models.mdx new file mode 100644 index 000000000000..715743b87a12 --- /dev/null +++ b/docs/source/en/api/pipelines/consistency_models.mdx @@ -0,0 +1,87 @@ +# Consistency Models + +Consistency Models were proposed in [Consistency Models](https://arxiv.org/abs/2303.01469) by Yang Song, Prafulla Dhariwal, Mark Chen, and Ilya Sutskever. + +The abstract of the [paper](https://arxiv.org/pdf/2303.01469.pdf) is as follows: + +*Diffusion models have significantly advanced the fields of image, audio, and video generation, but they depend on an iterative sampling process that causes slow generation. To overcome this limitation, we propose consistency models, a new family of models that generate high quality samples by directly mapping noise to data. They support fast one-step generation by design, while still allowing multistep sampling to trade compute for sample quality. They also support zero-shot data editing, such as image inpainting, colorization, and super-resolution, without requiring explicit training on these tasks. Consistency models can be trained either by distilling pre-trained diffusion models, or as standalone generative models altogether. Through extensive experiments, we demonstrate that they outperform existing distillation techniques for diffusion models in one- and few-step sampling, achieving the new state-of-the-art FID of 3.55 on CIFAR-10 and 6.20 on ImageNet 64x64 for one-step generation. When trained in isolation, consistency models become a new family of generative models that can outperform existing one-step, non-adversarial generative models on standard benchmarks such as CIFAR-10, ImageNet 64x64 and LSUN 256x256. 
* + +Resources: + +* [Paper](https://arxiv.org/abs/2303.01469) +* [Original Code](https://github.com/openai/consistency_models) + +Available Checkpoints are: +- *cd_imagenet64_l2 (64x64 resolution)* [openai/consistency-model-pipelines](https://huggingface.co/openai/consistency-model-pipelines) +- *cd_imagenet64_lpips (64x64 resolution)* [openai/diffusers-cd_imagenet64_lpips](https://huggingface.co/openai/diffusers-cd_imagenet64_lpips) +- *ct_imagenet64 (64x64 resolution)* [openai/diffusers-ct_imagenet64](https://huggingface.co/openai/diffusers-ct_imagenet64) +- *cd_bedroom256_l2 (256x256 resolution)* [openai/diffusers-cd_bedroom256_l2](https://huggingface.co/openai/diffusers-cd_bedroom256_l2) +- *cd_bedroom256_lpips (256x256 resolution)* [openai/diffusers-cd_bedroom256_lpips](https://huggingface.co/openai/diffusers-cd_bedroom256_lpips) +- *ct_bedroom256 (256x256 resolution)* [openai/diffusers-ct_bedroom256](https://huggingface.co/openai/diffusers-ct_bedroom256) +- *cd_cat256_l2 (256x256 resolution)* [openai/diffusers-cd_cat256_l2](https://huggingface.co/openai/diffusers-cd_cat256_l2) +- *cd_cat256_lpips (256x256 resolution)* [openai/diffusers-cd_cat256_lpips](https://huggingface.co/openai/diffusers-cd_cat256_lpips) +- *ct_cat256 (256x256 resolution)* [openai/diffusers-ct_cat256](https://huggingface.co/openai/diffusers-ct_cat256) + +## Available Pipelines + +| Pipeline | Tasks | Demo | Colab | +|:---:|:---:|:---:|:---:| +| [ConsistencyModelPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_consistency_models.py) | *Unconditional Image Generation* | | | + +This pipeline was contributed by our community members [dg845](https://github.com/dg845) and [ayushtues](https://huggingface.co/ayushtues) :heart: + +## Usage Example + +```python +import torch + +from diffusers import ConsistencyModelPipeline + +device = "cuda" +# Load the cd_imagenet64_l2 checkpoint. +model_id_or_path = "openai/diffusers-cd_imagenet64_l2" +pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) +pipe.to(device) + +# Onestep Sampling +image = pipe(num_inference_steps=1).images[0] +image.save("consistency_model_onestep_sample.png") + +# Onestep sampling, class-conditional image generation +# ImageNet-64 class label 145 corresponds to king penguins +image = pipe(num_inference_steps=1, class_labels=145).images[0] +image.save("consistency_model_onestep_sample_penguin.png") + +# Multistep sampling, class-conditional image generation +# Timesteps can be explicitly specified; the particular timesteps below are from the original Github repo. +# https://github.com/openai/consistency_models/blob/main/scripts/launch.sh#L77 +image = pipe(timesteps=[22, 0], class_labels=145).images[0] +image.save("consistency_model_multistep_sample_penguin.png") +``` + +For an additional speed-up, one can also make use of `torch.compile`. Multiple images can be generated in <1 second as follows: + +```py +import torch +from diffusers import ConsistencyModelPipeline + +device = "cuda" +# Load the cd_bedroom256_lpips checkpoint. 
+model_id_or_path = "openai/diffusers-cd_bedroom256_lpips" +pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) +pipe.to(device) + +pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) + +# Multistep sampling +# Timesteps can be explicitly specified; the particular timesteps below are from the original Github repo: +# https://github.com/openai/consistency_models/blob/main/scripts/launch.sh#L83 +for _ in range(10): + image = pipe(timesteps=[17, 0]).images[0] + image.show() +``` + +## ConsistencyModelPipeline +[[autodoc]] ConsistencyModelPipeline + - all + - __call__ diff --git a/docs/source/en/api/schedulers/cm_stochastic_iterative.mdx b/docs/source/en/api/schedulers/cm_stochastic_iterative.mdx new file mode 100644 index 000000000000..0cc40bde47a0 --- /dev/null +++ b/docs/source/en/api/schedulers/cm_stochastic_iterative.mdx @@ -0,0 +1,11 @@ +# Consistency Model Multistep Scheduler + +## Overview + +Multistep and onestep scheduler (Algorithm 1) introduced alongside consistency models in the paper [Consistency Models](https://arxiv.org/abs/2303.01469) by Yang Song, Prafulla Dhariwal, Mark Chen, and Ilya Sutskever. +Based on the [original consistency models implementation](https://github.com/openai/consistency_models). +Should generate good samples from [`ConsistencyModelPipeline`] in one or a small number of steps. + +## CMStochasticIterativeScheduler +[[autodoc]] CMStochasticIterativeScheduler + diff --git a/scripts/convert_consistency_to_diffusers.py b/scripts/convert_consistency_to_diffusers.py new file mode 100644 index 000000000000..5a6158bb9867 --- /dev/null +++ b/scripts/convert_consistency_to_diffusers.py @@ -0,0 +1,313 @@ +import argparse +import os + +import torch + +from diffusers import ( + CMStochasticIterativeScheduler, + ConsistencyModelPipeline, + UNet2DModel, +) + + +TEST_UNET_CONFIG = { + "sample_size": 32, + "in_channels": 3, + "out_channels": 3, + "layers_per_block": 2, + "num_class_embeds": 1000, + "block_out_channels": [32, 64], + "attention_head_dim": 8, + "down_block_types": [ + "ResnetDownsampleBlock2D", + "AttnDownBlock2D", + ], + "up_block_types": [ + "AttnUpBlock2D", + "ResnetUpsampleBlock2D", + ], + "resnet_time_scale_shift": "scale_shift", + "upsample_type": "resnet", + "downsample_type": "resnet", +} + +IMAGENET_64_UNET_CONFIG = { + "sample_size": 64, + "in_channels": 3, + "out_channels": 3, + "layers_per_block": 3, + "num_class_embeds": 1000, + "block_out_channels": [192, 192 * 2, 192 * 3, 192 * 4], + "attention_head_dim": 64, + "down_block_types": [ + "ResnetDownsampleBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ], + "up_block_types": [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "AttnUpBlock2D", + "ResnetUpsampleBlock2D", + ], + "resnet_time_scale_shift": "scale_shift", + "upsample_type": "resnet", + "downsample_type": "resnet", +} + +LSUN_256_UNET_CONFIG = { + "sample_size": 256, + "in_channels": 3, + "out_channels": 3, + "layers_per_block": 2, + "num_class_embeds": None, + "block_out_channels": [256, 256, 256 * 2, 256 * 2, 256 * 4, 256 * 4], + "attention_head_dim": 64, + "down_block_types": [ + "ResnetDownsampleBlock2D", + "ResnetDownsampleBlock2D", + "ResnetDownsampleBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + ], + "up_block_types": [ + "AttnUpBlock2D", + "AttnUpBlock2D", + "AttnUpBlock2D", + "ResnetUpsampleBlock2D", + "ResnetUpsampleBlock2D", + "ResnetUpsampleBlock2D", + ], + "resnet_time_scale_shift": "default", + "upsample_type": 
"resnet", + "downsample_type": "resnet", +} + +CD_SCHEDULER_CONFIG = { + "num_train_timesteps": 40, + "sigma_min": 0.002, + "sigma_max": 80.0, +} + +CT_IMAGENET_64_SCHEDULER_CONFIG = { + "num_train_timesteps": 201, + "sigma_min": 0.002, + "sigma_max": 80.0, +} + +CT_LSUN_256_SCHEDULER_CONFIG = { + "num_train_timesteps": 151, + "sigma_min": 0.002, + "sigma_max": 80.0, +} + + +def str2bool(v): + """ + https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse + """ + if isinstance(v, bool): + return v + if v.lower() in ("yes", "true", "t", "y", "1"): + return True + elif v.lower() in ("no", "false", "f", "n", "0"): + return False + else: + raise argparse.ArgumentTypeError("boolean value expected") + + +def convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip=False): + new_checkpoint[f"{new_prefix}.norm1.weight"] = checkpoint[f"{old_prefix}.in_layers.0.weight"] + new_checkpoint[f"{new_prefix}.norm1.bias"] = checkpoint[f"{old_prefix}.in_layers.0.bias"] + new_checkpoint[f"{new_prefix}.conv1.weight"] = checkpoint[f"{old_prefix}.in_layers.2.weight"] + new_checkpoint[f"{new_prefix}.conv1.bias"] = checkpoint[f"{old_prefix}.in_layers.2.bias"] + new_checkpoint[f"{new_prefix}.time_emb_proj.weight"] = checkpoint[f"{old_prefix}.emb_layers.1.weight"] + new_checkpoint[f"{new_prefix}.time_emb_proj.bias"] = checkpoint[f"{old_prefix}.emb_layers.1.bias"] + new_checkpoint[f"{new_prefix}.norm2.weight"] = checkpoint[f"{old_prefix}.out_layers.0.weight"] + new_checkpoint[f"{new_prefix}.norm2.bias"] = checkpoint[f"{old_prefix}.out_layers.0.bias"] + new_checkpoint[f"{new_prefix}.conv2.weight"] = checkpoint[f"{old_prefix}.out_layers.3.weight"] + new_checkpoint[f"{new_prefix}.conv2.bias"] = checkpoint[f"{old_prefix}.out_layers.3.bias"] + + if has_skip: + new_checkpoint[f"{new_prefix}.conv_shortcut.weight"] = checkpoint[f"{old_prefix}.skip_connection.weight"] + new_checkpoint[f"{new_prefix}.conv_shortcut.bias"] = checkpoint[f"{old_prefix}.skip_connection.bias"] + + return new_checkpoint + + +def convert_attention(checkpoint, new_checkpoint, old_prefix, new_prefix, attention_dim=None): + weight_q, weight_k, weight_v = checkpoint[f"{old_prefix}.qkv.weight"].chunk(3, dim=0) + bias_q, bias_k, bias_v = checkpoint[f"{old_prefix}.qkv.bias"].chunk(3, dim=0) + + new_checkpoint[f"{new_prefix}.group_norm.weight"] = checkpoint[f"{old_prefix}.norm.weight"] + new_checkpoint[f"{new_prefix}.group_norm.bias"] = checkpoint[f"{old_prefix}.norm.bias"] + + new_checkpoint[f"{new_prefix}.to_q.weight"] = weight_q.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_q.bias"] = bias_q.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_k.weight"] = weight_k.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_k.bias"] = bias_k.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_v.weight"] = weight_v.squeeze(-1).squeeze(-1) + new_checkpoint[f"{new_prefix}.to_v.bias"] = bias_v.squeeze(-1).squeeze(-1) + + new_checkpoint[f"{new_prefix}.to_out.0.weight"] = ( + checkpoint[f"{old_prefix}.proj_out.weight"].squeeze(-1).squeeze(-1) + ) + new_checkpoint[f"{new_prefix}.to_out.0.bias"] = checkpoint[f"{old_prefix}.proj_out.bias"].squeeze(-1).squeeze(-1) + + return new_checkpoint + + +def con_pt_to_diffuser(checkpoint_path: str, unet_config): + checkpoint = torch.load(checkpoint_path, map_location="cpu") + new_checkpoint = {} + + new_checkpoint["time_embedding.linear_1.weight"] = checkpoint["time_embed.0.weight"] + new_checkpoint["time_embedding.linear_1.bias"] = 
checkpoint["time_embed.0.bias"] + new_checkpoint["time_embedding.linear_2.weight"] = checkpoint["time_embed.2.weight"] + new_checkpoint["time_embedding.linear_2.bias"] = checkpoint["time_embed.2.bias"] + + if unet_config["num_class_embeds"] is not None: + new_checkpoint["class_embedding.weight"] = checkpoint["label_emb.weight"] + + new_checkpoint["conv_in.weight"] = checkpoint["input_blocks.0.0.weight"] + new_checkpoint["conv_in.bias"] = checkpoint["input_blocks.0.0.bias"] + + down_block_types = unet_config["down_block_types"] + layers_per_block = unet_config["layers_per_block"] + attention_head_dim = unet_config["attention_head_dim"] + channels_list = unet_config["block_out_channels"] + current_layer = 1 + prev_channels = channels_list[0] + + for i, layer_type in enumerate(down_block_types): + current_channels = channels_list[i] + downsample_block_has_skip = current_channels != prev_channels + if layer_type == "ResnetDownsampleBlock2D": + for j in range(layers_per_block): + new_prefix = f"down_blocks.{i}.resnets.{j}" + old_prefix = f"input_blocks.{current_layer}.0" + has_skip = True if j == 0 and downsample_block_has_skip else False + new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip=has_skip) + current_layer += 1 + + elif layer_type == "AttnDownBlock2D": + for j in range(layers_per_block): + new_prefix = f"down_blocks.{i}.resnets.{j}" + old_prefix = f"input_blocks.{current_layer}.0" + has_skip = True if j == 0 and downsample_block_has_skip else False + new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip=has_skip) + new_prefix = f"down_blocks.{i}.attentions.{j}" + old_prefix = f"input_blocks.{current_layer}.1" + new_checkpoint = convert_attention( + checkpoint, new_checkpoint, old_prefix, new_prefix, attention_head_dim + ) + current_layer += 1 + + if i != len(down_block_types) - 1: + new_prefix = f"down_blocks.{i}.downsamplers.0" + old_prefix = f"input_blocks.{current_layer}.0" + new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix) + current_layer += 1 + + prev_channels = current_channels + + # hardcoded the mid-block for now + new_prefix = "mid_block.resnets.0" + old_prefix = "middle_block.0" + new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix) + new_prefix = "mid_block.attentions.0" + old_prefix = "middle_block.1" + new_checkpoint = convert_attention(checkpoint, new_checkpoint, old_prefix, new_prefix, attention_head_dim) + new_prefix = "mid_block.resnets.1" + old_prefix = "middle_block.2" + new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix) + + current_layer = 0 + up_block_types = unet_config["up_block_types"] + + for i, layer_type in enumerate(up_block_types): + if layer_type == "ResnetUpsampleBlock2D": + for j in range(layers_per_block + 1): + new_prefix = f"up_blocks.{i}.resnets.{j}" + old_prefix = f"output_blocks.{current_layer}.0" + new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix, has_skip=True) + current_layer += 1 + + if i != len(up_block_types) - 1: + new_prefix = f"up_blocks.{i}.upsamplers.0" + old_prefix = f"output_blocks.{current_layer-1}.1" + new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix) + elif layer_type == "AttnUpBlock2D": + for j in range(layers_per_block + 1): + new_prefix = f"up_blocks.{i}.resnets.{j}" + old_prefix = f"output_blocks.{current_layer}.0" + new_checkpoint = convert_resnet(checkpoint, new_checkpoint, 
old_prefix, new_prefix, has_skip=True) + new_prefix = f"up_blocks.{i}.attentions.{j}" + old_prefix = f"output_blocks.{current_layer}.1" + new_checkpoint = convert_attention( + checkpoint, new_checkpoint, old_prefix, new_prefix, attention_head_dim + ) + current_layer += 1 + + if i != len(up_block_types) - 1: + new_prefix = f"up_blocks.{i}.upsamplers.0" + old_prefix = f"output_blocks.{current_layer-1}.2" + new_checkpoint = convert_resnet(checkpoint, new_checkpoint, old_prefix, new_prefix) + + new_checkpoint["conv_norm_out.weight"] = checkpoint["out.0.weight"] + new_checkpoint["conv_norm_out.bias"] = checkpoint["out.0.bias"] + new_checkpoint["conv_out.weight"] = checkpoint["out.2.weight"] + new_checkpoint["conv_out.bias"] = checkpoint["out.2.bias"] + + return new_checkpoint + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--unet_path", default=None, type=str, required=True, help="Path to the unet.pt to convert.") + parser.add_argument( + "--dump_path", default=None, type=str, required=True, help="Path to output the converted UNet model." + ) + parser.add_argument("--class_cond", default=True, type=str, help="Whether the model is class-conditional.") + + args = parser.parse_args() + args.class_cond = str2bool(args.class_cond) + + ckpt_name = os.path.basename(args.unet_path) + print(f"Checkpoint: {ckpt_name}") + + # Get U-Net config + if "imagenet64" in ckpt_name: + unet_config = IMAGENET_64_UNET_CONFIG + elif "256" in ckpt_name and (("bedroom" in ckpt_name) or ("cat" in ckpt_name)): + unet_config = LSUN_256_UNET_CONFIG + elif "test" in ckpt_name: + unet_config = TEST_UNET_CONFIG + else: + raise ValueError(f"Checkpoint type {ckpt_name} is not currently supported.") + + if not args.class_cond: + unet_config["num_class_embeds"] = None + + converted_unet_ckpt = con_pt_to_diffuser(args.unet_path, unet_config) + + image_unet = UNet2DModel(**unet_config) + image_unet.load_state_dict(converted_unet_ckpt) + + # Get scheduler config + if "cd" in ckpt_name or "test" in ckpt_name: + scheduler_config = CD_SCHEDULER_CONFIG + elif "ct" in ckpt_name and "imagenet64" in ckpt_name: + scheduler_config = CT_IMAGENET_64_SCHEDULER_CONFIG + elif "ct" in ckpt_name and "256" in ckpt_name and (("bedroom" in ckpt_name) or ("cat" in ckpt_name)): + scheduler_config = CT_LSUN_256_SCHEDULER_CONFIG + else: + raise ValueError(f"Checkpoint type {ckpt_name} is not currently supported.") + + cm_scheduler = CMStochasticIterativeScheduler(**scheduler_config) + + consistency_model = ConsistencyModelPipeline(unet=image_unet, scheduler=cm_scheduler) + consistency_model.save_pretrained(args.dump_path) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 764f9204dffb..f0c25edd3fdc 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -58,6 +58,7 @@ ) from .pipelines import ( AudioPipelineOutput, + ConsistencyModelPipeline, DanceDiffusionPipeline, DDIMPipeline, DDPMPipeline, @@ -72,6 +73,7 @@ ScoreSdeVePipeline, ) from .schedulers import ( + CMStochasticIterativeScheduler, DDIMInverseScheduler, DDIMParallelScheduler, DDIMScheduler, diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 7077aa889190..3b17acd3d829 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -66,6 +66,10 @@ class UNet2DModel(ModelMixin, ConfigMixin): layers_per_block (`int`, *optional*, defaults to `2`): The number of layers per block. 
mid_block_scale_factor (`float`, *optional*, defaults to `1`): The scale factor for the mid block. downsample_padding (`int`, *optional*, defaults to `1`): The padding for the downsample convolution. + downsample_type (`str`, *optional*, defaults to `conv`): + The downsample type for downsampling layers. Choose between "conv" and "resnet" + upsample_type (`str`, *optional*, defaults to `conv`): + The upsample type for upsampling layers. Choose between "conv" and "resnet" act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. attention_head_dim (`int`, *optional*, defaults to `8`): The attention head dimension. norm_num_groups (`int`, *optional*, defaults to `32`): The number of groups for normalization. @@ -96,6 +100,8 @@ def __init__( layers_per_block: int = 2, mid_block_scale_factor: float = 1, downsample_padding: int = 1, + downsample_type: str = "conv", + upsample_type: str = "conv", act_fn: str = "silu", attention_head_dim: Optional[int] = 8, norm_num_groups: int = 32, @@ -168,6 +174,7 @@ def __init__( attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel, downsample_padding=downsample_padding, resnet_time_scale_shift=resnet_time_scale_shift, + downsample_type=downsample_type, ) self.down_blocks.append(down_block) @@ -207,6 +214,7 @@ def __init__( resnet_groups=norm_num_groups, attention_head_dim=attention_head_dim if attention_head_dim is not None else output_channel, resnet_time_scale_shift=resnet_time_scale_shift, + upsample_type=upsample_type, ) self.up_blocks.append(up_block) prev_output_channel = output_channel diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index eee7e6023e88..d4e7bd4e03f7 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -51,6 +51,7 @@ def get_down_block( resnet_out_scale_factor=1.0, cross_attention_norm=None, attention_head_dim=None, + downsample_type=None, ): # If attn head dim is not defined, we default it to the number of heads if attention_head_dim is None: @@ -88,18 +89,22 @@ def get_down_block( output_scale_factor=resnet_out_scale_factor, ) elif down_block_type == "AttnDownBlock2D": + if add_downsample is False: + downsample_type = None + else: + downsample_type = downsample_type or "conv" # default to 'conv' return AttnDownBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, temb_channels=temb_channels, - add_downsample=add_downsample, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, downsample_padding=downsample_padding, attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, + downsample_type=downsample_type, ) elif down_block_type == "CrossAttnDownBlock2D": if cross_attention_dim is None: @@ -239,6 +244,7 @@ def get_up_block( resnet_out_scale_factor=1.0, cross_attention_norm=None, attention_head_dim=None, + upsample_type=None, ): # If attn head dim is not defined, we default it to the number of heads if attention_head_dim is None: @@ -319,18 +325,23 @@ def get_up_block( cross_attention_norm=cross_attention_norm, ) elif up_block_type == "AttnUpBlock2D": + if add_upsample is False: + upsample_type = None + else: + upsample_type = upsample_type or "conv" # default to 'conv' + return AttnUpBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, prev_output_channel=prev_output_channel, temb_channels=temb_channels, - add_upsample=add_upsample, resnet_eps=resnet_eps, 
resnet_act_fn=resnet_act_fn, resnet_groups=resnet_groups, attention_head_dim=attention_head_dim, resnet_time_scale_shift=resnet_time_scale_shift, + upsample_type=upsample_type, ) elif up_block_type == "SkipUpBlock2D": return SkipUpBlock2D( @@ -747,11 +758,12 @@ def __init__( attention_head_dim=1, output_scale_factor=1.0, downsample_padding=1, - add_downsample=True, + downsample_type="conv", ): super().__init__() resnets = [] attentions = [] + self.downsample_type = downsample_type if attention_head_dim is None: logger.warn( @@ -793,7 +805,7 @@ def __init__( self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - if add_downsample: + if downsample_type == "conv": self.downsamplers = nn.ModuleList( [ Downsample2D( @@ -801,6 +813,24 @@ def __init__( ) ] ) + elif downsample_type == "resnet": + self.downsamplers = nn.ModuleList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + down=True, + ) + ] + ) else: self.downsamplers = None @@ -810,11 +840,14 @@ def forward(self, hidden_states, temb=None, upsample_size=None): for resnet, attn in zip(self.resnets, self.attentions): hidden_states = resnet(hidden_states, temb) hidden_states = attn(hidden_states) - output_states += (hidden_states,) + output_states = output_states + (hidden_states,) if self.downsamplers is not None: for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states) + if self.downsample_type == "resnet": + hidden_states = downsampler(hidden_states, temb=temb) + else: + hidden_states = downsampler(hidden_states) output_states += (hidden_states,) @@ -1860,12 +1893,14 @@ def __init__( resnet_pre_norm: bool = True, attention_head_dim=1, output_scale_factor=1.0, - add_upsample=True, + upsample_type="conv", ): super().__init__() resnets = [] attentions = [] + self.upsample_type = upsample_type + if attention_head_dim is None: logger.warn( f"It is not recommend to pass `attention_head_dim=None`. Defaulting `attention_head_dim` to `in_channels`: {out_channels}." 
@@ -1908,8 +1943,26 @@ def __init__( self.attentions = nn.ModuleList(attentions) self.resnets = nn.ModuleList(resnets) - if add_upsample: + if upsample_type == "conv": self.upsamplers = nn.ModuleList([Upsample2D(out_channels, use_conv=True, out_channels=out_channels)]) + elif upsample_type == "resnet": + self.upsamplers = nn.ModuleList( + [ + ResnetBlock2D( + in_channels=out_channels, + out_channels=out_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + up=True, + ) + ] + ) else: self.upsamplers = None @@ -1925,7 +1978,10 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_si if self.upsamplers is not None: for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states) + if self.upsample_type == "resnet": + hidden_states = upsampler(hidden_states, temb=temb) + else: + hidden_states = upsampler(hidden_states) return hidden_states diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index ca57756c6aa4..3926b3413e01 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -16,6 +16,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_pt_objects import * # noqa F403 else: + from .consistency_models import ConsistencyModelPipeline from .dance_diffusion import DanceDiffusionPipeline from .ddim import DDIMPipeline from .ddpm import DDPMPipeline diff --git a/src/diffusers/pipelines/consistency_models/__init__.py b/src/diffusers/pipelines/consistency_models/__init__.py new file mode 100644 index 000000000000..fd78ddb3aae2 --- /dev/null +++ b/src/diffusers/pipelines/consistency_models/__init__.py @@ -0,0 +1 @@ +from .pipeline_consistency_models import ConsistencyModelPipeline diff --git a/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py new file mode 100644 index 000000000000..4e72e3fdbafe --- /dev/null +++ b/src/diffusers/pipelines/consistency_models/pipeline_consistency_models.py @@ -0,0 +1,337 @@ +from typing import Callable, List, Optional, Union + +import torch + +from ...models import UNet2DModel +from ...schedulers import CMStochasticIterativeScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline, ImagePipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + + >>> from diffusers import ConsistencyModelPipeline + + >>> device = "cuda" + >>> # Load the cd_imagenet64_l2 checkpoint. 
+ >>> model_id_or_path = "openai/diffusers-cd_imagenet64_l2" + >>> pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16) + >>> pipe.to(device) + + >>> # Onestep Sampling + >>> image = pipe(num_inference_steps=1).images[0] + >>> image.save("cd_imagenet64_l2_onestep_sample.png") + + >>> # Onestep sampling, class-conditional image generation + >>> # ImageNet-64 class label 145 corresponds to king penguins + >>> image = pipe(num_inference_steps=1, class_labels=145).images[0] + >>> image.save("cd_imagenet64_l2_onestep_sample_penguin.png") + + >>> # Multistep sampling, class-conditional image generation + >>> # Timesteps can be explicitly specified; the particular timesteps below are from the original Github repo: + >>> # https://github.com/openai/consistency_models/blob/main/scripts/launch.sh#L77 + >>> image = pipe(num_inference_steps=None, timesteps=[22, 0], class_labels=145).images[0] + >>> image.save("cd_imagenet64_l2_multistep_sample_penguin.png") + ``` +""" + + +class ConsistencyModelPipeline(DiffusionPipeline): + r""" + Pipeline for consistency models for unconditional or class-conditional image generation, as introduced in [1]. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + [1] Song, Yang and Dhariwal, Prafulla and Chen, Mark and Sutskever, Ilya. "Consistency Models" + https://arxiv.org/pdf/2303.01469 + + Args: + unet ([`UNet2DModel`]): + Unconditional or class-conditional U-Net architecture to denoise image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the image latents. Currently only compatible + with [`CMStochasticIterativeScheduler`]. + """ + + def __init__(self, unet: UNet2DModel, scheduler: CMStochasticIterativeScheduler) -> None: + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + ) + + self.safety_checker = None + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet]: + cpu_offload(cpu_offloaded_model, device) + + if self.safety_checker is not None: + cpu_offload(self.safety_checker, execution_device=device, offload_buffers=True) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def prepare_latents(self, batch_size, num_channels, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels, height, width) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device=device, dtype=dtype) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Follows diffusers.VaeImageProcessor.postprocess + def postprocess_image(self, sample: torch.FloatTensor, output_type: str = "pil"): + if output_type not in ["pt", "np", "pil"]: + raise ValueError( + f"output_type={output_type} is not supported. 
Make sure to choose one of ['pt', 'np', or 'pil']" + ) + + # Equivalent to diffusers.VaeImageProcessor.denormalize + sample = (sample / 2 + 0.5).clamp(0, 1) + if output_type == "pt": + return sample + + # Equivalent to diffusers.VaeImageProcessor.pt_to_numpy + sample = sample.cpu().permute(0, 2, 3, 1).numpy() + if output_type == "np": + return sample + + # Output_type must be 'pil' + sample = self.numpy_to_pil(sample) + return sample + + def prepare_class_labels(self, batch_size, device, class_labels=None): + if self.unet.config.num_class_embeds is not None: + if isinstance(class_labels, list): + class_labels = torch.tensor(class_labels, dtype=torch.int) + elif isinstance(class_labels, int): + assert batch_size == 1, "Batch size must be 1 if class_labels is an int" + class_labels = torch.tensor([class_labels], dtype=torch.int) + elif class_labels is None: + # Randomly generate batch_size class labels + # TODO: should use generator here? int analogue of randn_tensor is not exposed in ...utils + class_labels = torch.randint(0, self.unet.config.num_class_embeds, size=(batch_size,)) + class_labels = class_labels.to(device) + else: + class_labels = None + return class_labels + + def check_inputs(self, num_inference_steps, timesteps, latents, batch_size, img_size, callback_steps): + if num_inference_steps is None and timesteps is None: + raise ValueError("Exactly one of `num_inference_steps` or `timesteps` must be supplied.") + + if num_inference_steps is not None and timesteps is not None: + logger.warning( + f"Both `num_inference_steps`: {num_inference_steps} and `timesteps`: {timesteps} are supplied;" + " `timesteps` will be used over `num_inference_steps`." + ) + + if latents is not None: + expected_shape = (batch_size, 3, img_size, img_size) + if latents.shape != expected_shape: + raise ValueError(f"The shape of latents is {latents.shape} but is expected to be {expected_shape}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + batch_size: int = 1, + class_labels: Optional[Union[torch.Tensor, List[int], int]] = None, + num_inference_steps: int = 1, + timesteps: List[int] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + ): + r""" + Args: + batch_size (`int`, *optional*, defaults to 1): + The number of images to generate. + class_labels (`torch.Tensor` or `List[int]` or `int`, *optional*): + Optional class labels for conditioning class-conditional consistency models. Will not be used if the + model is not class-conditional. + num_inference_steps (`int`, *optional*, defaults to 1): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process. If not defined, equally spaced `num_inference_steps` + timesteps are used. Must be in descending order.
+ generator (`torch.Generator`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.utils.ImagePipelineOutput`] if `return_dict` is + True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + # 0. Prepare call parameters + img_size = self.unet.config.sample_size + device = self._execution_device + + # 1. Check inputs + self.check_inputs(num_inference_steps, timesteps, latents, batch_size, img_size, callback_steps) + + # 2. Prepare image latents + # Sample image latents x_0 ~ N(0, sigma_0^2 * I) + sample = self.prepare_latents( + batch_size=batch_size, + num_channels=self.unet.config.in_channels, + height=img_size, + width=img_size, + dtype=self.unet.dtype, + device=device, + generator=generator, + latents=latents, + ) + + # 3. Handle class_labels for class-conditional models + class_labels = self.prepare_class_labels(batch_size, device, class_labels=class_labels) + + # 4. Prepare timesteps + if timesteps is not None: + self.scheduler.set_timesteps(timesteps=timesteps, device=device) + timesteps = self.scheduler.timesteps + num_inference_steps = len(timesteps) + else: + self.scheduler.set_timesteps(num_inference_steps) + timesteps = self.scheduler.timesteps + + # 5. Denoising loop + # Multistep sampling: implements Algorithm 1 in the paper + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + scaled_sample = self.scheduler.scale_model_input(sample, t) + model_output = self.unet(scaled_sample, t, class_labels=class_labels, return_dict=False)[0] + + sample = self.scheduler.step(model_output, t, sample, generator=generator)[0] + + # call the callback, if provided + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, sample) + + # 6.
Post-process image sample + image = self.postprocess_image(sample, output_type=output_type) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py index 935759bbb6af..0a07ce4baed2 100644 --- a/src/diffusers/schedulers/__init__.py +++ b/src/diffusers/schedulers/__init__.py @@ -28,6 +28,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_pt_objects import * # noqa F403 else: + from .scheduling_consistency_models import CMStochasticIterativeScheduler from .scheduling_ddim import DDIMScheduler from .scheduling_ddim_inverse import DDIMInverseScheduler from .scheduling_ddim_parallel import DDIMParallelScheduler diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py new file mode 100644 index 000000000000..fb296054d65b --- /dev/null +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -0,0 +1,380 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from ..configuration_utils import ConfigMixin, register_to_config +from ..utils import BaseOutput, logging, randn_tensor +from .scheduling_utils import SchedulerMixin + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class CMStochasticIterativeSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's step function output. + + Args: + prev_sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample (x_{t-1}) of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.FloatTensor + + +class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin): + """ + Multistep and onestep sampling for consistency models from Song et al. 2023 [1]. This implements Algorithm 1 in the + paper [1]. + + [1] Song, Yang and Dhariwal, Prafulla and Chen, Mark and Sutskever, Ilya. "Consistency Models" + https://arxiv.org/pdf/2303.01469 [2] Karras, Tero, et al. "Elucidating the Design Space of Diffusion-Based + Generative Models." https://arxiv.org/abs/2206.00364 + + [`~ConfigMixin`] takes care of storing all config attributes that are passed in the scheduler's `__init__` + function, such as `num_train_timesteps`. They can be accessed via `scheduler.config.num_train_timesteps`. + [`SchedulerMixin`] provides general loading and saving functionality via the [`SchedulerMixin.save_pretrained`] and + [`~SchedulerMixin.from_pretrained`] functions. + + Args: + num_train_timesteps (`int`): number of diffusion steps used to train the model. 
+ sigma_min (`float`): + Minimum noise magnitude in the sigma schedule. This was set to 0.002 in the original implementation. + sigma_max (`float`): + Maximum noise magnitude in the sigma schedule. This was set to 80.0 in the original implementation. + sigma_data (`float`): + The standard deviation of the data distribution, following the EDM paper [2]. This was set to 0.5 in the + original implementation, which is also the original value suggested in the EDM paper. + s_noise (`float`): + The amount of additional noise to counteract loss of detail during sampling. A reasonable range is [1.000, + 1.011]. This was set to 1.0 in the original implementation. + rho (`float`): + The rho parameter used for calculating the Karras sigma schedule, introduced in the EDM paper [2]. This was + set to 7.0 in the original implementation, which is also the original value suggested in the EDM paper. + clip_denoised (`bool`): + Whether to clip the denoised outputs to `(-1, 1)`. Defaults to `True`. + timesteps (`List` or `np.ndarray` or `torch.Tensor`, *optional*): + Optionally, an explicit timestep schedule can be specified. The timesteps are expected to be in increasing + order. + """ + + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 40, + sigma_min: float = 0.002, + sigma_max: float = 80.0, + sigma_data: float = 0.5, + s_noise: float = 1.0, + rho: float = 7.0, + clip_denoised: bool = True, + ): + # standard deviation of the initial noise distribution + self.init_noise_sigma = sigma_max + + ramp = np.linspace(0, 1, num_train_timesteps) + sigmas = self._convert_to_karras(ramp) + timesteps = self.sigma_to_t(sigmas) + + # setable values + self.num_inference_steps = None + self.sigmas = torch.from_numpy(sigmas) + self.timesteps = torch.from_numpy(timesteps) + self.custom_timesteps = False + self.is_scale_input_called = False + + def index_for_timestep(self, timestep, schedule_timesteps=None): + if schedule_timesteps is None: + schedule_timesteps = self.timesteps + + indices = (schedule_timesteps == timestep).nonzero() + return indices.item() + + def scale_model_input( + self, sample: torch.FloatTensor, timestep: Union[float, torch.FloatTensor] + ) -> torch.FloatTensor: + """ + Scales the consistency model input by `(sigma**2 + sigma_data**2) ** 0.5`, following the EDM model. + + Args: + sample (`torch.FloatTensor`): input sample + timestep (`float` or `torch.FloatTensor`): the current timestep in the diffusion chain + Returns: + `torch.FloatTensor`: scaled input sample + """ + # Get sigma corresponding to timestep + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + step_idx = self.index_for_timestep(timestep) + sigma = self.sigmas[step_idx] + + sample = sample / ((sigma**2 + self.config.sigma_data**2) ** 0.5) + + self.is_scale_input_called = True + return sample + + def sigma_to_t(self, sigmas: Union[float, np.ndarray]): + """ + Gets scaled timesteps from the Karras sigmas, for input to the consistency model. 
+ + Args: + sigmas (`float` or `np.ndarray`): single Karras sigma or array of Karras sigmas + Returns: + `float` or `np.ndarray`: scaled input timestep or scaled input timestep array + """ + if not isinstance(sigmas, np.ndarray): + sigmas = np.array(sigmas, dtype=np.float64) + + timesteps = 1000 * 0.25 * np.log(sigmas + 1e-44) + + return timesteps + + def set_timesteps( + self, + num_inference_steps: Optional[int] = None, + device: Union[str, torch.device] = None, + timesteps: Optional[List[int]] = None, + ): + """ + Sets the timesteps used for the diffusion chain. Supporting function to be run before inference. + + Args: + num_inference_steps (`int`): + the number of diffusion steps used when generating samples with a pre-trained model. + device (`str` or `torch.device`, optional): + the device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, optional): + custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of equal spacing between timesteps is used. If passed, `num_inference_steps` + must be `None`. + """ + if num_inference_steps is None and timesteps is None: + raise ValueError("Exactly one of `num_inference_steps` or `timesteps` must be supplied.") + + if num_inference_steps is not None and timesteps is not None: + raise ValueError("Can only pass one of `num_inference_steps` or `timesteps`.") + + # Follow DDPMScheduler custom timesteps logic + if timesteps is not None: + for i in range(1, len(timesteps)): + if timesteps[i] >= timesteps[i - 1]: + raise ValueError("`timesteps` must be in descending order.") + + if timesteps[0] >= self.config.num_train_timesteps: + raise ValueError( + f"`timesteps` must start before `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps}." + ) + + timesteps = np.array(timesteps, dtype=np.int64) + self.custom_timesteps = True + else: + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + self.custom_timesteps = False + + # Map timesteps to Karras sigmas directly for multistep sampling + # See https://github.com/openai/consistency_models/blob/main/cm/karras_diffusion.py#L675 + num_train_timesteps = self.config.num_train_timesteps + ramp = timesteps[::-1].copy() + ramp = ramp / (num_train_timesteps - 1) + sigmas = self._convert_to_karras(ramp) + timesteps = self.sigma_to_t(sigmas) + + sigmas = np.concatenate([sigmas, [self.sigma_min]]).astype(np.float32) + self.sigmas = torch.from_numpy(sigmas).to(device=device) + + if str(device).startswith("mps"): + # mps does not support float64 + self.timesteps = torch.from_numpy(timesteps).to(device, dtype=torch.float32) + else: + self.timesteps = torch.from_numpy(timesteps).to(device=device) + + # Modified _convert_to_karras implementation that takes in ramp as argument + def _convert_to_karras(self, ramp): + """Constructs the noise schedule of Karras et al. 
(2022).""" + + sigma_min: float = self.config.sigma_min + sigma_max: float = self.config.sigma_max + + rho = self.config.rho + min_inv_rho = sigma_min ** (1 / rho) + max_inv_rho = sigma_max ** (1 / rho) + sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho + return sigmas + + def get_scalings(self, sigma): + sigma_data = self.config.sigma_data + + c_skip = sigma_data**2 / (sigma**2 + sigma_data**2) + c_out = sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + return c_skip, c_out + + def get_scalings_for_boundary_condition(self, sigma): + """ + Gets the scalings used in the consistency model parameterization, following Appendix C of the original paper. + This enforces the consistency model boundary condition. + + Note that `epsilon` in the equations for c_skip and c_out is set to sigma_min. + + Args: + sigma (`torch.FloatTensor`): + The current sigma in the Karras sigma schedule. + Returns: + `tuple`: + A two-element tuple where c_skip (which weights the current sample) is the first element and c_out + (which weights the consistency model output) is the second element. + """ + sigma_min = self.config.sigma_min + sigma_data = self.config.sigma_data + + c_skip = sigma_data**2 / ((sigma - sigma_min) ** 2 + sigma_data**2) + c_out = (sigma - sigma_min) * sigma_data / (sigma**2 + sigma_data**2) ** 0.5 + return c_skip, c_out + + def step( + self, + model_output: torch.FloatTensor, + timestep: Union[float, torch.FloatTensor], + sample: torch.FloatTensor, + generator: Optional[torch.Generator] = None, + return_dict: bool = True, + ) -> Union[CMStochasticIterativeSchedulerOutput, Tuple]: + """ + Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.FloatTensor`): direct output from learned diffusion model. + timestep (`float`): current timestep in the diffusion chain. + sample (`torch.FloatTensor`): + current instance of sample being created by diffusion process. + generator (`torch.Generator`, *optional*): Random number generator. + return_dict (`bool`): option for returning tuple rather than EulerDiscreteSchedulerOutput class + Returns: + [`~schedulers.scheduling_utils.CMStochasticIterativeSchedulerOutput`] or `tuple`: + [`~schedulers.scheduling_utils.CMStochasticIterativeSchedulerOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is the sample tensor. + """ + + if ( + isinstance(timestep, int) + or isinstance(timestep, torch.IntTensor) + or isinstance(timestep, torch.LongTensor) + ): + raise ValueError( + ( + "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to" + f" `{self.__class__}.step()` is not supported. Make sure to pass" + " one of the `scheduler.timesteps` as a timestep." + ), + ) + + if not self.is_scale_input_called: + logger.warning( + "The `scale_model_input` function should be called before `step` to ensure correct denoising. " + "See `StableDiffusionPipeline` for a usage example." 
+ ) + + if isinstance(timestep, torch.Tensor): + timestep = timestep.to(self.timesteps.device) + + sigma_min = self.config.sigma_min + sigma_max = self.config.sigma_max + + step_index = self.index_for_timestep(timestep) + + # sigma_next corresponds to next_t in original implementation + sigma = self.sigmas[step_index] + if step_index + 1 < self.config.num_train_timesteps: + sigma_next = self.sigmas[step_index + 1] + else: + # Set sigma_next to sigma_min + sigma_next = self.sigmas[-1] + + # Get scalings for boundary conditions + c_skip, c_out = self.get_scalings_for_boundary_condition(sigma) + + # 1. Denoise model output using boundary conditions + denoised = c_out * model_output + c_skip * sample + if self.config.clip_denoised: + denoised = denoised.clamp(-1, 1) + + # 2. Sample z ~ N(0, s_noise^2 * I) + # Noise is not used for onestep sampling. + if len(self.timesteps) > 1: + noise = randn_tensor( + model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator + ) + else: + noise = torch.zeros_like(model_output) + z = noise * self.config.s_noise + + sigma_hat = sigma_next.clamp(min=sigma_min, max=sigma_max) + + # 3. Return noisy sample + # tau = sigma_hat, eps = sigma_min + prev_sample = denoised + z * (sigma_hat**2 - sigma_min**2) ** 0.5 + + if not return_dict: + return (prev_sample,) + + return CMStochasticIterativeSchedulerOutput(prev_sample=prev_sample) + + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.FloatTensor, + ) -> torch.FloatTensor: + # Make sure sigmas and timesteps have the same device and dtype as original_samples + sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype) + if original_samples.device.type == "mps" and torch.is_floating_point(timesteps): + # mps does not support float64 + schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32) + timesteps = timesteps.to(original_samples.device, dtype=torch.float32) + else: + schedule_timesteps = self.timesteps.to(original_samples.device) + timesteps = timesteps.to(original_samples.device) + + step_indices = [(schedule_timesteps == t).nonzero().item() for t in timesteps] + + sigma = sigmas[step_indices].flatten() + while len(sigma.shape) < len(original_samples.shape): + sigma = sigma.unsqueeze(-1) + + noisy_samples = original_samples + noise * sigma + return noisy_samples + + def __len__(self): + return self.config.num_train_timesteps diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py index 7a13bc89e883..20dbf84681d3 100644 --- a/src/diffusers/utils/dummy_pt_objects.py +++ b/src/diffusers/utils/dummy_pt_objects.py @@ -210,6 +210,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class ConsistencyModelPipeline(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class DanceDiffusionPipeline(metaclass=DummyObject): _backends = ["torch"] @@ -390,6 +405,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class CMStochasticIterativeScheduler(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, 
*args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + class DDIMInverseScheduler(metaclass=DummyObject): _backends = ["torch"] diff --git a/tests/pipelines/consistency_models/__init__.py b/tests/pipelines/consistency_models/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/consistency_models/test_consistency_models.py b/tests/pipelines/consistency_models/test_consistency_models.py new file mode 100644 index 000000000000..8dce90318505 --- /dev/null +++ b/tests/pipelines/consistency_models/test_consistency_models.py @@ -0,0 +1,288 @@ +import gc +import unittest + +import numpy as np +import torch +from torch.backends.cuda import sdp_kernel + +from diffusers import ( + CMStochasticIterativeScheduler, + ConsistencyModelPipeline, + UNet2DModel, +) +from diffusers.utils import randn_tensor, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_2, require_torch_gpu + +from ..pipeline_params import UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS, UNCONDITIONAL_IMAGE_GENERATION_PARAMS +from ..test_pipelines_common import PipelineTesterMixin + + +enable_full_determinism() + + +class ConsistencyModelPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = ConsistencyModelPipeline + params = UNCONDITIONAL_IMAGE_GENERATION_PARAMS + batch_params = UNCONDITIONAL_IMAGE_GENERATION_BATCH_PARAMS + + # Override required_optional_params to remove num_images_per_prompt + required_optional_params = frozenset( + [ + "num_inference_steps", + "generator", + "latents", + "output_type", + "return_dict", + "callback", + "callback_steps", + ] + ) + + @property + def dummy_uncond_unet(self): + unet = UNet2DModel.from_pretrained( + "diffusers/consistency-models-test", + subfolder="test_unet", + ) + return unet + + @property + def dummy_cond_unet(self): + unet = UNet2DModel.from_pretrained( + "diffusers/consistency-models-test", + subfolder="test_unet_class_cond", + ) + return unet + + def get_dummy_components(self, class_cond=False): + if class_cond: + unet = self.dummy_cond_unet + else: + unet = self.dummy_uncond_unet + + # Default to CM multistep sampler + scheduler = CMStochasticIterativeScheduler( + num_train_timesteps=40, + sigma_min=0.002, + sigma_max=80.0, + ) + + components = { + "unet": unet, + "scheduler": scheduler, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + inputs = { + "batch_size": 1, + "num_inference_steps": None, + "timesteps": [22, 0], + "generator": generator, + "output_type": "np", + } + + return inputs + + def test_consistency_model_pipeline_multistep(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + pipe = ConsistencyModelPipeline(**components) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = pipe(**inputs).images + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array([0.3572, 0.6273, 0.4031, 0.3961, 0.4321, 0.5730, 0.5266, 0.4780, 0.5004]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def 
test_consistency_model_pipeline_multistep_class_cond(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components(class_cond=True) + pipe = ConsistencyModelPipeline(**components) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["class_labels"] = 0 + image = pipe(**inputs).images + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array([0.3572, 0.6273, 0.4031, 0.3961, 0.4321, 0.5730, 0.5266, 0.4780, 0.5004]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def test_consistency_model_pipeline_onestep(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + pipe = ConsistencyModelPipeline(**components) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["num_inference_steps"] = 1 + inputs["timesteps"] = None + image = pipe(**inputs).images + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array([0.5004, 0.5004, 0.4994, 0.5008, 0.4976, 0.5018, 0.4990, 0.4982, 0.4987]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def test_consistency_model_pipeline_onestep_class_cond(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components(class_cond=True) + pipe = ConsistencyModelPipeline(**components) + pipe = pipe.to(device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["num_inference_steps"] = 1 + inputs["timesteps"] = None + inputs["class_labels"] = 0 + image = pipe(**inputs).images + assert image.shape == (1, 32, 32, 3) + + image_slice = image[0, -3:, -3:, -1] + expected_slice = np.array([0.5004, 0.5004, 0.4994, 0.5008, 0.4976, 0.5018, 0.4990, 0.4982, 0.4987]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + +@slow +@require_torch_gpu +class ConsistencyModelPipelineSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, seed=0, get_fixed_latents=False, device="cpu", dtype=torch.float32, shape=(1, 3, 64, 64)): + generator = torch.manual_seed(seed) + + inputs = { + "num_inference_steps": None, + "timesteps": [22, 0], + "class_labels": 0, + "generator": generator, + "output_type": "np", + } + + if get_fixed_latents: + latents = self.get_fixed_latents(seed=seed, device=device, dtype=dtype, shape=shape) + inputs["latents"] = latents + + return inputs + + def get_fixed_latents(self, seed=0, device="cpu", dtype=torch.float32, shape=(1, 3, 64, 64)): + if type(device) == str: + device = torch.device(device) + generator = torch.Generator(device=device).manual_seed(seed) + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + return latents + + def test_consistency_model_cd_multistep(self): + unet = UNet2DModel.from_pretrained("diffusers/consistency_models", subfolder="diffusers_cd_imagenet64_l2") + scheduler = CMStochasticIterativeScheduler( + num_train_timesteps=40, + sigma_min=0.002, + sigma_max=80.0, + ) + pipe = ConsistencyModelPipeline(unet=unet, scheduler=scheduler) + pipe.to(torch_device=torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs() + image = pipe(**inputs).images + assert 
image.shape == (1, 64, 64, 3) + + image_slice = image[0, -3:, -3:, -1] + + expected_slice = np.array([0.0888, 0.0881, 0.0666, 0.0479, 0.0292, 0.0195, 0.0201, 0.0163, 0.0254]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 + + def test_consistency_model_cd_onestep(self): + unet = UNet2DModel.from_pretrained("diffusers/consistency_models", subfolder="diffusers_cd_imagenet64_l2") + scheduler = CMStochasticIterativeScheduler( + num_train_timesteps=40, + sigma_min=0.002, + sigma_max=80.0, + ) + pipe = ConsistencyModelPipeline(unet=unet, scheduler=scheduler) + pipe.to(torch_device=torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs() + inputs["num_inference_steps"] = 1 + inputs["timesteps"] = None + image = pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + + image_slice = image[0, -3:, -3:, -1] + + expected_slice = np.array([0.0340, 0.0152, 0.0063, 0.0267, 0.0221, 0.0107, 0.0416, 0.0186, 0.0217]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2 + + @require_torch_2 + def test_consistency_model_cd_multistep_flash_attn(self): + unet = UNet2DModel.from_pretrained("diffusers/consistency_models", subfolder="diffusers_cd_imagenet64_l2") + scheduler = CMStochasticIterativeScheduler( + num_train_timesteps=40, + sigma_min=0.002, + sigma_max=80.0, + ) + pipe = ConsistencyModelPipeline(unet=unet, scheduler=scheduler) + pipe.to(torch_device=torch_device, torch_dtype=torch.float16) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(get_fixed_latents=True, device=torch_device) + # Ensure usage of flash attention in torch 2.0 + with sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + image = pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + + image_slice = image[0, -3:, -3:, -1] + + expected_slice = np.array([0.1875, 0.1428, 0.1289, 0.2151, 0.2092, 0.1477, 0.1877, 0.1641, 0.1353]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + @require_torch_2 + def test_consistency_model_cd_onestep_flash_attn(self): + unet = UNet2DModel.from_pretrained("diffusers/consistency_models", subfolder="diffusers_cd_imagenet64_l2") + scheduler = CMStochasticIterativeScheduler( + num_train_timesteps=40, + sigma_min=0.002, + sigma_max=80.0, + ) + pipe = ConsistencyModelPipeline(unet=unet, scheduler=scheduler) + pipe.to(torch_device=torch_device, torch_dtype=torch.float16) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(get_fixed_latents=True, device=torch_device) + inputs["num_inference_steps"] = 1 + inputs["timesteps"] = None + # Ensure usage of flash attention in torch 2.0 + with sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + image = pipe(**inputs).images + assert image.shape == (1, 64, 64, 3) + + image_slice = image[0, -3:, -3:, -1] + + expected_slice = np.array([0.1663, 0.1948, 0.2275, 0.1680, 0.1204, 0.1245, 0.1858, 0.1338, 0.2095]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 diff --git a/tests/schedulers/test_scheduler_consistency_model.py b/tests/schedulers/test_scheduler_consistency_model.py new file mode 100644 index 000000000000..66f07d024783 --- /dev/null +++ b/tests/schedulers/test_scheduler_consistency_model.py @@ -0,0 +1,150 @@ +import torch + +from diffusers import CMStochasticIterativeScheduler + +from .test_schedulers import SchedulerCommonTest + + +class CMStochasticIterativeSchedulerTest(SchedulerCommonTest): + scheduler_classes = 
(CMStochasticIterativeScheduler,) + num_inference_steps = 10 + + def get_scheduler_config(self, **kwargs): + config = { + "num_train_timesteps": 201, + "sigma_min": 0.002, + "sigma_max": 80.0, + } + + config.update(**kwargs) + return config + + # Override test_step_shape to add CMStochasticIterativeScheduler-specific logic regarding timesteps + # Problem is that we don't know two timesteps that will always be in the timestep schedule from only the scheduler + # config; scaled sigma_max is always in the timestep schedule, but sigma_min is in the sigma schedule while scaled + # sigma_min is not in the timestep schedule + def test_step_shape(self): + num_inference_steps = 10 + + scheduler_config = self.get_scheduler_config() + scheduler = self.scheduler_classes[0](**scheduler_config) + + scheduler.set_timesteps(num_inference_steps) + + timestep_0 = scheduler.timesteps[0] + timestep_1 = scheduler.timesteps[1] + + sample = self.dummy_sample + residual = 0.1 * sample + + output_0 = scheduler.step(residual, timestep_0, sample).prev_sample + output_1 = scheduler.step(residual, timestep_1, sample).prev_sample + + self.assertEqual(output_0.shape, sample.shape) + self.assertEqual(output_0.shape, output_1.shape) + + def test_timesteps(self): + for timesteps in [10, 50, 100, 1000]: + self.check_over_configs(num_train_timesteps=timesteps) + + def test_clip_denoised(self): + for clip_denoised in [True, False]: + self.check_over_configs(clip_denoised=clip_denoised) + + def test_full_loop_no_noise_onestep(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + num_inference_steps = 1 + scheduler.set_timesteps(num_inference_steps) + timesteps = scheduler.timesteps + + generator = torch.manual_seed(0) + + model = self.dummy_model() + sample = self.dummy_sample_deter * scheduler.init_noise_sigma + + for i, t in enumerate(timesteps): + # 1. scale model input + scaled_sample = scheduler.scale_model_input(sample, t) + + # 2. predict noise residual + residual = model(scaled_sample, t) + + # 3. predict previous sample x_t-1 + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample + + sample = pred_prev_sample + + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_sum.item() - 192.7614) < 1e-2 + assert abs(result_mean.item() - 0.2510) < 1e-3 + + def test_full_loop_no_noise_multistep(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [106, 0] + scheduler.set_timesteps(timesteps=timesteps) + timesteps = scheduler.timesteps + + generator = torch.manual_seed(0) + + model = self.dummy_model() + sample = self.dummy_sample_deter * scheduler.init_noise_sigma + + for t in timesteps: + # 1. scale model input + scaled_sample = scheduler.scale_model_input(sample, t) + + # 2. predict noise residual + residual = model(scaled_sample, t) + + # 3. 
predict previous sample x_t-1 + pred_prev_sample = scheduler.step(residual, t, sample, generator=generator).prev_sample + + sample = pred_prev_sample + + result_sum = torch.sum(torch.abs(sample)) + result_mean = torch.mean(torch.abs(sample)) + + assert abs(result_sum.item() - 347.6357) < 1e-2 + assert abs(result_mean.item() - 0.4527) < 1e-3 + + def test_custom_timesteps_increasing_order(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [39, 30, 12, 15, 0] + + with self.assertRaises(ValueError, msg="`timesteps` must be in descending order."): + scheduler.set_timesteps(timesteps=timesteps) + + def test_custom_timesteps_passing_both_num_inference_steps_and_timesteps(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [39, 30, 12, 1, 0] + num_inference_steps = len(timesteps) + + with self.assertRaises(ValueError, msg="Can only pass one of `num_inference_steps` or `timesteps`."): + scheduler.set_timesteps(num_inference_steps=num_inference_steps, timesteps=timesteps) + + def test_custom_timesteps_too_large(self): + scheduler_class = self.scheduler_classes[0] + scheduler_config = self.get_scheduler_config() + scheduler = scheduler_class(**scheduler_config) + + timesteps = [scheduler.config.num_train_timesteps] + + with self.assertRaises( + ValueError, + msg="`timesteps` must start before `self.config.train_timesteps`: {scheduler.config.num_train_timesteps}}", + ): + scheduler.set_timesteps(timesteps=timesteps) diff --git a/tests/schedulers/test_schedulers.py b/tests/schedulers/test_schedulers.py index d1ae333c0cd2..d9423d621966 100755 --- a/tests/schedulers/test_schedulers.py +++ b/tests/schedulers/test_schedulers.py @@ -24,6 +24,7 @@ import diffusers from diffusers import ( + CMStochasticIterativeScheduler, DDIMScheduler, DEISMultistepScheduler, DiffusionPipeline, @@ -303,6 +304,11 @@ def check_over_configs(self, time_step=0, **config): scheduler_config = self.get_scheduler_config(**config) scheduler = scheduler_class(**scheduler_config) + if scheduler_class == CMStochasticIterativeScheduler: + # Get valid timestep based on sigma_max, which should always be in timestep schedule. + scaled_sigma_max = scheduler.sigma_to_t(scheduler.config.sigma_max) + time_step = scaled_sigma_max + if scheduler_class == VQDiffusionScheduler: num_vec_classes = scheduler_config["num_vec_classes"] sample = self.dummy_sample(num_vec_classes) @@ -323,7 +329,11 @@ def check_over_configs(self, time_step=0, **config): kwargs["num_inference_steps"] = num_inference_steps # Make sure `scale_model_input` is invoked to prevent a warning - if scheduler_class != VQDiffusionScheduler: + if scheduler_class == CMStochasticIterativeScheduler: + # Get valid timestep based on sigma_max, which should always be in timestep schedule. + _ = scheduler.scale_model_input(sample, scaled_sigma_max) + _ = new_scheduler.scale_model_input(sample, scaled_sigma_max) + elif scheduler_class != VQDiffusionScheduler: _ = scheduler.scale_model_input(sample, 0) _ = new_scheduler.scale_model_input(sample, 0) @@ -393,6 +403,10 @@ def test_from_save_pretrained(self): scheduler_config = self.get_scheduler_config() scheduler = scheduler_class(**scheduler_config) + if scheduler_class == CMStochasticIterativeScheduler: + # Get valid timestep based on sigma_max, which should always be in timestep schedule. 
+ timestep = scheduler.sigma_to_t(scheduler.config.sigma_max) + if scheduler_class == VQDiffusionScheduler: num_vec_classes = scheduler_config["num_vec_classes"] sample = self.dummy_sample(num_vec_classes) @@ -539,6 +553,10 @@ def recursive_check(tuple_object, dict_object): scheduler_config = self.get_scheduler_config() scheduler = scheduler_class(**scheduler_config) + if scheduler_class == CMStochasticIterativeScheduler: + # Get valid timestep based on sigma_max, which should always be in timestep schedule. + timestep = scheduler.sigma_to_t(scheduler.config.sigma_max) + if scheduler_class == VQDiffusionScheduler: num_vec_classes = scheduler_config["num_vec_classes"] sample = self.dummy_sample(num_vec_classes) @@ -594,7 +612,12 @@ def test_scheduler_public_api(self): if scheduler_class != VQDiffusionScheduler: sample = self.dummy_sample - scaled_sample = scheduler.scale_model_input(sample, 0.0) + if scheduler_class == CMStochasticIterativeScheduler: + # Get valid timestep based on sigma_max, which should always be in timestep schedule. + scaled_sigma_max = scheduler.sigma_to_t(scheduler.config.sigma_max) + scaled_sample = scheduler.scale_model_input(sample, scaled_sigma_max) + else: + scaled_sample = scheduler.scale_model_input(sample, 0.0) self.assertEqual(sample.shape, scaled_sample.shape) def test_add_noise_device(self): @@ -606,7 +629,12 @@ def test_add_noise_device(self): scheduler.set_timesteps(100) sample = self.dummy_sample.to(torch_device) - scaled_sample = scheduler.scale_model_input(sample, 0.0) + if scheduler_class == CMStochasticIterativeScheduler: + # Get valid timestep based on sigma_max, which should always be in timestep schedule. + scaled_sigma_max = scheduler.sigma_to_t(scheduler.config.sigma_max) + scaled_sample = scheduler.scale_model_input(sample, scaled_sigma_max) + else: + scaled_sample = scheduler.scale_model_input(sample, 0.0) self.assertEqual(sample.shape, scaled_sample.shape) noise = torch.randn_like(scaled_sample).to(torch_device) @@ -637,7 +665,7 @@ def test_deprecated_kwargs(self): def test_trained_betas(self): for scheduler_class in self.scheduler_classes: - if scheduler_class == VQDiffusionScheduler: + if scheduler_class in (VQDiffusionScheduler, CMStochasticIterativeScheduler): continue scheduler_config = self.get_scheduler_config() From 41ea88f38c8508e3d6cd33929ef666a5ea905715 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 6 Jul 2023 10:55:24 +0530 Subject: [PATCH 179/199] Update consistency_models.mdx (#3961) --- docs/source/en/api/pipelines/consistency_models.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/consistency_models.mdx b/docs/source/en/api/pipelines/consistency_models.mdx index 715743b87a12..f6ec212789a0 100644 --- a/docs/source/en/api/pipelines/consistency_models.mdx +++ b/docs/source/en/api/pipelines/consistency_models.mdx @@ -28,7 +28,7 @@ Available Checkpoints are: |:---:|:---:|:---:|:---:| | [ConsistencyModelPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/pipeline_consistency_models.py) | *Unconditional Image Generation* | | | -This pipeline was contributed by our community members [dg845](https://github.com/dg845) and [ayushtues](https://huggingface.co/ayushtues) :heart: +This pipeline was contributed by our community members [dg845](https://github.com/dg845) and [ayushtues](https://huggingface.co/ayushtues) ❤️ ## Usage Example From de1426119d061a0c683f79ec253330aa0130d898 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Thu, 6 Jul 2023 00:42:41 
-0700 Subject: [PATCH 180/199] Make `UNet2DConditionOutput` pickle-able (#3857) * add default to unet output to prevent it from being a required arg * add unit test * make style * adjust unit test * mark as fast test * adjust assert statement in test --------- Co-authored-by: Prathik Rao Co-authored-by: root --- src/diffusers/models/unet_2d_condition.py | 2 +- tests/models/test_models_unet_2d_condition.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 868511ef6625..cd62a494d5aa 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -57,7 +57,7 @@ class UNet2DConditionOutput(BaseOutput): The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model. """ - sample: torch.FloatTensor + sample: torch.FloatTensor = None class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin): diff --git a/tests/models/test_models_unet_2d_condition.py b/tests/models/test_models_unet_2d_condition.py index 24da508227d2..4eeb1b926bec 100644 --- a/tests/models/test_models_unet_2d_condition.py +++ b/tests/models/test_models_unet_2d_condition.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import gc import os import tempfile @@ -782,6 +783,22 @@ def test_custom_diffusion_xformers_on_off(self): assert (sample - on_sample).abs().max() < 1e-4 assert (sample - off_sample).abs().max() < 1e-4 + def test_pickle(self): + # enable deterministic behavior for gradient checkpointing + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + init_dict["attention_head_dim"] = (8, 16) + + model = self.model_class(**init_dict) + model.to(torch_device) + + with torch.no_grad(): + sample = model(**inputs_dict).sample + + sample_copy = copy.copy(sample) + + assert (sample - sample_copy).abs().max() < 1e-4 + @slow class UNet2DConditionModelIntegrationTests(unittest.TestCase): From 46af98267d668ae9fa61ac8a5e1e4e5c229f58d3 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 6 Jul 2023 14:22:43 +0530 Subject: [PATCH 181/199] [Consistency Models] correct checkpoint url in the doc (#3962) correct checkpoint url. 
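For context on the checkpoint this doc fix points at, here is a minimal usage sketch of the consistency models pipeline and scheduler added earlier in this series. It assumes the `openai/diffusers-cd_imagenet64_l2` repository referenced in the corrected link can be loaded directly with `from_pretrained`, and the device/dtype choices are illustrative only:

    import torch
    from diffusers import ConsistencyModelPipeline

    # Load the distilled ImageNet-64 (L2) consistency model referenced in the doc change below.
    # Assumption: the hub repo is a full pipeline checkpoint loadable via from_pretrained.
    pipe = ConsistencyModelPipeline.from_pretrained(
        "openai/diffusers-cd_imagenet64_l2", torch_dtype=torch.float16
    )
    pipe.to("cuda")

    # Onestep sampling: a single scheduler step.
    image = pipe(num_inference_steps=1, class_labels=0).images[0]

    # Multistep sampling: explicit timesteps, mirroring the [22, 0] schedule used in the tests above.
    image = pipe(num_inference_steps=None, timesteps=[22, 0], class_labels=0).images[0]
    image.save("cd_imagenet64_l2_sample.png")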
--- docs/source/en/api/pipelines/consistency_models.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/consistency_models.mdx b/docs/source/en/api/pipelines/consistency_models.mdx index f6ec212789a0..56ec2e0f3432 100644 --- a/docs/source/en/api/pipelines/consistency_models.mdx +++ b/docs/source/en/api/pipelines/consistency_models.mdx @@ -12,7 +12,7 @@ Resources: * [Original Code](https://github.com/openai/consistency_models) Available Checkpoints are: -- *cd_imagenet64_l2 (64x64 resolution)* [openai/consistency-model-pipelines](https://huggingface.co/openai/consistency-model-pipelines) +- *cd_imagenet64_l2 (64x64 resolution)* [openai/consistency-model-pipelines](https://huggingface.co/openai/diffusers-cd_imagenet64_l2) - *cd_imagenet64_lpips (64x64 resolution)* [openai/diffusers-cd_imagenet64_lpips](https://huggingface.co/openai/diffusers-cd_imagenet64_lpips) - *ct_imagenet64 (64x64 resolution)* [openai/diffusers-ct_imagenet64](https://huggingface.co/openai/diffusers-ct_imagenet64) - *cd_bedroom256_l2 (256x256 resolution)* [openai/diffusers-cd_bedroom256_l2](https://huggingface.co/openai/diffusers-cd_bedroom256_l2) From b62d9a1fdc0910bb864340b7bc29e86f6aa31d47 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 6 Jul 2023 14:30:50 +0530 Subject: [PATCH 182/199] [Text-to-video] Add `torch.compile()` compatibility (#3949) * use sample directly instead of the dataclass. * more usage of directly samples instead of dataclasses * more usage of directly samples instead of dataclasses * use direct sample in the pipeline. * direct usage of sample in the img2img case. --- src/diffusers/models/unet_3d_blocks.py | 21 +++++++++++-------- src/diffusers/models/unet_3d_condition.py | 7 +++++-- .../pipeline_text_to_video_synth.py | 3 ++- .../pipeline_text_to_video_synth_img2img.py | 3 ++- 4 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/diffusers/models/unet_3d_blocks.py b/src/diffusers/models/unet_3d_blocks.py index 73bfa401932f..ab5c393518e2 100644 --- a/src/diffusers/models/unet_3d_blocks.py +++ b/src/diffusers/models/unet_3d_blocks.py @@ -250,10 +250,11 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, - ).sample + return_dict=False, + )[0] hidden_states = temp_attn( - hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs - ).sample + hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs, return_dict=False + )[0] hidden_states = resnet(hidden_states, temb) hidden_states = temp_conv(hidden_states, num_frames=num_frames) @@ -377,10 +378,11 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, - ).sample + return_dict=False, + )[0] hidden_states = temp_attn( - hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs - ).sample + hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs, return_dict=False + )[0] output_states += (hidden_states,) @@ -590,10 +592,11 @@ def forward( hidden_states, encoder_hidden_states=encoder_hidden_states, cross_attention_kwargs=cross_attention_kwargs, - ).sample + return_dict=False, + )[0] hidden_states = temp_attn( - hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs - ).sample + hidden_states, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs, return_dict=False + )[0] if self.upsamplers is not None: for 
upsampler in self.upsamplers: diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py index 9bc89c571c52..ee4d0d7cab98 100644 --- a/src/diffusers/models/unet_3d_condition.py +++ b/src/diffusers/models/unet_3d_condition.py @@ -526,8 +526,11 @@ def forward( sample = self.conv_in(sample) sample = self.transformer_in( - sample, num_frames=num_frames, cross_attention_kwargs=cross_attention_kwargs - ).sample + sample, + num_frames=num_frames, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] # 3. down down_block_res_samples = (sample,) diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py index e30f183808a5..ecc330b5f504 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -648,7 +648,8 @@ def __call__( t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - ).sample + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: diff --git a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py index ce5109a58213..7a4b73cd3c35 100644 --- a/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py +++ b/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth_img2img.py @@ -723,7 +723,8 @@ def __call__( t, encoder_hidden_states=prompt_embeds, cross_attention_kwargs=cross_attention_kwargs, - ).sample + return_dict=False, + )[0] # perform guidance if do_classifier_free_guidance: From bc9a8cef6f258aafcd43ef64ac161218a7eae43a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 13:37:27 +0200 Subject: [PATCH 183/199] [SD-XL] Add new pipelines (#3859) * Add new text encoder * add transformers depth * More * Correct conversion script * Fix more * Fix more * Correct more * correct text encoder * Finish all * proof that in works in run local xl * clean up * Get refiner to work * Add red castle * Fix batch size * Improve pipelines more * Finish text2image tests * Add img2img test * Fix more * fix import * Fix embeddings for classic models (#3888) Fix embeddings for classic SD models. * Allow multiple prompts to be passed to the refiner (#3895) * finish more * Apply suggestions from code review * add watermarker * Model offload (#3889) * Model offload. * Model offload for refiner / img2img * Hardcode encoder offload on img2img vae encode Saves some GPU RAM in img2img / refiner tasks so it remains below 8 GB. --------- Co-authored-by: Patrick von Platen * correct * fix * clean print * Update install warning for `invisible-watermark` * add: missing docstrings. * fix and simplify the usage example in img2img. * fix setup for watermarking. * Revert "fix setup for watermarking." This reverts commit 491bc9f5a640bbf46a97a8e52d6eff7e70eb8e4b. * fix: watermarking setup. * fix: op. * run make fix-copies. 
* make sure tests pass * improve convert * make tests pass * make tests pass * better error message * fiinsh * finish * Fix final test --------- Co-authored-by: Pedro Cuenca Co-authored-by: Sayak Paul --- .github/workflows/build_documentation.yml | 21 +- .github/workflows/build_pr_documentation.yml | 18 +- .github/workflows/pr_tests.yml | 2 +- docker/diffusers-pytorch-cpu/Dockerfile | 4 +- docker/diffusers-pytorch-cuda/Dockerfile | 4 +- .../stable_diffusion/stable_diffusion_xl.mdx | 42 + ..._original_stable_diffusion_to_diffusers.py | 8 + setup.py | 2 + src/diffusers/__init__.py | 9 + src/diffusers/dependency_versions_table.py | 1 + src/diffusers/models/attention_processor.py | 2 + src/diffusers/models/unet_2d_blocks.py | 13 +- src/diffusers/models/unet_2d_condition.py | 46 +- src/diffusers/pipelines/__init__.py | 10 + .../stable_diffusion/convert_from_ckpt.py | 150 ++- .../pipeline_stable_diffusion_upscale.py | 8 +- .../pipelines/stable_diffusion_xl/__init__.py | 31 + .../pipeline_stable_diffusion_xl.py | 788 ++++++++++++++++ .../pipeline_stable_diffusion_xl_img2img.py | 879 ++++++++++++++++++ .../stable_diffusion_xl/watermark.py | 31 + .../versatile_diffusion/modeling_text_unet.py | 57 +- src/diffusers/utils/__init__.py | 1 + ...formers_and_invisible_watermark_objects.py | 32 + src/diffusers/utils/import_utils.py | 23 +- tests/others/test_dependencies.py | 2 + .../pipelines/stable_diffusion_xl/__init__.py | 0 .../test_stable_diffusion_xl.py | 187 ++++ .../test_stable_diffusion_xl_img2img.py | 202 ++++ 28 files changed, 2512 insertions(+), 61 deletions(-) create mode 100644 docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx create mode 100644 src/diffusers/pipelines/stable_diffusion_xl/__init__.py create mode 100644 src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py create mode 100644 src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py create mode 100644 src/diffusers/pipelines/stable_diffusion_xl/watermark.py create mode 100644 src/diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py create mode 100644 tests/pipelines/stable_diffusion_xl/__init__.py create mode 100644 tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py create mode 100644 tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 6fc8d343cd91..79d2cdec0672 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -9,13 +9,20 @@ on: - v*-patch jobs: - build: - uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main - with: - commit_sha: ${{ github.sha }} - package: diffusers - notebook_folder: diffusers_doc - languages: en ko zh + build: + steps: + - name: Install dependencies + run: | + apt-get update && apt-get install libsndfile1-dev libgl1 -y + + - name: Build doc + uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main + with: + commit_sha: ${{ github.sha }} + package: diffusers + notebook_folder: diffusers_doc + languages: en ko zh + secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 85289c1eb75a..248644b7e9cd 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -9,9 +9,15 @@ 
concurrency: jobs: build: - uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main - with: - commit_sha: ${{ github.event.pull_request.head.sha }} - pr_number: ${{ github.event.number }} - package: diffusers - languages: en ko + steps: + - name: Install dependencies + run: | + apt-get update && apt-get install libsndfile1-dev libgl1 -y + + - name: Build doc + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main + with: + commit_sha: ${{ github.event.pull_request.head.sha }} + pr_number: ${{ github.event.number }} + package: diffusers + languages: en ko zh diff --git a/.github/workflows/pr_tests.yml b/.github/workflows/pr_tests.yml index 88c424ee49d3..acfe881188f0 100644 --- a/.github/workflows/pr_tests.yml +++ b/.github/workflows/pr_tests.yml @@ -62,7 +62,7 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev -y + apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] - name: Environment diff --git a/docker/diffusers-pytorch-cpu/Dockerfile b/docker/diffusers-pytorch-cpu/Dockerfile index a70eff4c852b..127c61a719c5 100644 --- a/docker/diffusers-pytorch-cpu/Dockerfile +++ b/docker/diffusers-pytorch-cpu/Dockerfile @@ -14,6 +14,7 @@ RUN apt update && \ libsndfile1-dev \ python3.8 \ python3-pip \ + libgl1 \ python3.8-venv && \ rm -rf /var/lib/apt/lists @@ -27,6 +28,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ torch \ torchvision \ torchaudio \ + invisible_watermark \ --extra-index-url https://download.pytorch.org/whl/cpu && \ python3 -m pip install --no-cache-dir \ accelerate \ @@ -40,4 +42,4 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ tensorboard \ transformers -CMD ["/bin/bash"] \ No newline at end of file +CMD ["/bin/bash"] diff --git a/docker/diffusers-pytorch-cuda/Dockerfile b/docker/diffusers-pytorch-cuda/Dockerfile index 6b56403a6f94..fab3b7082765 100644 --- a/docker/diffusers-pytorch-cuda/Dockerfile +++ b/docker/diffusers-pytorch-cuda/Dockerfile @@ -12,6 +12,7 @@ RUN apt update && \ curl \ ca-certificates \ libsndfile1-dev \ + libgl1 \ python3.8 \ python3-pip \ python3.8-venv && \ @@ -26,7 +27,8 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \ python3 -m pip install --no-cache-dir \ torch \ torchvision \ - torchaudio && \ + torchaudio \ + invisible_watermark && \ python3 -m pip install --no-cache-dir \ accelerate \ datasets \ diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx new file mode 100644 index 000000000000..b87d51af233b --- /dev/null +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx @@ -0,0 +1,42 @@ + + +# Stable diffusion XL + +Stable Diffusion 2 is a text-to-image _latent diffusion_ model built upon the work of [Stable Diffusion 1](https://stability.ai/blog/stable-diffusion-public-release). +The project to train Stable Diffusion 2 was led by Robin Rombach and Katherine Crowson from [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/). + +*The Stable Diffusion 2.0 release includes robust text-to-image models trained using a brand new text encoder (OpenCLIP), developed by LAION with support from Stability AI, which greatly improves the quality of the generated images compared to earlier V1 releases. The text-to-image models in this release can generate images with default resolutions of both 512x512 pixels and 768x768 pixels. 
+These models are trained on an aesthetic subset of the [LAION-5B dataset](https://laion.ai/blog/laion-5b/) created by the DeepFloyd team at Stability AI, which is then further filtered to remove adult content using [LAION’s NSFW filter](https://openreview.net/forum?id=M3Y74vmsMcY).* + +For more details about how Stable Diffusion 2 works and how it differs from Stable Diffusion 1, please refer to the official [launch announcement post](https://stability.ai/blog/stable-diffusion-v2-release). + +## Tips + +### Available checkpoints: + +- *Text-to-Image (1024x1024 resolution)*: [stabilityai/stable-diffusion-xl-base-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9) with [`StableDiffusionXLPipeline`] +- *Image-to-Image / Refiner (1024x1024 resolution)*: [stabilityai/stable-diffusion-xl-refiner-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9) with [`StableDiffusionXLImg2ImgPipeline`] + +TODO + +## StableDiffusionXLPipeline + +[[autodoc]] StableDiffusionXLPipeline + - all + - __call__ + +## StableDiffusionXLImg2ImgPipeline + +[[autodoc]] StableDiffusionXLImg2ImgPipeline + - all + - __call__ diff --git a/scripts/convert_original_stable_diffusion_to_diffusers.py b/scripts/convert_original_stable_diffusion_to_diffusers.py index de64095523b6..376c1e8726de 100644 --- a/scripts/convert_original_stable_diffusion_to_diffusers.py +++ b/scripts/convert_original_stable_diffusion_to_diffusers.py @@ -126,6 +126,13 @@ "--controlnet", action="store_true", default=None, help="Set flag if this is a controlnet checkpoint." ) parser.add_argument("--half", action="store_true", help="Save weights in half precision.") + parser.add_argument( + "--vae_path", + type=str, + default=None, + required=False, + help="Set to a path, hub id to an already converted vae to not convert it again.", + ) args = parser.parse_args() pipe = download_from_original_stable_diffusion_ckpt( @@ -144,6 +151,7 @@ stable_unclip_prior=args.stable_unclip_prior, clip_stats_path=args.clip_stats_path, controlnet=args.controlnet, + vae_path=args.vae_path, ) if args.half: diff --git a/setup.py b/setup.py index d6b083c22821..dd94c5acb7f3 100644 --- a/setup.py +++ b/setup.py @@ -89,6 +89,7 @@ "huggingface-hub>=0.13.2", "requests-mock==1.10.0", "importlib_metadata", + "invisible-watermark", "isort>=5.5.4", "jax>=0.2.8,!=0.3.2", "jaxlib>=0.1.65", @@ -193,6 +194,7 @@ def run(self): "compel", "datasets", "Jinja2", + "invisible-watermark", "k-diffusion", "librosa", "omegaconf", diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f0c25edd3fdc..61b323246e55 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -5,6 +5,7 @@ OptionalDependencyNotAvailable, is_flax_available, is_inflect_available, + is_invisible_watermark_available, is_k_diffusion_available, is_k_diffusion_version, is_librosa_available, @@ -179,6 +180,14 @@ VQDiffusionPipeline, ) +try: + if not (is_torch_available() and is_transformers_available() and is_invisible_watermark_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils.dummy_torch_and_transformers_and_invisible_watermark_objects import * # noqa F403 +else: + from .pipelines import StableDiffusionXLImg2ImgPipeline, StableDiffusionXLPipeline + try: if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/dependency_versions_table.py b/src/diffusers/dependency_versions_table.py index 
423d6c5347cd..68c8e914d940 100644 --- a/src/diffusers/dependency_versions_table.py +++ b/src/diffusers/dependency_versions_table.py @@ -13,6 +13,7 @@ "huggingface-hub": "huggingface-hub>=0.13.2", "requests-mock": "requests-mock==1.10.0", "importlib_metadata": "importlib_metadata", + "invisible-watermark": "invisible-watermark", "isort": "isort>=5.5.4", "jax": "jax>=0.2.8,!=0.3.2", "jaxlib": "jaxlib>=0.1.65", diff --git a/src/diffusers/models/attention_processor.py b/src/diffusers/models/attention_processor.py index 0bc7886c2653..5b6a161f8466 100644 --- a/src/diffusers/models/attention_processor.py +++ b/src/diffusers/models/attention_processor.py @@ -1118,7 +1118,9 @@ def __call__( value = attn.to_v(encoder_hidden_states) head_dim = inner_dim // attn.heads + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index d4e7bd4e03f7..cb3452f4459c 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -38,6 +38,7 @@ def get_down_block( add_downsample, resnet_eps, resnet_act_fn, + transformer_layers_per_block=1, num_attention_heads=None, resnet_groups=None, cross_attention_dim=None, @@ -111,6 +112,7 @@ def get_down_block( raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D") return CrossAttnDownBlock2D( num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, in_channels=in_channels, out_channels=out_channels, temb_channels=temb_channels, @@ -232,6 +234,7 @@ def get_up_block( add_upsample, resnet_eps, resnet_act_fn, + transformer_layers_per_block=1, num_attention_heads=None, resnet_groups=None, cross_attention_dim=None, @@ -287,6 +290,7 @@ def get_up_block( raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock2D") return CrossAttnUpBlock2D( num_layers=num_layers, + transformer_layers_per_block=transformer_layers_per_block, in_channels=in_channels, out_channels=out_channels, prev_output_channel=prev_output_channel, @@ -517,6 +521,7 @@ def __init__( temb_channels: int, dropout: float = 0.0, num_layers: int = 1, + transformer_layers_per_block: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", @@ -559,7 +564,7 @@ def __init__( num_attention_heads, in_channels // num_attention_heads, in_channels=in_channels, - num_layers=1, + num_layers=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, @@ -862,6 +867,7 @@ def __init__( temb_channels: int, dropout: float = 0.0, num_layers: int = 1, + transformer_layers_per_block: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", @@ -906,7 +912,7 @@ def __init__( num_attention_heads, out_channels // num_attention_heads, in_channels=out_channels, - num_layers=1, + num_layers=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, @@ -1995,6 +2001,7 @@ def __init__( temb_channels: int, dropout: float = 0.0, num_layers: int = 1, + transformer_layers_per_block: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", @@ -2040,7 +2047,7 @@ def __init__( 
num_attention_heads, out_channels // num_attention_heads, in_channels=out_channels, - num_layers=1, + num_layers=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index cd62a494d5aa..9b9530ad7f83 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -98,7 +98,11 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): The dimension of the cross attention features. - encoder_hid_dim (`int`, *optional*, defaults to `None`): + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` dimension to `cross_attention_dim`. encoder_hid_dim_type (`str`, *optional*, defaults to `None`): @@ -115,6 +119,8 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin) addition_embed_type (`str`, *optional*, defaults to `None`): Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or "text". "text" will use the `TextTimeEmbedding` layer. + addition_time_embed_dim: (`int`, *optional*, defaults to `None`): + Dimension for the timestep embeddings. num_class_embeds (`int`, *optional*, defaults to `None`): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. 
@@ -170,6 +176,7 @@ def __init__( norm_num_groups: Optional[int] = 32, norm_eps: float = 1e-5, cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, encoder_hid_dim: Optional[int] = None, encoder_hid_dim_type: Optional[str] = None, attention_head_dim: Union[int, Tuple[int]] = 8, @@ -178,6 +185,7 @@ def __init__( use_linear_projection: bool = False, class_embed_type: Optional[str] = None, addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", @@ -351,6 +359,10 @@ def __init__( self.add_embedding = TextImageTimeEmbedding( text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim ) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift) + self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") @@ -383,6 +395,9 @@ def __init__( if isinstance(layers_per_block, int): layers_per_block = [layers_per_block] * len(down_block_types) + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + if class_embeddings_concat: # The time embeddings are concatenated with the class embeddings. The dimension of the # time embeddings passed to the down, middle, and up blocks is twice the dimension of the @@ -401,6 +416,7 @@ def __init__( down_block = get_down_block( down_block_type, num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], in_channels=input_channel, out_channels=output_channel, temb_channels=blocks_time_embed_dim, @@ -426,6 +442,7 @@ def __init__( # mid if mid_block_type == "UNetMidBlock2DCrossAttn": self.mid_block = UNetMidBlock2DCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], in_channels=block_out_channels[-1], temb_channels=blocks_time_embed_dim, resnet_eps=norm_eps, @@ -467,6 +484,7 @@ def __init__( reversed_num_attention_heads = list(reversed(num_attention_heads)) reversed_layers_per_block = list(reversed(layers_per_block)) reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block)) only_cross_attention = list(reversed(only_cross_attention)) output_channel = reversed_block_out_channels[0] @@ -487,6 +505,7 @@ def __init__( up_block = get_up_block( up_block_type, num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], in_channels=input_channel, out_channels=output_channel, prev_output_channel=prev_output_channel, @@ -693,6 +712,9 @@ def forward( tuple. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. 
Returns: [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: @@ -763,6 +785,7 @@ def forward( t_emb = t_emb.to(dtype=sample.dtype) emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None if self.class_embedding is not None: if class_labels is None: @@ -784,7 +807,6 @@ def forward( if self.config.addition_embed_type == "text": aug_emb = self.add_embedding(encoder_hidden_states) - emb = emb + aug_emb elif self.config.addition_embed_type == "text_image": # Kadinsky 2.1 - style if "image_embeds" not in added_cond_kwargs: @@ -796,7 +818,25 @@ def forward( text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) aug_emb = self.add_embedding(text_embs, image_embs) - emb = emb + aug_emb + elif self.config.addition_embed_type == "text_time": + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + + emb = emb + aug_emb if aug_emb is not None else emb if self.time_embed_act is not None: emb = self.time_embed_act(emb) diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 3926b3413e01..4092a1928958 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -1,6 +1,7 @@ from ..utils import ( OptionalDependencyNotAvailable, is_flax_available, + is_invisible_watermark_available, is_k_diffusion_available, is_librosa_available, is_note_seq_available, @@ -101,6 +102,15 @@ ) from .vq_diffusion import VQDiffusionPipeline + +try: + if not (is_torch_available() and is_transformers_available() and is_invisible_watermark_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ..utils.dummy_torch_and_transformers_and_invisible_watermark_objects import * # noqa F403 +else: + from .stable_diffusion_xl import StableDiffusionXLImg2ImgPipeline, StableDiffusionXLPipeline + try: if not is_onnx_available(): raise OptionalDependencyNotAvailable() diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py index ba62f8d7f79e..99cfcb806795 100644 --- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py +++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py @@ -233,7 +233,10 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa if controlnet: unet_params = original_config.model.params.control_stage_config.params else: - unet_params = original_config.model.params.unet_config.params + if original_config.model.params.unet_config is not None: + unet_params = original_config.model.params.unet_config.params + else: + unet_params = original_config.model.params.network_config.params vae_params = original_config.model.params.first_stage_config.params.ddconfig @@ -253,6 +256,15 @@ 
def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa up_block_types.append(block_type) resolution //= 2 + if unet_params.transformer_depth is not None: + transformer_layers_per_block = ( + unet_params.transformer_depth + if isinstance(unet_params.transformer_depth, int) + else list(unet_params.transformer_depth) + ) + else: + transformer_layers_per_block = 1 + vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1) head_dim = unet_params.num_heads if "num_heads" in unet_params else None @@ -262,14 +274,28 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa if use_linear_projection: # stable diffusion 2-base-512 and 2-768 if head_dim is None: - head_dim = [5, 10, 20, 20] + head_dim_mult = unet_params.model_channels // unet_params.num_head_channels + head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)] class_embed_type = None + addition_embed_type = None + addition_time_embed_dim = None projection_class_embeddings_input_dim = None + context_dim = None + + if unet_params.context_dim is not None: + context_dim = ( + unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0] + ) if "num_classes" in unet_params: if unet_params.num_classes == "sequential": - class_embed_type = "projection" + if context_dim in [2048, 1280]: + # SDXL + addition_embed_type = "text_time" + addition_time_embed_dim = 256 + else: + class_embed_type = "projection" assert "adm_in_channels" in unet_params projection_class_embeddings_input_dim = unet_params.adm_in_channels else: @@ -281,11 +307,14 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa "down_block_types": tuple(down_block_types), "block_out_channels": tuple(block_out_channels), "layers_per_block": unet_params.num_res_blocks, - "cross_attention_dim": unet_params.context_dim, + "cross_attention_dim": context_dim, "attention_head_dim": head_dim, "use_linear_projection": use_linear_projection, "class_embed_type": class_embed_type, + "addition_embed_type": addition_embed_type, + "addition_time_embed_dim": addition_time_embed_dim, "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim, + "transformer_layers_per_block": transformer_layers_per_block, } if controlnet: @@ -400,6 +429,12 @@ def convert_ldm_unet_checkpoint( else: raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}") + if config["addition_embed_type"] == "text_time": + new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"] + new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"] + new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"] + new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"] + new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"] new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"] @@ -745,9 +780,12 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder text_model_dict = {} + remove_prefixes = ["cond_stage_model.transformer", "conditioner.embedders.0.transformer"] + for key in keys: - if key.startswith("cond_stage_model.transformer"): - text_model_dict[key[len("cond_stage_model.transformer.") :]] = checkpoint[key] + for prefix in remove_prefixes: + if key.startswith(prefix): + text_model_dict[key[len(prefix + ".") :]] = checkpoint[key] text_model.load_state_dict(text_model_dict) 
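A short worked example of how the SDXL branch above ties the new config fields together; it is not part of the patch. With `addition_embed_type="text_time"`, the added conditioning fed to `add_embedding` is the pooled prompt embedding concatenated with Fourier embeddings of six time ids (original size, crop top-left coordinates, target size), so the checkpoint's `adm_in_channels` becomes `projection_class_embeddings_input_dim`. The 1280 pooled dimension is the `projection_dim` used for the OpenCLIP encoder in `convert_open_clip_checkpoint`; treat the concrete numbers as assumptions for any non-standard checkpoint:

addition_time_embed_dim = 256     # set in the SDXL branch above
pooled_projection_dim = 1280      # projection_dim of the second text encoder
num_time_ids = 6                  # original_size (2) + crops_coords_top_left (2) + target_size (2)

projection_class_embeddings_input_dim = pooled_projection_dim + num_time_ids * addition_time_embed_dim
assert projection_class_embeddings_input_dim == 2816  # expected `adm_in_channels` of an SDXL base checkpoint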
@@ -755,10 +793,11 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder textenc_conversion_lst = [ - ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"), - ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"), - ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"), - ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"), + ("positional_embedding", "text_model.embeddings.position_embedding.weight"), + ("token_embedding.weight", "text_model.embeddings.token_embedding.weight"), + ("ln_final.weight", "text_model.final_layer_norm.weight"), + ("ln_final.bias", "text_model.final_layer_norm.bias"), + ("text_projection", "text_projection.weight"), ] textenc_conversion_map = {x[0]: x[1] for x in textenc_conversion_lst} @@ -845,27 +884,36 @@ def convert_paint_by_example_checkpoint(checkpoint): return model -def convert_open_clip_checkpoint(checkpoint): - text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") +def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."): + # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder") + text_model = CLIPTextModelWithProjection.from_pretrained( + "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280 + ) keys = list(checkpoint.keys()) text_model_dict = {} - if "cond_stage_model.model.text_projection" in checkpoint: - d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0]) + if prefix + "text_projection" in checkpoint: + d_model = int(checkpoint[prefix + "text_projection"].shape[0]) else: d_model = 1024 text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids") for key in keys: - if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer - continue - if key in textenc_conversion_map: - text_model_dict[textenc_conversion_map[key]] = checkpoint[key] - if key.startswith("cond_stage_model.model.transformer."): - new_key = key[len("cond_stage_model.model.transformer.") :] + # if "resblocks.23" in key: # Diffusers drops the final layer and only uses the penultimate layer + # continue + if key[len(prefix) :] in textenc_conversion_map: + if key.endswith("text_projection"): + value = checkpoint[key].T + else: + value = checkpoint[key] + + text_model_dict[textenc_conversion_map[key[len(prefix) :]]] = value + + if key.startswith(prefix + "transformer."): + new_key = key[len(prefix + "transformer.") :] if new_key.endswith(".in_proj_weight"): new_key = new_key[: -len(".in_proj_weight")] new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key) @@ -1029,6 +1077,7 @@ def download_from_original_stable_diffusion_ckpt( load_safety_checker: bool = True, pipeline_class: DiffusionPipeline = None, local_files_only=False, + vae_path=None, text_encoder=None, tokenizer=None, ) -> DiffusionPipeline: @@ -1096,6 +1145,8 @@ def download_from_original_stable_diffusion_ckpt( PaintByExamplePipeline, StableDiffusionControlNetPipeline, StableDiffusionPipeline, + StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLPipeline, StableUnCLIPImg2ImgPipeline, StableUnCLIPPipeline, ) @@ -1187,9 +1238,9 @@ def download_from_original_stable_diffusion_ckpt( checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema ) - num_train_timesteps = 
original_config.model.params.timesteps - beta_start = original_config.model.params.linear_start - beta_end = original_config.model.params.linear_end + num_train_timesteps = original_config.model.params.timesteps or 1000 + beta_start = original_config.model.params.linear_start or 0.02 + beta_end = original_config.model.params.linear_end or 0.085 scheduler = DDIMScheduler( beta_end=beta_end, @@ -1231,20 +1282,27 @@ def download_from_original_stable_diffusion_ckpt( converted_unet_checkpoint = convert_ldm_unet_checkpoint( checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema ) - unet.load_state_dict(converted_unet_checkpoint) # Convert the VAE model. - vae_config = create_vae_diffusers_config(original_config, image_size=image_size) - converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) + if vae_path is None: + vae_config = create_vae_diffusers_config(original_config, image_size=image_size) + converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config) - vae = AutoencoderKL(**vae_config) - vae.load_state_dict(converted_vae_checkpoint) + vae = AutoencoderKL(**vae_config) + vae.load_state_dict(converted_vae_checkpoint) + else: + vae = AutoencoderKL.from_pretrained(vae_path) # Convert the text model. - if model_type is None: + if model_type is None and original_config.model.params.cond_stage_config is not None: model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") + elif model_type is None and original_config.model.params.network_config is not None: + if original_config.model.params.network_config.params.context_dim == 2048: + model_type = "SDXL" + else: + model_type = "SDXL-Refiner" if model_type == "FrozenOpenCLIPEmbedder": text_model = convert_open_clip_checkpoint(checkpoint) @@ -1375,6 +1433,40 @@ def download_from_original_stable_diffusion_ckpt( safety_checker=safety_checker, feature_extractor=feature_extractor, ) + elif model_type in ["SDXL", "SDXL-Refiner"]: + if model_type == "SDXL": + tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14") + text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only) + tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!") + text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.1.model.") + + pipe = StableDiffusionXLPipeline( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + text_encoder_2=text_encoder_2, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + force_zeros_for_empty_prompt=True, + ) + else: + tokenizer = None + text_encoder = None + tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!") + text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.0.model.") + + pipe = StableDiffusionXLImg2ImgPipeline( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + text_encoder_2=text_encoder_2, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + requires_aesthetics_score=True, + force_zeros_for_empty_prompt=False, + ) else: text_config = create_ldm_bert_config(original_config) text_model = convert_ldm_bert_checkpoint(checkpoint, text_config) diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 
0fda05ea5ec2..48283bf31156 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -24,7 +24,12 @@ from ...image_processor import VaeImageProcessor from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel -from ...models.attention_processor import AttnProcessor2_0, LoRAXFormersAttnProcessor, XFormersAttnProcessor +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) from ...schedulers import DDPMScheduler, KarrasDiffusionSchedulers from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor from ..pipeline_utils import DiffusionPipeline @@ -747,6 +752,7 @@ def __call__( AttnProcessor2_0, XFormersAttnProcessor, LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, ] # if xformers or torch_2_0 is used attention block does not need # to be in float32 which can save lots of memory diff --git a/src/diffusers/pipelines/stable_diffusion_xl/__init__.py b/src/diffusers/pipelines/stable_diffusion_xl/__init__.py new file mode 100644 index 000000000000..21e18e5746a7 --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion_xl/__init__.py @@ -0,0 +1,31 @@ +from dataclasses import dataclass +from typing import List, Optional, Union + +import numpy as np +import PIL + +from ...utils import BaseOutput, is_invisible_watermark_available, is_torch_available, is_transformers_available + + +@dataclass +# Copied from diffusers.pipelines.stable_diffusion.__init__.StableDiffusionPipelineOutput with StableDiffusion->StableDiffusionXL +class StableDiffusionXLPipelineOutput(BaseOutput): + """ + Output class for Stable Diffusion pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. + nsfw_content_detected (`List[bool]`) + List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work" + (nsfw) content, or `None` if safety checking could not be performed. + """ + + images: Union[List[PIL.Image.Image], np.ndarray] + nsfw_content_detected: Optional[List[bool]] + + +if is_transformers_available() and is_torch_available() and is_invisible_watermark_available(): + from .pipeline_stable_diffusion_xl import StableDiffusionXLPipeline + from .pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipeline diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py new file mode 100644 index 000000000000..c50381c2eb23 --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -0,0 +1,788 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionXLPipelineOutput +from .watermark import StableDiffusionXLWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionXLPipeline + + >>> pipe = StableDiffusionXLPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + ``` +""" + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +class StableDiffusionXLPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). 
+ unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + force_zeros_for_empty_prompt: bool = True, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + self.watermark = StableDiffusionXLWatermarker() + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. + + When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in + several steps. This is useful to save a large amount of memory and to allow the processing of larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. 
Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + model_sequence = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + model_sequence.extend([self.unet, self.vae]) + + hook = None + for cpu_offloaded_model in model_sequence: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + prompt_embeds_list = [] + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder( + text_input_ids.to(device), + output_hidden_states=True, + ) + + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + prompt_embeds = prompt_embeds.hidden_states[-2] + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + # get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + uncond_tokens: List[str] + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + negative_prompt_embeds_list = [] + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + def _get_add_time_ids(self, original_size, crops_coords_top_left, target_size, dtype): + add_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." + ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Tuple[int, int] = (1024, 1024), + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = (1024, 1024), + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. 
Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + guidance_rescale (`float`, *optional*, defaults to 0.7): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + TODO + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + TODO + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + TODO + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple. 
When returning a tuple, the first element is a list with the generated images, and the second + element is a list of `bool`s denoting whether the corresponding generated image likely represents + "not-safe-for-work" (nsfw) content, according to the `safety_checker`. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + add_time_ids = self._get_add_time_ids( + original_size, crops_coords_top_left, target_size, dtype=prompt_embeds.dtype + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([add_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # make sure the VAE is in float32 mode, as it overflows in float16 + self.vae.to(dtype=torch.float32) + + use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [ + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ] + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if not use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(latents.dtype) + self.vae.decoder.conv_in.to(latents.dtype) + self.vae.decoder.mid_block.to(latents.dtype) + else: + latents = latents.float() + + if not output_type == "latent": + # CHECK there is problem here (PVP) + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + has_nsfw_concept = None + else: + image = latents + has_nsfw_concept = None + return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=None) + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.watermark.apply_watermark(image) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py new file mode 100644 index 000000000000..329a626ada2e --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -0,0 +1,879 @@ +# Copyright 2023 The HuggingFace Team. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import numpy as np +import PIL.Image +import torch +from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer + +from ...image_processor import VaeImageProcessor +from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...models import AutoencoderKL, UNet2DConditionModel +from ...models.attention_processor import ( + AttnProcessor2_0, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + XFormersAttnProcessor, +) +from ...schedulers import KarrasDiffusionSchedulers +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) +from ..pipeline_utils import DiffusionPipeline +from . import StableDiffusionXLPipelineOutput +from .watermark import StableDiffusionXLWatermarker + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> from diffusers import StableDiffusionXLImg2ImgPipeline + >>> from diffusers.utils import load_image + + >>> pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + >>> url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png" + + >>> init_image = load_image(url).convert("RGB") + >>> prompt = "a photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt, image=init_image).images[0] + ``` +""" + + +def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): + """ + Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 + """ + std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True) + std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True) + # rescale the results from guidance (fixes overexposure) + noise_pred_rescaled = noise_cfg * (std_text / std_cfg) + # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images + noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg + return noise_cfg + + +class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) 
+ + In addition the pipeline inherits the following loading methods: + - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] + - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] + - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + + as well as the following saving methods: + - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder. Stable Diffusion uses the text portion of + [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically + the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + """ + _optional_components = ["tokenizer", "text_encoder"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_2: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + tokenizer_2: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + requires_aesthetics_score: bool = False, + force_zeros_for_empty_prompt: bool = True, + ): + super().__init__() + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_2=text_encoder_2, + tokenizer=tokenizer, + tokenizer_2=tokenizer_2, + unet=unet, + scheduler=scheduler, + ) + self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) + self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.vae_scale_factor = 8 + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + self.watermark = StableDiffusionXLWatermarker() + + def enable_vae_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.vae.enable_slicing() + + def disable_vae_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_slicing() + + def enable_vae_tiling(self): + r""" + Enable tiled VAE decoding. + + When this option is enabled, the VAE will split the input tensor into tiles to compute decoding and encoding in + several steps. This is useful to save a large amount of memory and to allow the processing of larger images. + """ + self.vae.enable_tiling() + + def disable_vae_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to + computing decoding in one step. + """ + self.vae.disable_tiling() + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. 
When called, unet, + text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a + `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called. + Note that offloading happens on a submodule basis. Memory savings are higher than with + `enable_model_cpu_offload`, but performance is lower. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"): + from accelerate import cpu_offload + else: + raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + model_sequence = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + model_sequence.extend([self.unet, self.vae]) + + hook = None + for cpu_offloaded_model in model_sequence: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + # Define tokenizers and text encoders + tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] + text_encoders = ( + [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] + ) + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + prompt_embeds_list = [] + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, tokenizer) + + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {tokenizer.model_max_length} tokens: {removed_text}" + ) + + prompt_embeds = text_encoder( + text_input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + pooled_prompt_embeds = prompt_embeds[0] + + prompt_embeds = prompt_embeds.hidden_states[-2] + + prompt_embeds = prompt_embeds + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + prompt_embeds_list.append(prompt_embeds) + + prompt_embeds = torch.concat(prompt_embeds_list, dim=-1) + + 
# get unconditional embeddings for classifier free guidance + zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt + if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: + negative_prompt_embeds = torch.zeros_like(prompt_embeds) + negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds) + elif do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + negative_prompt_embeds_list = [] + for tokenizer, text_encoder in zip(tokenizers, text_encoders): + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + negative_prompt_embeds = text_encoder( + uncond_input.input_ids.to(device), + output_hidden_states=True, + ) + # We are only ALWAYS interested in the pooled output of the final text encoder + negative_pooled_prompt_embeds = negative_prompt_embeds[0] + + negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=text_encoder.dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + + negative_prompt_embeds_list.append(negative_prompt_embeds) + + negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1) + + pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view( + bs_embed * num_images_per_prompt, -1 + ) + + return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds + + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, prompt, strength, callback_steps, negative_prompt=None, prompt_embeds=None, negative_prompt_embeds=None + ): + if strength < 0 or strength > 1: + raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") + + if (callback_steps is None) or ( + callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + # Offload text encoder if `enable_model_cpu_offload` was enabled + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.text_encoder_2.to("cpu") + torch.cuda.empty_cache() + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + # make sure the VAE is in float32 mode, as it overflows in float16 + image = image.float() + self.vae.to(dtype=torch.float32) + + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.vae.encode(image).latent_dist.sample(generator) + + self.vae.to(dtype) + init_latents = init_latents.to(dtype) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + def _get_add_time_ids( + self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list(original_size + crops_coords_top_left + (negative_aesthetic_score,)) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(original_size + crops_coords_top_left + target_size) + + passed_add_embed_dim = ( + self.unet.config.addition_time_embed_dim * len(add_time_ids) + self.text_encoder_2.config.projection_dim + ) + expected_add_embed_dim = self.unet.add_embedding.linear_1.in_features + + if ( + expected_add_embed_dim > passed_add_embed_dim + and (expected_add_embed_dim - passed_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to enable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=True)` to make sure `aesthetic_score` {aesthetic_score} and `negative_aesthetic_score` {negative_aesthetic_score} is correctly used by the model." + ) + elif ( + expected_add_embed_dim < passed_add_embed_dim + and (passed_add_embed_dim - expected_add_embed_dim) == self.unet.config.addition_time_embed_dim + ): + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. Please make sure to disable `requires_aesthetics_score` with `pipe.register_to_config(requires_aesthetics_score=False)` to make sure `target_size` {target_size} is correctly used by the model." + ) + elif expected_add_embed_dim != passed_add_embed_dim: + raise ValueError( + f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`." 
+ ) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: Union[ + torch.FloatTensor, + PIL.Image.Image, + np.ndarray, + List[torch.FloatTensor], + List[PIL.Image.Image], + List[np.ndarray], + ] = None, + strength: float = 0.3, + num_inference_steps: int = 50, + guidance_scale: float = 5.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guidance_rescale: float = 0.0, + original_size: Tuple[int, int] = (1024, 1024), + crops_coords_top_left: Tuple[int, int] = (0, 0), + target_size: Tuple[int, int] = (1024, 1024), + aesthetic_score: float = 6.0, + negative_aesthetic_score: float = 2.5, + ): + r""" + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + instead. + image (`torch.FloatTensor` or `PIL.Image.Image` or `np.ndarray` or `List[torch.FloatTensor]` or `List[PIL.Image.Image]` or `List[np.ndarray]`): + The image(s) to modify with the pipeline. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. 
Only applies to + [`schedulers.DDIMScheduler`], will be ignored for others. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will be generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that will be called every `callback_steps` steps during inference. The function will be + called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function will be called. If not specified, the callback will be + called at every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.cross_attention](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/cross_attention.py). + guidance_rescale (`float`, *optional*, defaults to 0.0): + Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are + Flawed](https://arxiv.org/pdf/2305.08891.pdf). `guidance_scale` is defined as `φ` in equation 16. of + [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). + Guidance rescale factor should fix overexposure when using zero terminal SNR. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + TODO + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + TODO + target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + TODO + aesthetic_score (`float`, *optional*, defaults to 6.0): + TODO + negative_aesthetic_score (`float`, *optional*, defaults to 2.5): + TODO + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: + [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images, and the second + element is a list of `bool`s denoting whether the corresponding generated image likely represents + "not-safe-for-work" (nsfw) content, according to the `safety_checker`. + """ + # 1. Check inputs.
Raise error if not correct + self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + ( + prompt_embeds, + negative_prompt_embeds, + pooled_prompt_embeds, + negative_pooled_prompt_embeds, + ) = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + + # 4. Preprocess image + image = self.image_processor.preprocess(image) + + # 5. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + + # 6. Prepare latent variables + latents = self.prepare_latents( + image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + ) + # 7. Prepare extra step kwargs. + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8. Prepare added time ids & embeddings + add_text_embeds = pooled_prompt_embeds + add_time_ids, add_neg_time_ids = self._get_add_time_ids( + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + dtype=prompt_embeds.dtype, + ) + + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0) + add_time_ids = torch.cat([add_neg_time_ids, add_time_ids], dim=0) + + prompt_embeds = prompt_embeds.to(device) + add_text_embeds = add_text_embeds.to(device) + add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1) + + # 9. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids} + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + if do_classifier_free_guidance and guidance_rescale > 0.0: + # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf + noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(i, t, latents) + + # make sure the VAE is in float32 mode, as it overflows in float16 + self.vae.to(dtype=torch.float32) + + use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [ + AttnProcessor2_0, + XFormersAttnProcessor, + LoRAXFormersAttnProcessor, + LoRAAttnProcessor2_0, + ] + # if xformers or torch_2_0 is used attention block does not need + # to be in float32 which can save lots of memory + if not use_torch_2_0_or_xformers: + self.vae.post_quant_conv.to(latents.dtype) + self.vae.decoder.conv_in.to(latents.dtype) + self.vae.decoder.mid_block.to(latents.dtype) + else: + latents = latents.float() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + has_nsfw_concept = None + else: + image = latents + return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=None) + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.watermark.apply_watermark(image) + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/src/diffusers/pipelines/stable_diffusion_xl/watermark.py b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py new file mode 100644 index 000000000000..bc6c9bf649b1 --- /dev/null +++ b/src/diffusers/pipelines/stable_diffusion_xl/watermark.py @@ -0,0 +1,31 @@ +import numpy as np +import torch +from imwatermark import WatermarkEncoder + + +# Copied from 
https://github.com/Stability-AI/generative-models/blob/613af104c6b85184091d42d374fef420eddb356d/scripts/demo/streamlit_helpers.py#L66 +WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110 +# bin(x)[2:] gives bits of x as str, use int to convert them to 0/1 +WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]] + + +class StableDiffusionXLWatermarker: + def __init__(self): + self.watermark = WATERMARK_BITS + self.encoder = WatermarkEncoder() + + self.encoder.set_watermark("bits", self.watermark) + + def apply_watermark(self, images: torch.FloatTensor): + # can't encode images that are smaller than 256 + if images.shape[-1] < 256: + return images + + images = (255 * (images / 2 + 0.5)).cpu().permute(0, 2, 3, 1).float().numpy() + + images = [self.encoder.encode(image, "dwtDct") for image in images] + + images = torch.from_numpy(np.array(images)).permute(0, 3, 1, 2) + + images = torch.clamp(2 * (images / 255 - 0.5), min=-1.0, max=1.0) + return images diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index dd5b7f77c1ce..95a5562b367a 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -189,7 +189,11 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization. cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1280): The dimension of the cross attention features. - encoder_hid_dim (`int`, *optional*, defaults to `None`): + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlockFlat`], [`~models.unet_2d_blocks.CrossAttnUpBlockFlat`], + [`~models.unet_2d_blocks.UNetMidBlockFlatCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` dimension to `cross_attention_dim`. encoder_hid_dim_type (`str`, *optional*, defaults to `None`): @@ -206,6 +210,8 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin): addition_embed_type (`str`, *optional*, defaults to `None`): Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or "text". "text" will use the `TextTimeEmbedding` layer. + addition_time_embed_dim: (`int`, *optional*, defaults to `None`): + Dimension for the timestep embeddings. num_class_embeds (`int`, *optional*, defaults to `None`): Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing class conditioning with `class_embed_type` equal to `None`. 
@@ -266,6 +272,7 @@ def __init__( norm_num_groups: Optional[int] = 32, norm_eps: float = 1e-5, cross_attention_dim: Union[int, Tuple[int]] = 1280, + transformer_layers_per_block: Union[int, Tuple[int]] = 1, encoder_hid_dim: Optional[int] = None, encoder_hid_dim_type: Optional[str] = None, attention_head_dim: Union[int, Tuple[int]] = 8, @@ -274,6 +281,7 @@ def __init__( use_linear_projection: bool = False, class_embed_type: Optional[str] = None, addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, num_class_embeds: Optional[int] = None, upcast_attention: bool = False, resnet_time_scale_shift: str = "default", @@ -454,6 +462,10 @@ def __init__( self.add_embedding = TextImageTimeEmbedding( text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim ) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift) + self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") @@ -486,6 +498,9 @@ def __init__( if isinstance(layers_per_block, int): layers_per_block = [layers_per_block] * len(down_block_types) + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + if class_embeddings_concat: # The time embeddings are concatenated with the class embeddings. The dimension of the # time embeddings passed to the down, middle, and up blocks is twice the dimension of the @@ -504,6 +519,7 @@ def __init__( down_block = get_down_block( down_block_type, num_layers=layers_per_block[i], + transformer_layers_per_block=transformer_layers_per_block[i], in_channels=input_channel, out_channels=output_channel, temb_channels=blocks_time_embed_dim, @@ -529,6 +545,7 @@ def __init__( # mid if mid_block_type == "UNetMidBlockFlatCrossAttn": self.mid_block = UNetMidBlockFlatCrossAttn( + transformer_layers_per_block=transformer_layers_per_block[-1], in_channels=block_out_channels[-1], temb_channels=blocks_time_embed_dim, resnet_eps=norm_eps, @@ -570,6 +587,7 @@ def __init__( reversed_num_attention_heads = list(reversed(num_attention_heads)) reversed_layers_per_block = list(reversed(layers_per_block)) reversed_cross_attention_dim = list(reversed(cross_attention_dim)) + reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block)) only_cross_attention = list(reversed(only_cross_attention)) output_channel = reversed_block_out_channels[0] @@ -590,6 +608,7 @@ def __init__( up_block = get_up_block( up_block_type, num_layers=reversed_layers_per_block[i] + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], in_channels=input_channel, out_channels=output_channel, prev_output_channel=prev_output_channel, @@ -796,6 +815,9 @@ def forward( tuple. cross_attention_kwargs (`dict`, *optional*): A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. 
Returns: [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: @@ -866,6 +888,7 @@ def forward( t_emb = t_emb.to(dtype=sample.dtype) emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None if self.class_embedding is not None: if class_labels is None: @@ -887,7 +910,6 @@ def forward( if self.config.addition_embed_type == "text": aug_emb = self.add_embedding(encoder_hidden_states) - emb = emb + aug_emb elif self.config.addition_embed_type == "text_image": # Kadinsky 2.1 - style if "image_embeds" not in added_cond_kwargs: @@ -900,7 +922,27 @@ def forward( text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) aug_emb = self.add_embedding(text_embs, image_embs) - emb = emb + aug_emb + elif self.config.addition_embed_type == "text_time": + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires" + " the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires" + " the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + + emb = emb + aug_emb if aug_emb is not None else emb if self.time_embed_act is not None: emb = self.time_embed_act(emb) @@ -1212,6 +1254,7 @@ def __init__( temb_channels: int, dropout: float = 0.0, num_layers: int = 1, + transformer_layers_per_block: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", @@ -1256,7 +1299,7 @@ def __init__( num_attention_heads, out_channels // num_attention_heads, in_channels=out_channels, - num_layers=1, + num_layers=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, @@ -1446,6 +1489,7 @@ def __init__( temb_channels: int, dropout: float = 0.0, num_layers: int = 1, + transformer_layers_per_block: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", @@ -1491,7 +1535,7 @@ def __init__( num_attention_heads, out_channels // num_attention_heads, in_channels=out_channels, - num_layers=1, + num_layers=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, @@ -1592,6 +1636,7 @@ def __init__( temb_channels: int, dropout: float = 0.0, num_layers: int = 1, + transformer_layers_per_block: int = 1, resnet_eps: float = 1e-6, resnet_time_scale_shift: str = "default", resnet_act_fn: str = "swish", @@ -1634,7 +1679,7 @@ def __init__( num_attention_heads, in_channels // num_attention_heads, in_channels=in_channels, - num_layers=1, + num_layers=transformer_layers_per_block, cross_attention_dim=cross_attention_dim, norm_num_groups=resnet_groups, use_linear_projection=use_linear_projection, diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py index 36cbe82f79e7..bdb7f020a0aa 100644 --- a/src/diffusers/utils/__init__.py +++ b/src/diffusers/utils/__init__.py @@ 
-58,6 +58,7 @@ is_flax_available, is_ftfy_available, is_inflect_available, + is_invisible_watermark_available, is_k_diffusion_available, is_k_diffusion_version, is_librosa_available, diff --git a/src/diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py new file mode 100644 index 000000000000..6b09b971fbc5 --- /dev/null +++ b/src/diffusers/utils/dummy_torch_and_transformers_and_invisible_watermark_objects.py @@ -0,0 +1,32 @@ +# This file is autogenerated by the command `make fix-copies`, do not edit. +from ..utils import DummyObject, requires_backends + + +class StableDiffusionXLImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "invisible_watermark"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "invisible_watermark"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "invisible_watermark"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "invisible_watermark"]) + + +class StableDiffusionXLPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers", "invisible_watermark"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers", "invisible_watermark"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "invisible_watermark"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers", "invisible_watermark"]) diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 4ded0f272462..287992207e5a 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -294,6 +294,13 @@ except importlib_metadata.PackageNotFoundError: _torchsde_available = False +_invisible_watermark_available = importlib.util.find_spec("imwatermark") is not None +try: + _invisible_watermark_version = importlib_metadata.version("invisible-watermark") + logger.debug(f"Successfully imported invisible-watermark version {_invisible_watermark_version}") +except importlib_metadata.PackageNotFoundError: + _invisible_watermark_available = False + def is_torch_available(): return _torch_available @@ -383,6 +390,10 @@ def is_torchsde_available(): return _torchsde_available +def is_invisible_watermark_available(): + return _invisible_watermark_available + + # docstyle-ignore FLAX_IMPORT_ERROR = """ {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the @@ -491,6 +502,11 @@ def is_torchsde_available(): {0} requires the torchsde library but it was not found in your environment. You can install it with pip: `pip install torchsde` """ +# docstyle-ignore +INVISIBLE_WATERMARK_IMPORT_ERROR = """ +{0} requires the invisible-watermark library but it was not found in your environment. 
You can install it with pip: `pip install git+https://github.com/patrickvonplaten/invisible-watermark.git@remove_onnxruntime_depedency` +""" + BACKENDS_MAPPING = OrderedDict( [ @@ -508,10 +524,11 @@ def is_torchsde_available(): ("note_seq", (is_note_seq_available, NOTE_SEQ_IMPORT_ERROR)), ("wandb", (is_wandb_available, WANDB_IMPORT_ERROR)), ("omegaconf", (is_omegaconf_available, OMEGACONF_IMPORT_ERROR)), - ("tensorboard", (_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), - ("compel", (_compel_available, COMPEL_IMPORT_ERROR)), + ("tensorboard", (is_tensorboard_available, TENSORBOARD_IMPORT_ERROR)), + ("compel", (is_compel_available, COMPEL_IMPORT_ERROR)), ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)), - ("torchsde", (_torchsde_available, TORCHSDE_IMPORT_ERROR)), + ("torchsde", (is_torchsde_available, TORCHSDE_IMPORT_ERROR)), + ("invisible_watermark", (is_invisible_watermark_available, INVISIBLE_WATERMARK_IMPORT_ERROR)), ] ) diff --git a/tests/others/test_dependencies.py b/tests/others/test_dependencies.py index 9bee7a0db3ed..3436cf92d896 100644 --- a/tests/others/test_dependencies.py +++ b/tests/others/test_dependencies.py @@ -34,4 +34,6 @@ def test_backend_registration(self): for backend in cls_module._backends: if backend == "k_diffusion": backend = "k-diffusion" + elif backend == "invisible_watermark": + backend = "invisible-watermark" assert backend in deps, f"{backend} is not in the deps table!" diff --git a/tests/pipelines/stable_diffusion_xl/__init__.py b/tests/pipelines/stable_diffusion_xl/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py new file mode 100644 index 000000000000..f5535df938b0 --- /dev/null +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl.py @@ -0,0 +1,187 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DiffusionPipeline, + EulerDiscreteScheduler, + StableDiffusionXLPipeline, + UNet2DConditionModel, +) +from diffusers.utils import slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS +from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin + + +enable_full_determinism() + + +class StableDiffusionXLPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): + pipeline_class = StableDiffusionXLPipeline + params = TEXT_TO_IMAGE_PARAMS + batch_params = TEXT_TO_IMAGE_BATCH_PARAMS + image_params = TEXT_TO_IMAGE_IMAGE_PARAMS + image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + # SD2-specific config below + attention_head_dim=(2, 4), + use_linear_projection=True, + addition_embed_type="text_time", + addition_time_embed_dim=8, + transformer_layers_per_block=(1, 2), + projection_class_embeddings_input_dim=80, # 6 * 8 + 32 + cross_attention_dim=64, + ) + scheduler = EulerDiscreteScheduler( + beta_start=0.00085, + beta_end=0.012, + steps_offset=1, + beta_schedule="scaled_linear", + timestep_spacing="leading", + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + sample_size=128, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + # SD2-specific config below + hidden_act="gelu", + projection_dim=32, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip", local_files_only=True) + + text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) + tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip", local_files_only=True) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "text_encoder_2": text_encoder_2, + "tokenizer_2": tokenizer_2, + # "safety_checker": None, + # "feature_extractor": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_xl_euler(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = 
StableDiffusionXLPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + expected_slice = np.array([0.5873, 0.6128, 0.4797, 0.5122, 0.5674, 0.4639, 0.5227, 0.5149, 0.4747]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_attention_slicing_forward_pass(self): + super().test_attention_slicing_forward_pass(expected_max_diff=3e-3) + + def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(expected_max_diff=3e-3) + + +@slow +@require_torch_gpu +class StableDiffusionXLPipelineSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) + latents = torch.from_numpy(latents).to(device=device, dtype=dtype) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "latents": latents, + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_default_euler(self): + pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506]) + assert np.abs(image_slice - expected_slice).max() < 7e-3 diff --git a/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py new file mode 100644 index 000000000000..d2434ebb223c --- /dev/null +++ b/tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py @@ -0,0 +1,202 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import random +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer + +from diffusers import ( + AutoencoderKL, + DiffusionPipeline, + EulerDiscreteScheduler, + StableDiffusionXLImg2ImgPipeline, + UNet2DConditionModel, +) +from diffusers.utils import floats_tensor, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..pipeline_params import ( + IMAGE_TO_IMAGE_IMAGE_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, + TEXT_GUIDED_IMAGE_VARIATION_PARAMS, +) +from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin + + +enable_full_determinism() + + +class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase): + pipeline_class = StableDiffusionXLImg2ImgPipeline + params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"} + required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"} + batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS + image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + # SD2-specific config below + attention_head_dim=(2, 4), + use_linear_projection=True, + addition_embed_type="text_time", + addition_time_embed_dim=8, + transformer_layers_per_block=(1, 2), + projection_class_embeddings_input_dim=80, # 6 * 8 + 32 + cross_attention_dim=64, + ) + scheduler = EulerDiscreteScheduler( + beta_start=0.00085, + beta_end=0.012, + steps_offset=1, + beta_schedule="scaled_linear", + timestep_spacing="leading", + ) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + sample_size=128, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + # SD2-specific config below + hidden_act="gelu", + projection_dim=32, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip", local_files_only=True) + + text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config) + tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip", local_files_only=True) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "text_encoder_2": text_encoder_2, + "tokenizer_2": tokenizer_2, + # "safety_checker": None, + # "feature_extractor": None, + } + return components + + def get_dummy_inputs(self, device, seed=0): + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = image / 2 + 0.5 + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a 
squirrel eating a burger", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 5.0, + "output_type": "numpy", + "strength": 0.75, + } + return inputs + + def test_stable_diffusion_xl_img2img_euler(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = StableDiffusionXLImg2ImgPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 32, 32, 3) + + expected_slice = np.array([0.4656, 0.4840, 0.4439, 0.6698, 0.5574, 0.4524, 0.5799, 0.5943, 0.5165]) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_attention_slicing_forward_pass(self): + super().test_attention_slicing_forward_pass(expected_max_diff=3e-3) + + def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(expected_max_diff=3e-3) + + # TODO(Patrick, Sayak) - skip for now as this requires more refiner tests + def test_save_load_optional_components(self): + pass + + +@slow +@require_torch_gpu +class StableDiffusionXLImg2ImgPipelineSlowTests(unittest.TestCase): + def tearDown(self): + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0): + generator = torch.Generator(device=generator_device).manual_seed(seed) + latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64)) + latents = torch.from_numpy(latents).to(device=device, dtype=dtype) + inputs = { + "prompt": "a photograph of an astronaut riding a horse", + "latents": latents, + "generator": generator, + "num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "numpy", + } + return inputs + + def test_stable_diffusion_default_euler(self): + pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-base") + pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + inputs = self.get_inputs(torch_device) + image = pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1].flatten() + + assert image.shape == (1, 512, 512, 3) + expected_slice = np.array([0.49493, 0.47896, 0.40798, 0.54214, 0.53212, 0.48202, 0.47656, 0.46329, 0.48506]) + assert np.abs(image_slice - expected_slice).max() < 7e-3 From 746215670a61af1034c470d0b6555be9c60cb7b6 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 6 Jul 2023 03:05:42 -1000 Subject: [PATCH 184/199] Kandinsky_v22_yiyi (#3936) * Kandinsky2_2 * fix init kandinsky2_2 * kandinsky2_2 fix inpainting * rename pipelines: remove decoder + 2_2 -> V22 * Update scheduling_unclip.py * remove text_encoder and tokenizer arguments from doc string * add test for text2img * add tests for text2img & img2img * fix * add test for inpaint * add prior tests * style * copies * add controlnet test * style * add a test for controlnet_img2img * update prior_emb2emb api to accept image_embedding or image * add a test for prior_emb2emb * style * remove try except * example * fix * add doc string examples to all kandinsky pipelines * style * update doc * style * add a top about 2.2 * Apply suggestions from code review Co-authored-by: Patrick von Platen * vae -> movq * vae -> movq * style * fix the #copied from * remove decoder from file name * update doc: add a section for kandinsky 2.2 * fix * fix-copies * add coped from * add copies from for prior * add 
copies from for prior emb2emb * copy from for img2img * copied from for inpaint * more copied from * more copies from * more copies * remove the yiyi comments * Apply suggestions from code review * Self-contained example, pipeline order * Import prior output instead of redefining. * Style * Make VQModel compatible with model offload. * Fix copies --------- Co-authored-by: Shahmatov Arseniy <62886550+cene555@users.noreply.github.com> Co-authored-by: yiyixuxu Co-authored-by: Patrick von Platen Co-authored-by: Pedro Cuenca --- docs/source/en/api/pipelines/kandinsky.mdx | 275 +++++++- src/diffusers/__init__.py | 7 + src/diffusers/models/embeddings.py | 67 ++ src/diffusers/models/unet_2d_condition.py | 47 +- src/diffusers/models/vq_model.py | 4 +- src/diffusers/pipelines/__init__.py | 9 + src/diffusers/pipelines/kandinsky/__init__.py | 2 +- .../pipelines/kandinsky/pipeline_kandinsky.py | 1 + .../kandinsky/pipeline_kandinsky_inpaint.py | 1 + .../kandinsky/pipeline_kandinsky_prior.py | 1 + .../pipelines/kandinsky2_2/__init__.py | 7 + .../kandinsky2_2/pipeline_kandinsky2_2.py | 317 +++++++++ .../pipeline_kandinsky2_2_controlnet.py | 372 +++++++++++ ...ipeline_kandinsky2_2_controlnet_img2img.py | 434 +++++++++++++ .../pipeline_kandinsky2_2_img2img.py | 398 ++++++++++++ .../pipeline_kandinsky2_2_inpainting.py | 531 +++++++++++++++ .../pipeline_kandinsky2_2_prior.py | 541 ++++++++++++++++ .../pipeline_kandinsky2_2_prior_emb2emb.py | 605 ++++++++++++++++++ .../versatile_diffusion/modeling_text_unet.py | 50 +- src/diffusers/schedulers/scheduling_unclip.py | 24 + .../dummy_torch_and_transformers_objects.py | 105 +++ tests/pipelines/kandinsky_v22/__init__.py | 0 .../pipelines/kandinsky_v22/test_kandinsky.py | 254 ++++++++ .../test_kandinsky_controlnet.py | 272 ++++++++ .../test_kandinsky_controlnet_img2img.py | 290 +++++++++ .../kandinsky_v22/test_kandinsky_img2img.py | 277 ++++++++ .../kandinsky_v22/test_kandinsky_inpaint.py | 287 +++++++++ .../kandinsky_v22/test_kandinsky_prior.py | 236 +++++++ .../test_kandinsky_prior_emb2emb.py | 257 ++++++++ 29 files changed, 5646 insertions(+), 25 deletions(-) create mode 100644 src/diffusers/pipelines/kandinsky2_2/__init__.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py create mode 100644 src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py create mode 100644 tests/pipelines/kandinsky_v22/__init__.py create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky.py create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_prior.py create mode 100644 tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py diff --git a/docs/source/en/api/pipelines/kandinsky.mdx b/docs/source/en/api/pipelines/kandinsky.mdx index 
bf551249ef05..6b6c64a08951 100644 --- a/docs/source/en/api/pipelines/kandinsky.mdx +++ b/docs/source/en/api/pipelines/kandinsky.mdx @@ -11,19 +11,12 @@ specific language governing permissions and limitations under the License. ## Overview -Kandinsky 2.1 inherits best practices from [DALL-E 2](https://arxiv.org/abs/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas. +Kandinsky inherits best practices from [DALL-E 2](https://huggingface.co/papers/2204.06125) and [Latent Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/latent_diffusion), while introducing some new ideas. It uses [CLIP](https://huggingface.co/docs/transformers/model_doc/clip) for encoding images and text, and a diffusion image prior (mapping) between latent spaces of CLIP modalities. This approach enhances the visual performance of the model and unveils new horizons in blending images and text-guided image manipulation. -The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene555), [Anton Razzhigaev](https://github.com/razzant), [Aleksandr Nikolich](https://github.com/AlexWortega), [Igor Pavlov](https://github.com/boomb0om), [Andrey Kuznetsov](https://github.com/kuznetsoffandrey) and [Denis Dimitrov](https://github.com/denndimitrov) and the original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) +The Kandinsky model is created by [Arseniy Shakhmatov](https://github.com/cene555), [Anton Razzhigaev](https://github.com/razzant), [Aleksandr Nikolich](https://github.com/AlexWortega), [Igor Pavlov](https://github.com/boomb0om), [Andrey Kuznetsov](https://github.com/kuznetsoffandrey) and [Denis Dimitrov](https://github.com/denndimitrov). The original codebase can be found [here](https://github.com/ai-forever/Kandinsky-2) -## Available Pipelines: - -| Pipeline | Tasks | -|---|---| -| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* | -| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* | -| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* | ## Usage example @@ -135,6 +128,7 @@ prompt = "birds eye view of a quilted paper style alien planet landscape, vibran ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/alienplanet.png) + ### Text Guided Image-to-Image Generation The same Kandinsky model weights can be used for text-guided image-to-image translation. In this case, just make sure to load the weights using the [`KandinskyImg2ImgPipeline`] pipeline. @@ -283,6 +277,207 @@ image.save("starry_cat.png") ![img](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-docs/starry_cat.png) +### Text-to-Image Generation with ControlNet Conditioning + +In the following, we give a simple example of how to use [`KandinskyV22ControlnetPipeline`] to add control to the text-to-image generation with a depth image. + +First, let's take an image and extract its depth map. 
+
+```python
+from diffusers.utils import load_image
+
+img = load_image(
+    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png"
+).resize((768, 768))
+```
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/cat.png)
+
+We can use the `depth-estimation` pipeline from transformers to process the image and retrieve its depth map.
+
+```python
+import torch
+import numpy as np
+
+from transformers import pipeline
+from diffusers.utils import load_image
+
+
+def make_hint(image, depth_estimator):
+    image = depth_estimator(image)["depth"]
+    image = np.array(image)
+    image = image[:, :, None]
+    image = np.concatenate([image, image, image], axis=2)
+    detected_map = torch.from_numpy(image).float() / 255.0
+    hint = detected_map.permute(2, 0, 1)
+    return hint
+
+
+depth_estimator = pipeline("depth-estimation")
+hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda")
+```
+Now, we load the prior pipeline and the text-to-image controlnet pipeline.
+
+```python
+from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline
+
+pipe_prior = KandinskyV22PriorPipeline.from_pretrained(
+    "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+)
+pipe_prior = pipe_prior.to("cuda")
+
+pipe = KandinskyV22ControlnetPipeline.from_pretrained(
+    "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16
+)
+pipe = pipe.to("cuda")
+```
+
+We pass the prompt and negative prompt through the prior to generate image embeddings.
+
+```python
+prompt = "A robot, 4k photo"
+
+negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature"
+
+generator = torch.Generator(device="cuda").manual_seed(43)
+image_emb, zero_image_emb = pipe_prior(
+    prompt=prompt, negative_prompt=negative_prior_prompt, generator=generator
+).to_tuple()
+```
+
+Now we can pass the image embeddings and the depth image we extracted to the controlnet pipeline. With Kandinsky 2.2, only prior pipelines accept `prompt` input. You do not need to pass the prompt to the controlnet pipeline.
+
+```python
+images = pipe(
+    image_embeds=image_emb,
+    negative_image_embeds=zero_image_emb,
+    hint=hint,
+    num_inference_steps=50,
+    generator=generator,
+    height=768,
+    width=768,
+).images
+
+images[0].save("robot_cat.png")
+```
+
+The output image looks as follows:
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/robot_cat_text2img.png)
+
+### Image-to-Image Generation with ControlNet Conditioning
+
+Kandinsky 2.2 also includes a [`KandinskyV22ControlnetImg2ImgPipeline`] that will allow you to add control to the image generation process with both the image and its depth map. This pipeline works really well with [`KandinskyV22PriorEmb2EmbPipeline`], which generates image embeddings based on both a text prompt and an image.
+
+For our robot cat example, we will pass the prompt and cat image together to the prior pipeline to generate an image embedding.
We will then use that image embedding and the depth map of the cat to further control the image generation process. + +We can use the same cat image and its depth map from the last example. + +```python +import torch +import numpy as np + +from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline +from diffusers.utils import load_image +from transformers import pipeline + +img = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinskyv22/cat.png" +).resize((768, 768)) + + +def make_hint(image, depth_estimator): + image = depth_estimator(image)["depth"] + image = np.array(image) + image = image[:, :, None] + image = np.concatenate([image, image, image], axis=2) + detected_map = torch.from_numpy(image).float() / 255.0 + hint = detected_map.permute(2, 0, 1) + return hint + + +depth_estimator = pipeline("depth-estimation") +hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") + +pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 +) +pipe_prior = pipe_prior.to("cuda") + +pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 +) +pipe = pipe.to("cuda") + +prompt = "A robot, 4k photo" +negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" + +generator = torch.Generator(device="cuda").manual_seed(43) + +# run prior pipeline + +img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator) +negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator) + +# run controlnet img2img pipeline +images = pipe( + image=img, + strength=0.5, + image_embeds=img_emb.image_embeds, + negative_image_embeds=negative_emb.image_embeds, + hint=hint, + num_inference_steps=50, + generator=generator, + height=768, + width=768, +).images + +images[0].save("robot_cat.png") +``` + +Here is the output. Compared with the output from our text-to-image controlnet example, it kept a lot more cat facial details from the original image and worked into the robot style we asked for. + +![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/robot_cat.png) + +## Kandinsky 2.2 + +The Kandinsky 2.2 release includes robust new text-to-image models that support text-to-image generation, image-to-image generation, image interpolation, and text-guided image inpainting. The general workflow to perform these tasks using Kandinsky 2.2 is the same as in Kandinsky 2.1. First, you will need to use a prior pipeline to generate image embeddings based on your text prompt, and then use one of the image decoding pipelines to generate the output image. The only difference is that in Kandinsky 2.2, all of the decoding pipelines no longer accept the `prompt` input, and the image generation process is conditioned with only `image_embeds` and `negative_image_embeds`. 
+
+Let's look at an example of how to perform text-to-image generation using Kandinsky 2.2.
+
+First, let's create the prior pipeline and text-to-image pipeline with Kandinsky 2.2 checkpoints.
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe_prior = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16)
+pipe_prior.to("cuda")
+
+t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16)
+t2i_pipe.to("cuda")
+```
+
+You can then use `pipe_prior` to generate image embeddings.
+
+```python
+prompt = "portrait of a women, blue eyes, cinematic"
+negative_prompt = "low quality, bad quality"
+
+image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple()
+```
+
+Now you can pass these embeddings to the text-to-image pipeline. When using Kandinsky 2.2, you don't need to pass the `prompt` (but you do with the previous version, Kandinsky 2.1).
+
+```python
+image = t2i_pipe(image_embeds=image_embeds, negative_image_embeds=negative_image_embeds, height=768, width=768).images[
+    0
+]
+image.save("portrait.png")
+```
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinskyv22/%20blue%20eyes.png)
+
+We used the text-to-image pipeline as an example, but the same process applies to all decoding pipelines in Kandinsky 2.2. For more information, please refer to our API section for each pipeline.
+
+
 ## Optimization
 Running Kandinsky in inference requires running both a first prior pipeline: [`KandinskyPriorPipeline`]
@@ -335,30 +530,84 @@ t2i_pipe.unet = torch.compile(t2i_pipe.unet, mode="reduce-overhead", fullgraph=T
 After compilation you should see a very fast inference time. For more information, feel free to have a look at [Our PyTorch 2.0 benchmark](https://huggingface.co/docs/diffusers/main/en/optimization/torch2.0).
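+
+The Kandinsky 2.2 pipelines added in this patch also define `enable_model_cpu_offload` and `enable_sequential_cpu_offload`, so the memory optimizations above carry over. The following is a minimal, untested sketch (it assumes the `kandinsky-community/kandinsky-2-2-decoder` checkpoint used earlier and that `accelerate` is installed):
+
+```python
+import torch
+from diffusers import KandinskyV22Pipeline
+
+# Load the Kandinsky 2.2 decoder in half precision to reduce memory usage.
+t2i_pipe = KandinskyV22Pipeline.from_pretrained(
+    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+)
+
+# Keep the sub-models (unet, movq) on the CPU and move each one to the GPU
+# only while its forward pass runs.
+t2i_pipe.enable_model_cpu_offload()
+```
+
+Sequential offloading (`enable_sequential_cpu_offload`) trades more speed for an even lower memory footprint.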
+## Available Pipelines:
+
+| Pipeline | Tasks |
+|---|---|
+| [pipeline_kandinsky2_2.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py) | *Text-to-Image Generation* |
+| [pipeline_kandinsky.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py) | *Text-to-Image Generation* |
+| [pipeline_kandinsky2_2_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpaint.py) | *Image-Guided Image Generation* |
+| [pipeline_kandinsky_inpaint.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py) | *Image-Guided Image Generation* |
+| [pipeline_kandinsky2_2_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py) | *Image-Guided Image Generation* |
+| [pipeline_kandinsky_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py) | *Image-Guided Image Generation* |
+| [pipeline_kandinsky2_2_controlnet.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py) | *Image-Guided Image Generation* |
+| [pipeline_kandinsky2_2_controlnet_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py) | *Image-Guided Image Generation* |
+
+
+### KandinskyV22Pipeline
+[[autodoc]] KandinskyV22Pipeline
+ - all
+ - __call__
+
+### KandinskyV22ControlnetPipeline
+[[autodoc]] KandinskyV22ControlnetPipeline
+ - all
+ - __call__
+
+### KandinskyV22ControlnetImg2ImgPipeline
+
+[[autodoc]] KandinskyV22ControlnetImg2ImgPipeline
+ - all
+ - __call__
+
+### KandinskyV22Img2ImgPipeline
+
+[[autodoc]] KandinskyV22Img2ImgPipeline
+ - all
+ - __call__
+
+### KandinskyV22InpaintPipeline
+
+[[autodoc]] KandinskyV22InpaintPipeline
+ - all
+ - __call__
+
+### KandinskyV22PriorPipeline
+
+[[autodoc]] KandinskyV22PriorPipeline
+ - all
+ - __call__
+ - interpolate
+
+### KandinskyV22PriorEmb2EmbPipeline
+
+[[autodoc]] KandinskyV22PriorEmb2EmbPipeline
+ - all
+ - __call__
+ - interpolate
-## KandinskyPriorPipeline
+### KandinskyPriorPipeline
 [[autodoc]] KandinskyPriorPipeline
 - all
 - __call__
 - interpolate
-## KandinskyPipeline
+### KandinskyPipeline
 [[autodoc]] KandinskyPipeline
 - all
 - __call__
-## KandinskyImg2ImgPipeline
+### KandinskyImg2ImgPipeline
 [[autodoc]] KandinskyImg2ImgPipeline
 - all
 - __call__
-## KandinskyInpaintPipeline
+### KandinskyInpaintPipeline
 [[autodoc]] KandinskyInpaintPipeline
 - all
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index 61b323246e55..f425dc13ec2c 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -139,6 +139,13 @@
     KandinskyInpaintPipeline,
     KandinskyPipeline,
     KandinskyPriorPipeline,
+    KandinskyV22ControlnetImg2ImgPipeline,
+    KandinskyV22ControlnetPipeline,
+    KandinskyV22Img2ImgPipeline,
+    KandinskyV22InpaintPipeline,
+    KandinskyV22Pipeline,
+    KandinskyV22PriorEmb2EmbPipeline,
+    KandinskyV22PriorPipeline,
     LDMTextToImagePipeline,
     PaintByExamplePipeline,
     SemanticStableDiffusionPipeline,
diff --git a/src/diffusers/models/embeddings.py b/src/diffusers/models/embeddings.py
index 4dd16f0dd5ff..a5a0c5549ee9 100644
--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -376,6 +376,29 @@
def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTenso return torch.cat([image_text_embeds, text_embeds], dim=1) +class ImageProjection(nn.Module): + def __init__( + self, + image_embed_dim: int = 768, + cross_attention_dim: int = 768, + num_image_text_embeds: int = 32, + ): + super().__init__() + + self.num_image_text_embeds = num_image_text_embeds + self.image_embeds = nn.Linear(image_embed_dim, self.num_image_text_embeds * cross_attention_dim) + self.norm = nn.LayerNorm(cross_attention_dim) + + def forward(self, image_embeds: torch.FloatTensor): + batch_size = image_embeds.shape[0] + + # image + image_embeds = self.image_embeds(image_embeds) + image_embeds = image_embeds.reshape(batch_size, self.num_image_text_embeds, -1) + image_embeds = self.norm(image_embeds) + return image_embeds + + class CombinedTimestepLabelEmbeddings(nn.Module): def __init__(self, num_classes, embedding_dim, class_dropout_prob=0.1): super().__init__() @@ -429,6 +452,50 @@ def forward(self, text_embeds: torch.FloatTensor, image_embeds: torch.FloatTenso return time_image_embeds + time_text_embeds +class ImageTimeEmbedding(nn.Module): + def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): + super().__init__() + self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) + self.image_norm = nn.LayerNorm(time_embed_dim) + + def forward(self, image_embeds: torch.FloatTensor): + # image + time_image_embeds = self.image_proj(image_embeds) + time_image_embeds = self.image_norm(time_image_embeds) + return time_image_embeds + + +class ImageHintTimeEmbedding(nn.Module): + def __init__(self, image_embed_dim: int = 768, time_embed_dim: int = 1536): + super().__init__() + self.image_proj = nn.Linear(image_embed_dim, time_embed_dim) + self.image_norm = nn.LayerNorm(time_embed_dim) + self.input_hint_block = nn.Sequential( + nn.Conv2d(3, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 16, 3, padding=1), + nn.SiLU(), + nn.Conv2d(16, 32, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(32, 32, 3, padding=1), + nn.SiLU(), + nn.Conv2d(32, 96, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(96, 96, 3, padding=1), + nn.SiLU(), + nn.Conv2d(96, 256, 3, padding=1, stride=2), + nn.SiLU(), + nn.Conv2d(256, 4, 3, padding=1), + ) + + def forward(self, image_embeds: torch.FloatTensor, hint: torch.FloatTensor): + # image + time_image_embeds = self.image_proj(image_embeds) + time_image_embeds = self.image_norm(time_image_embeds) + hint = self.input_hint_block(hint) + return time_image_embeds, hint + + class AttentionPooling(nn.Module): # Copied from https://github.com/deep-floyd/IF/blob/2f91391f27dd3c468bf174be5805b4cc92980c0b/deepfloyd_if/model/nn.py#L54 diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 9b9530ad7f83..1f1d69c6042e 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -25,6 +25,9 @@ from .attention_processor import AttentionProcessor, AttnProcessor from .embeddings import ( GaussianFourierProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, @@ -306,7 +309,12 @@ def __init__( image_embed_dim=cross_attention_dim, cross_attention_dim=cross_attention_dim, ) - + elif encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) elif encoder_hid_dim_type is not None: raise 
ValueError( f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." @@ -362,7 +370,12 @@ def __init__( elif addition_embed_type == "text_time": self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift) self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) - + elif addition_embed_type == "image": + # Kandinsky 2.2 + self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type == "image_hint": + # Kandinsky 2.2 ControlNet + self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") @@ -808,7 +821,7 @@ def forward( if self.config.addition_embed_type == "text": aug_emb = self.add_embedding(encoder_hidden_states) elif self.config.addition_embed_type == "text_image": - # Kadinsky 2.1 - style + # Kandinsky 2.1 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" @@ -816,7 +829,6 @@ def forward( image_embs = added_cond_kwargs.get("image_embeds") text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) - aug_emb = self.add_embedding(text_embs, image_embs) elif self.config.addition_embed_type == "text_time": if "text_embeds" not in added_cond_kwargs: @@ -835,6 +847,24 @@ def forward( add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) add_embeds = add_embeds.to(emb.dtype) aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + sample = torch.cat([sample, hint], dim=1) emb = emb + aug_emb if aug_emb is not None else emb @@ -852,7 +882,14 @@ def forward( image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) - + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) # 2. 
pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/models/vq_model.py b/src/diffusers/models/vq_model.py index 32f944dacac9..687449e8c755 100644 --- a/src/diffusers/models/vq_model.py +++ b/src/diffusers/models/vq_model.py @@ -18,7 +18,7 @@ import torch.nn as nn from ..configuration_utils import ConfigMixin, register_to_config -from ..utils import BaseOutput +from ..utils import BaseOutput, apply_forward_hook from .modeling_utils import ModelMixin from .vae import Decoder, DecoderOutput, Encoder, VectorQuantizer @@ -116,6 +116,7 @@ def __init__( norm_type=norm_type, ) + @apply_forward_hook def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOutput: h = self.encoder(x) h = self.quant_conv(h) @@ -125,6 +126,7 @@ def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> VQEncoderOut return VQEncoderOutput(latents=h) + @apply_forward_hook def decode( self, h: torch.FloatTensor, force_not_quantize: bool = False, return_dict: bool = True ) -> Union[DecoderOutput, torch.FloatTensor]: diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index 4092a1928958..b8bee3299aff 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -65,6 +65,15 @@ KandinskyPipeline, KandinskyPriorPipeline, ) + from .kandinsky2_2 import ( + KandinskyV22ControlnetImg2ImgPipeline, + KandinskyV22ControlnetPipeline, + KandinskyV22Img2ImgPipeline, + KandinskyV22InpaintPipeline, + KandinskyV22Pipeline, + KandinskyV22PriorEmb2EmbPipeline, + KandinskyV22PriorPipeline, + ) from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline diff --git a/src/diffusers/pipelines/kandinsky/__init__.py b/src/diffusers/pipelines/kandinsky/__init__.py index c8eecba0c7f2..242ff799e529 100644 --- a/src/diffusers/pipelines/kandinsky/__init__.py +++ b/src/diffusers/pipelines/kandinsky/__init__.py @@ -15,5 +15,5 @@ from .pipeline_kandinsky import KandinskyPipeline from .pipeline_kandinsky_img2img import KandinskyImg2ImgPipeline from .pipeline_kandinsky_inpaint import KandinskyInpaintPipeline - from .pipeline_kandinsky_prior import KandinskyPriorPipeline + from .pipeline_kandinsky_prior import KandinskyPriorPipeline, KandinskyPriorPipelineOutput from .text_encoder import MultilingualCLIP diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py index 7b3537ea6895..489f59b3ad9a 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky.py @@ -115,6 +115,7 @@ def __init__( ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py index 04810ddb6e0a..75e58386cff0 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_inpaint.py @@ -275,6 +275,7 @@ def __init__( ) self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + # Copied from 
diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py index a0208d5858b1..a35f406e0a05 100644 --- a/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py +++ b/src/diffusers/pipelines/kandinsky/pipeline_kandinsky_prior.py @@ -274,6 +274,7 @@ def interpolate( return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb) + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): if latents is None: latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) diff --git a/src/diffusers/pipelines/kandinsky2_2/__init__.py b/src/diffusers/pipelines/kandinsky2_2/__init__.py new file mode 100644 index 000000000000..648164b9f1ba --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/__init__.py @@ -0,0 +1,7 @@ +from .pipeline_kandinsky2_2 import KandinskyV22Pipeline +from .pipeline_kandinsky2_2_controlnet import KandinskyV22ControlnetPipeline +from .pipeline_kandinsky2_2_controlnet_img2img import KandinskyV22ControlnetImg2ImgPipeline +from .pipeline_kandinsky2_2_img2img import KandinskyV22Img2ImgPipeline +from .pipeline_kandinsky2_2_inpainting import KandinskyV22InpaintPipeline +from .pipeline_kandinsky2_2_prior import KandinskyV22PriorPipeline +from .pipeline_kandinsky2_2_prior_emb2emb import KandinskyV22PriorEmb2EmbPipeline diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py new file mode 100644 index 000000000000..4a116e1e600b --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2.py @@ -0,0 +1,317 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import List, Optional, Union + +import torch + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") + >>> pipe_prior.to("cuda") + >>> prompt = "red cat, 4k photo" + >>> out = pipe_prior(prompt) + >>> image_emb = out.image_embeds + >>> zero_image_emb = out.negative_image_embeds + >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") + >>> pipe.to("cuda") + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ).images + >>> image[0].save("cat.png") + ``` +""" + + +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +class KandinskyV22Pipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.movq]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Args: + Function invoked when calling the pipeline for generation. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] * num_images_per_prompt + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.unet.config.in_channels + + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = 
torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py new file mode 100644 index 000000000000..73fc20b5e0f2 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet.py @@ -0,0 +1,372 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union + +import torch + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> import numpy as np + + >>> from diffusers import KandinskyV22PriorPipeline, KandinskyV22ControlnetPipeline + >>> from transformers import pipeline + >>> from diffusers.utils import load_image + + + >>> def make_hint(image, depth_estimator): + ... image = depth_estimator(image)["depth"] + ... image = np.array(image) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... detected_map = torch.from_numpy(image).float() / 255.0 + ... hint = detected_map.permute(2, 0, 1) + ... return hint + + + >>> depth_estimator = pipeline("depth-estimation") + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior = pipe_prior.to("cuda") + + >>> pipe = KandinskyV22ControlnetPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + + >>> img = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... 
).resize((768, 768)) + + >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") + + >>> prompt = "A robot, 4k photo" + >>> negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" + + >>> generator = torch.Generator(device="cuda").manual_seed(43) + + >>> image_emb, zero_image_emb = pipe_prior( + ... prompt=prompt, negative_prompt=negative_prior_prompt, generator=generator + ... ).to_tuple() + + >>> images = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... hint=hint, + ... num_inference_steps=50, + ... generator=generator, + ... height=768, + ... width=768, + ... ).images + + >>> images[0].save("robot_cat.png") + ``` +""" + + +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +class KandinskyV22ControlnetPipeline(DiffusionPipeline): + """ + Pipeline for text-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. 
+ """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.movq]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + hint: torch.FloatTensor, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + hint (`torch.FloatTensor`): + The controlnet condition. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. 
+ negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. 
+ + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + if isinstance(hint, list): + hint = torch.cat(hint, dim=0) + + batch_size = image_embeds.shape[0] * num_images_per_prompt + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + hint = hint.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) + hint = torch.cat([hint, hint], dim=0).to(dtype=self.unet.dtype, device=device) + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + num_channels_latents = self.movq.config.latent_channels + + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds, "hint": hint} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py new file mode 100644 index 000000000000..3e001e89e490 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py @@ -0,0 +1,434 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch +from PIL import Image + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> import torch + >>> import numpy as np + + >>> from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22ControlnetImg2ImgPipeline + >>> from transformers import pipeline + >>> from diffusers.utils import load_image + + + >>> def make_hint(image, depth_estimator): + ... image = depth_estimator(image)["depth"] + ... image = np.array(image) + ... image = image[:, :, None] + ... image = np.concatenate([image, image, image], axis=2) + ... detected_map = torch.from_numpy(image).float() / 255.0 + ... hint = detected_map.permute(2, 0, 1) + ... return hint + + + >>> depth_estimator = pipeline("depth-estimation") + + >>> pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior = pipe_prior.to("cuda") + + >>> pipe = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> img = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ).resize((768, 768)) + + + >>> hint = make_hint(img, depth_estimator).unsqueeze(0).half().to("cuda") + + >>> prompt = "A robot, 4k photo" + >>> negative_prior_prompt = "lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature" + + >>> generator = torch.Generator(device="cuda").manual_seed(43) + + >>> img_emb = pipe_prior(prompt=prompt, image=img, strength=0.85, generator=generator) + >>> negative_emb = pipe_prior(prompt=negative_prior_prompt, image=img, strength=1, generator=generator) + + >>> images = pipe( + ... image=img, + ... strength=0.5, + ... image_embeds=img_emb.image_embeds, + ... negative_image_embeds=negative_emb.image_embeds, + ... hint=hint, + ... num_inference_steps=50, + ... generator=generator, + ... height=768, + ... width=768, + ... 
).images + + >>> images[0].save("robot_cat.png") + ``` +""" + + +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline): + """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2_img2img.KandinskyV22Img2ImgPipeline.prepare_latents + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + init_latents = [ + self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.movq.encode(image).latent_dist.sample(generator) + + init_latents = self.movq.config.scaling_factor * init_latents + + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.movq]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + hint: torch.FloatTensor, + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + strength: float = 0.3, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. Can also accpet image latents as `image`, if passing latents directly, it will not be encoded + again. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + hint (`torch.FloatTensor`): + The controlnet condition. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
+ output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + if isinstance(hint, list): + hint = torch.cat(hint, dim=0) + + batch_size = image_embeds.shape[0] + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + hint = hint.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) + hint = torch.cat([hint, hint], dim=0).to(dtype=self.unet.dtype, device=device) + + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor" + ) + + image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) + image = image.to(dtype=image_embeds.dtype, device=device) + + latents = self.movq.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + latents = self.prepare_latents( + latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator + ) + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds, "hint": hint} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in 
["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py new file mode 100644 index 000000000000..0a5f77b0ff2d --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py @@ -0,0 +1,398 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch +from PIL import Image + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22Img2ImgPipeline, KandinskyV22PriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "A red cartoon frog, 4k" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyV22Img2ImgPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/frog.png" + ... ) + + >>> image = pipe( + ... image=init_image, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=100, + ... strength=0.2, + ... 
).images + + >>> image[0].save("red_frog.png") + ``` +""" + + +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image +def prepare_image(pil_image, w=512, h=512): + pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1) + arr = np.array(pil_image.convert("RGB")) + arr = arr.astype(np.float32) / 127.5 - 1 + arr = np.transpose(arr, [2, 0, 1]) + image = torch.from_numpy(arr).unsqueeze(0) + return image + + +class KandinskyV22Img2ImgPipeline(DiffusionPipeline): + """ + Pipeline for image-to-image generation using Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. + movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + init_latents = [ + self.movq.encode(image[i : i + 1]).latent_dist.sample(generator[i]) for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = self.movq.encode(image).latent_dist.sample(generator) + + init_latents = self.movq.config.scaling_factor * init_latents + + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.movq]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image: Union[torch.FloatTensor, PIL.Image.Image, List[torch.FloatTensor], List[PIL.Image.Image]], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + strength: float = 0.3, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, or tensor representing an image batch, that will be used as the starting point for the + process. Can also accpet image latents as `image`, if passing latents directly, it will not be encoded + again. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will + be maximum and the denoising process will run for the full number of iterations specified in + `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. 
+ output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) + + if not isinstance(image, list): + image = [image] + if not all(isinstance(i, (PIL.Image.Image, torch.Tensor)) for i in image): + raise ValueError( + f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor" + ) + + image = torch.cat([prepare_image(i, width, height) for i in image], dim=0) + image = image.to(dtype=image_embeds.dtype, device=device) + + latents = self.movq.encode(image)["latents"] + latents = latents.repeat_interleave(num_images_per_prompt, dim=0) + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + latents = self.prepare_latents( + latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator + ) + for i, t in enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + added_cond_kwargs = {"image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, + t, + latents, + generator=generator, + )[0] + + # post-processing + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 
1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py new file mode 100644 index 000000000000..151312979f81 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_inpainting.py @@ -0,0 +1,531 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from copy import deepcopy +from typing import List, Optional, Union + +import numpy as np +import PIL +import torch +import torch.nn.functional as F +from PIL import Image + +from ...models import UNet2DConditionModel, VQModel +from ...pipelines import DiffusionPipeline +from ...pipelines.pipeline_utils import ImagePipelineOutput +from ...schedulers import DDPMScheduler +from ...utils import ( + is_accelerate_available, + is_accelerate_version, + logging, + randn_tensor, + replace_example_docstring, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22InpaintPipeline, KandinskyV22PriorPipeline + >>> from diffusers.utils import load_image + >>> import torch + >>> import numpy as np + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + + >>> prompt = "a hat" + >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False) + + >>> pipe = KandinskyV22InpaintPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> init_image = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + + >>> mask = np.ones((768, 768), dtype=np.float32) + >>> mask[:250, 250:-250] = 0 + + >>> out = pipe( + ... image=init_image, + ... mask_image=mask, + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... 
) + + >>> image = out.images[0] + >>> image.save("cat_with_hat.png") + ``` +""" + + +# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width +def downscale_height_and_width(height, width, scale_factor=8): + new_height = height // scale_factor**2 + if height % scale_factor**2 != 0: + new_height += 1 + new_width = width // scale_factor**2 + if width % scale_factor**2 != 0: + new_width += 1 + return new_height * scale_factor, new_width * scale_factor + + +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_mask +def prepare_mask(masks): + prepared_masks = [] + for mask in masks: + old_mask = deepcopy(mask) + for i in range(mask.shape[1]): + for j in range(mask.shape[2]): + if old_mask[0][i][j] == 1: + continue + if i != 0: + mask[:, i - 1, j] = 0 + if j != 0: + mask[:, i, j - 1] = 0 + if i != 0 and j != 0: + mask[:, i - 1, j - 1] = 0 + if i != mask.shape[1] - 1: + mask[:, i + 1, j] = 0 + if j != mask.shape[2] - 1: + mask[:, i, j + 1] = 0 + if i != mask.shape[1] - 1 and j != mask.shape[2] - 1: + mask[:, i + 1, j + 1] = 0 + prepared_masks.append(mask) + return torch.stack(prepared_masks, dim=0) + + +# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_inpaint.prepare_mask_and_masked_image +def prepare_mask_and_masked_image(image, mask, height, width): + r""" + Prepares a pair (mask, image) to be consumed by the Kandinsky inpaint pipeline. This means that those inputs will + be converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for + the ``image`` and ``1`` for the ``mask``. + + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (ot the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. 
+ """ + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError(f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not") + + # Batch single image + if image.ndim == 3: + assert image.shape[0] == 3, "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert image.ndim == 4 and mask.ndim == 4, "Image and Mask must have 4 dimensions" + assert image.shape[-2:] == mask.shape[-2:], "Image and Mask must have the same spatial dimensions" + assert image.shape[0] == mask.shape[0], "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError(f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not") + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [i.resize((width, height), resample=Image.BICUBIC, reducing_gap=1) for i in image] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] + mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) + mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + + return mask, image + + +class KandinskyV22InpaintPipeline(DiffusionPipeline): + """ + Pipeline for text-guided image inpainting using Kandinsky2.1 + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + scheduler ([`DDIMScheduler`]): + A scheduler to be used in combination with `unet` to generate image latents. + unet ([`UNet2DConditionModel`]): + Conditional U-Net architecture to denoise the image embedding. 
+ movq ([`VQModel`]): + MoVQ Decoder to generate the image from the latents. + """ + + def __init__( + self, + unet: UNet2DConditionModel, + scheduler: DDPMScheduler, + movq: VQModel, + ): + super().__init__() + + self.register_modules( + unet=unet, + scheduler=scheduler, + movq=movq, + ) + self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.unet, + self.movq, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + # Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.KandinskyV22Pipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared + to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward` + method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with + `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.") + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.unet, self.movq]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + @property + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if not hasattr(self.unet, "_hf_hook"): + return self.device + for module in self.unet.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + image: Union[torch.FloatTensor, PIL.Image.Image], + mask_image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], + negative_image_embeds: Union[torch.FloatTensor, List[torch.FloatTensor]], + height: int = 512, + width: int = 512, + num_inference_steps: int = 100, + guidance_scale: float = 4.0, + num_images_per_prompt: int = 1, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + ): + """ + Args: + Function invoked when calling the pipeline for generation. + image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for text prompt, that will be used to condition the image generation. + image (`PIL.Image.Image`): + `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will + be masked out with `mask_image` and repainted according to `prompt`. + mask_image (`np.array`): + Tensor representing an image batch, to mask `image`. Black pixels in the mask will be repainted, while + white pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single + channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, + so the expected shape would be `(B, H, W, 1)`. + negative_image_embeds (`torch.FloatTensor` or `List[torch.FloatTensor]`): + The clip image embeddings for negative text prompt, will be used to condition the image generation. + height (`int`, *optional*, defaults to 512): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to 512): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. 
+ output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"` + (`np.array`) or `"pt"` (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`~pipelines.ImagePipelineOutput`] or `tuple` + """ + device = self._execution_device + + do_classifier_free_guidance = guidance_scale > 1.0 + + if isinstance(image_embeds, list): + image_embeds = torch.cat(image_embeds, dim=0) + batch_size = image_embeds.shape[0] * num_images_per_prompt + if isinstance(negative_image_embeds, list): + negative_image_embeds = torch.cat(negative_image_embeds, dim=0) + + if do_classifier_free_guidance: + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + negative_image_embeds = negative_image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + image_embeds = torch.cat([negative_image_embeds, image_embeds], dim=0).to(dtype=self.unet.dtype, device=device) + + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps_tensor = self.scheduler.timesteps + + # preprocess image and mask + mask_image, image = prepare_mask_and_masked_image(image, mask_image, height, width) + + image = image.to(dtype=image_embeds.dtype, device=device) + image = self.movq.encode(image)["latents"] + + mask_image = mask_image.to(dtype=image_embeds.dtype, device=device) + + image_shape = tuple(image.shape[-2:]) + mask_image = F.interpolate( + mask_image, + image_shape, + mode="nearest", + ) + mask_image = prepare_mask(mask_image) + masked_image = image * mask_image + + mask_image = mask_image.repeat_interleave(num_images_per_prompt, dim=0) + masked_image = masked_image.repeat_interleave(num_images_per_prompt, dim=0) + if do_classifier_free_guidance: + mask_image = mask_image.repeat(2, 1, 1, 1) + masked_image = masked_image.repeat(2, 1, 1, 1) + + num_channels_latents = self.movq.config.latent_channels + + height, width = downscale_height_and_width(height, width, self.movq_scale_factor) + + # create initial latent + latents = self.prepare_latents( + (batch_size, num_channels_latents, height, width), + image_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + noise = torch.clone(latents) + for i, t in enumerate(self.progress_bar(timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = torch.cat([latent_model_input, masked_image, mask_image], dim=1) + + added_cond_kwargs = {"image_embeds": image_embeds} + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=None, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + if do_classifier_free_guidance: + noise_pred, variance_pred = noise_pred.split(latents.shape[1], dim=1) + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + _, variance_pred_text = variance_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + noise_pred = torch.cat([noise_pred, variance_pred_text], dim=1) + + if not ( + hasattr(self.scheduler.config, "variance_type") + and self.scheduler.config.variance_type in ["learned", "learned_range"] + ): + noise_pred, _ = noise_pred.split(latents.shape[1], dim=1) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + 
noise_pred, + t, + latents, + generator=generator, + )[0] + init_latents_proper = image[:1] + init_mask = mask_image[:1] + + if i < len(timesteps_tensor) - 1: + noise_timestep = timesteps_tensor[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = init_mask * init_latents_proper + (1 - init_mask) * latents + # post-processing + latents = mask_image[:1] * image[:1] + (1 - mask_image[:1]) * latents + image = self.movq.decode(latents, force_not_quantize=True)["sample"] + + if output_type not in ["pt", "np", "pil"]: + raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}") + + if output_type in ["np", "pil"]: + image = image * 0.5 + 0.5 + image = image.clamp(0, 1) + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + + if output_type == "pil": + image = self.numpy_to_pil(image) + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py new file mode 100644 index 000000000000..3b9974a5dd70 --- /dev/null +++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior.py @@ -0,0 +1,541 @@ +from typing import List, Optional, Union + +import PIL +import torch +from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection + +from ...models import PriorTransformer +from ...pipelines import DiffusionPipeline +from ...schedulers import UnCLIPScheduler +from ...utils import ( + is_accelerate_available, + logging, + randn_tensor, + replace_example_docstring, +) +from ..kandinsky import KandinskyPriorPipelineOutput + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorPipeline + >>> import torch + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-prior") + >>> pipe_prior.to("cuda") + >>> prompt = "red cat, 4k photo" + >>> image_emb, negative_image_emb = pipe_prior(prompt).to_tuple() + + >>> pipe = KandinskyV22Pipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder") + >>> pipe.to("cuda") + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=negative_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ).images + >>> image[0].save("cat.png") + ``` +""" + +EXAMPLE_INTERPOLATE_DOC_STRING = """ + Examples: + ```py + >>> from diffusers import KandinskyV22PriorPipeline, KandinskyV22Pipeline + >>> from diffusers.utils import load_image + >>> import PIL + >>> import torch + >>> from torchvision import transforms + + >>> pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ... ) + >>> pipe_prior.to("cuda") + >>> img1 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/cat.png" + ... ) + >>> img2 = load_image( + ... "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg" + ... ) + >>> images_texts = ["a cat", img1, img2] + >>> weights = [0.3, 0.3, 0.4] + >>> out = pipe_prior.interpolate(images_texts, weights) + >>> pipe = KandinskyV22Pipeline.from_pretrained( + ... 
"kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + >>> image = pipe( + ... image_embeds=out.image_embeds, + ... negative_image_embeds=out.negative_image_embeds, + ... height=768, + ... width=768, + ... num_inference_steps=50, + ... ).images[0] + >>> image.save("starry_cat.png") + ``` +""" + + +class KandinskyV22PriorPipeline(DiffusionPipeline): + """ + Pipeline for generating image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + image_processor ([`CLIPImageProcessor`]): + A image_processor to be used to preprocess image from clip. + """ + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModelWithProjection, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: UnCLIPScheduler, + image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + image_encoder=image_encoder, + image_processor=image_processor, + ) + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) + def interpolate( + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prior_prompt: Optional[str] = None, + negative_prompt: Union[str] = "", + guidance_scale: float = 4.0, + device=None, + ): + """ + Function invoked when using the prior pipeline for interpolation. + + Args: + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + list of prompts and images to guide the image generation. + weights: (`List[float]`): + list of weights for each condition in `images_and_prompts` + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. 
+ negative_prior_prompt (`str`, *optional*): + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + device = device or self.device + + if len(images_and_prompts) != len(weights): + raise ValueError( + f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length" + ) + + image_embeddings = [] + for cond, weight in zip(images_and_prompts, weights): + if isinstance(cond, str): + image_emb = self( + cond, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ).image_embeds.unsqueeze(0) + + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): + if isinstance(cond, PIL.Image.Image): + cond = ( + self.image_processor(cond, return_tensors="pt") + .pixel_values[0] + .unsqueeze(0) + .to(dtype=self.image_encoder.dtype, device=device) + ) + + image_emb = self.image_encoder(cond)["image_embeds"].repeat(num_images_per_prompt, 1).unsqueeze(0) + + else: + raise ValueError( + f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}" + ) + + image_embeddings.append(image_emb * weight) + + image_emb = torch.cat(image_embeddings).sum(dim=0) + + out_zero = self( + negative_prompt, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ) + zero_image_emb = out_zero.negative_image_embeds if negative_prompt == "" else out_zero.image_embeds + + return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=zero_image_emb) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.get_zero_embed + def get_zero_embed(self, batch_size=1, device=None): + device = device or self.device + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( + device=device, dtype=self.image_encoder.dtype + ) + zero_image_emb = 
self.image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size, 1) + return zero_image_emb + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.image_encoder, + self.text_encoder, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + @property + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"): + return self.device + for module in self.text_encoder.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( 
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", # pt only + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + if isinstance(prompt, str): + prompt = [prompt] + elif not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif not isinstance(negative_prompt, list) and negative_prompt is not None: + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + # if the negative prompt is defined we double the batch size to + # directly retrieve the negative prompt embedding + if negative_prompt is not None: + prompt = prompt + negative_prompt + negative_prompt = 2 * negative_prompt + + device = self._execution_device + + batch_size = len(prompt) + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + # prior + self.scheduler.set_timesteps(num_inference_steps, device=device) + prior_timesteps_tensor = self.scheduler.timesteps + + embedding_dim = self.prior.config.embedding_dim + + latents = self.prepare_latents( + (batch_size, embedding_dim), + prompt_embeds.dtype, + device, + generator, + latents, + self.scheduler, + ) + + for i, t in enumerate(self.progress_bar(prior_timesteps_tensor)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == prior_timesteps_tensor.shape[0]: + prev_timestep 
= None
+            else:
+                prev_timestep = prior_timesteps_tensor[i + 1]
+
+            latents = self.scheduler.step(
+                predicted_image_embedding,
+                timestep=t,
+                sample=latents,
+                generator=generator,
+                prev_timestep=prev_timestep,
+            ).prev_sample
+
+        latents = self.prior.post_process_latents(latents)
+
+        image_embeddings = latents
+
+        # if a negative prompt was passed, the batch was doubled above, so split the embeddings;
+        # otherwise fall back to a zero image embedding
+        if negative_prompt is None:
+            zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device)
+        else:
+            image_embeddings, zero_embeds = image_embeddings.chunk(2)
+
+        if output_type not in ["pt", "np"]:
+            raise ValueError(f"Only the output types `pt` and `np` are supported, not output_type={output_type}")
+
+        if output_type == "np":
+            image_embeddings = image_embeddings.cpu().numpy()
+            zero_embeds = zero_embeds.cpu().numpy()
+
+        if not return_dict:
+            return (image_embeddings, zero_embeds)
+
+        return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds)
diff --git a/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py
new file mode 100644
index 000000000000..ae46af9c4551
--- /dev/null
+++ b/src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_prior_emb2emb.py
@@ -0,0 +1,605 @@
+from typing import List, Optional, Union
+
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection
+
+from ...models import PriorTransformer
+from ...pipelines import DiffusionPipeline
+from ...schedulers import UnCLIPScheduler
+from ...utils import (
+    is_accelerate_available,
+    logging,
+    randn_tensor,
+    replace_example_docstring,
+)
+from ..kandinsky import KandinskyPriorPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22Pipeline, KandinskyV22PriorEmb2EmbPipeline
+        >>> from diffusers.utils import load_image
+        >>> import torch
+
+        >>> pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+
+        >>> prompt = "red cat, 4k photo"
+        >>> img = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... )
+        >>> image_emb, negative_image_emb = pipe_prior(prompt, image=img, strength=0.2).to_tuple()
+
+        >>> pipe = KandinskyV22Pipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16
+        ... )
+        >>> pipe.to("cuda")
+
+        >>> image = pipe(
+        ...     image_embeds=image_emb,
+        ...     negative_image_embeds=negative_image_emb,
+        ...     height=768,
+        ...     width=768,
+        ...     num_inference_steps=100,
+        ... ).images
+
+        >>> image[0].save("cat.png")
+        ```
+"""
+
+EXAMPLE_INTERPOLATE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from diffusers import KandinskyV22PriorEmb2EmbPipeline, KandinskyV22Pipeline
+        >>> from diffusers.utils import load_image
+        >>> import PIL
+
+        >>> import torch
+        >>> from torchvision import transforms
+
+        >>> pipe_prior = KandinskyV22PriorEmb2EmbPipeline.from_pretrained(
+        ...     "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16
+        ... )
+        >>> pipe_prior.to("cuda")
+
+        >>> img1 = load_image(
+        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+        ...     "/kandinsky/cat.png"
+        ... )
+
+        >>> img2 = load_image(
+        ...
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + ... "/kandinsky/starry_night.jpeg" + ... ) + + >>> images_texts = ["a cat", img1, img2] + >>> weights = [0.3, 0.3, 0.4] + >>> image_emb, zero_image_emb = pipe_prior.interpolate(images_texts, weights) + + >>> pipe = KandinskyV22Pipeline.from_pretrained( + ... "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ... ) + >>> pipe.to("cuda") + + >>> image = pipe( + ... image_embeds=image_emb, + ... negative_image_embeds=zero_image_emb, + ... height=768, + ... width=768, + ... num_inference_steps=150, + ... ).images[0] + + >>> image.save("starry_cat.png") + ``` +""" + + +class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline): + """ + Pipeline for generating image prior for Kandinsky + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + prior ([`PriorTransformer`]): + The canonincal unCLIP prior to approximate the image embedding from the text embedding. + image_encoder ([`CLIPVisionModelWithProjection`]): + Frozen image-encoder. + text_encoder ([`CLIPTextModelWithProjection`]): + Frozen text-encoder. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). + scheduler ([`UnCLIPScheduler`]): + A scheduler to be used in combination with `prior` to generate image embedding. + """ + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModelWithProjection, + text_encoder: CLIPTextModelWithProjection, + tokenizer: CLIPTokenizer, + scheduler: UnCLIPScheduler, + image_processor: CLIPImageProcessor, + ): + super().__init__() + + self.register_modules( + prior=prior, + text_encoder=text_encoder, + tokenizer=tokenizer, + scheduler=scheduler, + image_encoder=image_encoder, + image_processor=image_processor, + ) + + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start:] + + return timesteps, num_inference_steps - t_start + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_INTERPOLATE_DOC_STRING) + def interpolate( + self, + images_and_prompts: List[Union[str, PIL.Image.Image, torch.FloatTensor]], + weights: List[float], + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + negative_prior_prompt: Optional[str] = None, + negative_prompt: Union[str] = "", + guidance_scale: float = 4.0, + device=None, + ): + """ + Function invoked when using the prior pipeline for interpolation. + + Args: + images_and_prompts (`List[Union[str, PIL.Image.Image, torch.FloatTensor]]`): + list of prompts and images to guide the image generation. + weights: (`List[float]`): + list of weights for each condition in `images_and_prompts` + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + negative_prior_prompt (`str`, *optional*): + The prompt not to guide the prior diffusion process. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + negative_prompt (`str` or `List[str]`, *optional*): + The prompt not to guide the image generation. Ignored when not using guidance (i.e., ignored if + `guidance_scale` is less than `1`). + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + device = device or self.device + + if len(images_and_prompts) != len(weights): + raise ValueError( + f"`images_and_prompts` contains {len(images_and_prompts)} items and `weights` contains {len(weights)} items - they should be lists of same length" + ) + + image_embeddings = [] + for cond, weight in zip(images_and_prompts, weights): + if isinstance(cond, str): + image_emb = self( + cond, + num_inference_steps=num_inference_steps, + num_images_per_prompt=num_images_per_prompt, + generator=generator, + latents=latents, + negative_prompt=negative_prior_prompt, + guidance_scale=guidance_scale, + ).image_embeds.unsqueeze(0) + + elif isinstance(cond, (PIL.Image.Image, torch.Tensor)): + image_emb = self._encode_image( + cond, device=device, num_images_per_prompt=num_images_per_prompt + ).unsqueeze(0) + + else: + raise ValueError( + f"`images_and_prompts` can only contains elements to be of type `str`, `PIL.Image.Image` or `torch.Tensor` but is {type(cond)}" + ) + + image_embeddings.append(image_emb * weight) + + image_emb = torch.cat(image_embeddings).sum(dim=0) + + return KandinskyPriorPipelineOutput(image_embeds=image_emb, negative_image_embeds=torch.randn_like(image_emb)) + + def _encode_image( + self, + image: Union[torch.Tensor, List[PIL.Image.Image]], + device, + num_images_per_prompt, + ): + if not isinstance(image, torch.Tensor): + image = self.image_processor(image, return_tensors="pt").pixel_values.to( + dtype=self.image_encoder.dtype, device=device + ) + + image_emb = self.image_encoder(image)["image_embeds"] # B, D + image_emb = image_emb.repeat_interleave(num_images_per_prompt, dim=0) + image_emb.to(device=device) + + return image_emb + + def prepare_latents(self, emb, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + emb = emb.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + init_latents = emb + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = 
torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.get_zero_embed + def get_zero_embed(self, batch_size=1, device=None): + device = device or self.device + zero_img = torch.zeros(1, 3, self.image_encoder.config.image_size, self.image_encoder.config.image_size).to( + device=device, dtype=self.image_encoder.dtype + ) + zero_image_emb = self.image_encoder(zero_img)["image_embeds"] + zero_image_emb = zero_image_emb.repeat(batch_size, 1) + return zero_image_emb + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline.enable_sequential_cpu_offload + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [ + self.image_encoder, + self.text_encoder, + ] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + @property + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._execution_device + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. 
+ """ + if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"): + return self.device + for module in self.text_encoder.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_prior.KandinskyPriorPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + ): + batch_size = len(prompt) if isinstance(prompt, list) else 1 + # get prompt text embeddings + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + text_mask = text_inputs.attention_mask.bool().to(device) + + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length] + + text_encoder_output = self.text_encoder(text_input_ids.to(device)) + + prompt_embeds = text_encoder_output.text_embeds + text_encoder_hidden_states = text_encoder_output.last_hidden_state + + prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0) + text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + text_mask = text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + else: + uncond_tokens = negative_prompt + + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + uncond_text_mask = uncond_input.attention_mask.bool().to(device) + negative_prompt_embeds_text_encoder_output = self.text_encoder(uncond_input.input_ids.to(device)) + + negative_prompt_embeds = negative_prompt_embeds_text_encoder_output.text_embeds + uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state + + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + + seq_len = negative_prompt_embeds.shape[1] + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len) + + seq_len = uncond_text_encoder_hidden_states.shape[1] + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1) + uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + uncond_text_mask = uncond_text_mask.repeat_interleave(num_images_per_prompt, dim=0) + + # done duplicates + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + text_encoder_hidden_states = torch.cat([uncond_text_encoder_hidden_states, text_encoder_hidden_states]) + + text_mask = torch.cat([uncond_text_mask, text_mask]) + + return prompt_embeds, text_encoder_hidden_states, text_mask + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]], + image: Union[torch.Tensor, List[torch.Tensor], PIL.Image.Image, List[PIL.Image.Image]], + strength: float = 0.3, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: int = 1, + num_inference_steps: int = 25, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + guidance_scale: float = 4.0, + output_type: Optional[str] = "pt", # pt only + return_dict: bool = True, + ): + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`): + The prompt or prompts to guide the image generation. + strength (`float`, *optional*, defaults to 0.8): + Conceptually, indicates how much to transform the reference `emb`. Must be between 0 and 1. `image` + will be used as a starting point, adding more noise to it the larger the `strength`. The number of + denoising steps depends on the amount of noise initially added. + emb (`torch.FloatTensor`): + The image embedding. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored + if `guidance_scale` is less than `1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + num_inference_steps (`int`, *optional*, defaults to 100): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + guidance_scale (`float`, *optional*, defaults to 4.0): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + output_type (`str`, *optional*, defaults to `"pt"`): + The output format of the generate image. Choose between: `"np"` (`np.array`) or `"pt"` + (`torch.Tensor`). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple. + + Examples: + + Returns: + [`KandinskyPriorPipelineOutput`] or `tuple` + """ + + if isinstance(prompt, str): + prompt = [prompt] + elif not isinstance(prompt, list): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if isinstance(negative_prompt, str): + negative_prompt = [negative_prompt] + elif not isinstance(negative_prompt, list) and negative_prompt is not None: + raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}") + + # if the negative prompt is defined we double the batch size to + # directly retrieve the negative prompt embedding + if negative_prompt is not None: + prompt = prompt + negative_prompt + negative_prompt = 2 * negative_prompt + + device = self._execution_device + + batch_size = len(prompt) + batch_size = batch_size * num_images_per_prompt + + do_classifier_free_guidance = guidance_scale > 1.0 + prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt( + prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt + ) + + if not isinstance(image, List): + image = [image] + + if isinstance(image[0], torch.Tensor): + image = torch.cat(image, dim=0) + + if isinstance(image, torch.Tensor) and image.ndim == 2: + # allow user to pass image_embeds directly + image_embeds = image.repeat_interleave(num_images_per_prompt, dim=0) + elif isinstance(image, torch.Tensor) and image.ndim != 4: + raise ValueError( + f" if pass `image` as pytorch tensor, or a list of pytorch tensor, please make sure each tensor has shape [batch_size, channels, height, width], currently {image[0].unsqueeze(0).shape}" + ) + else: + image_embeds = self._encode_image(image, device, num_images_per_prompt) + + # prior + self.scheduler.set_timesteps(num_inference_steps, device=device) + + latents = image_embeds + timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device) + latent_timestep = timesteps[:1].repeat(batch_size) + latents = self.prepare_latents( + latents, + latent_timestep, + batch_size // num_images_per_prompt, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) + + for i, t in 
enumerate(self.progress_bar(timesteps)): + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + + predicted_image_embedding = self.prior( + latent_model_input, + timestep=t, + proj_embedding=prompt_embeds, + encoder_hidden_states=text_encoder_hidden_states, + attention_mask=text_mask, + ).predicted_image_embedding + + if do_classifier_free_guidance: + predicted_image_embedding_uncond, predicted_image_embedding_text = predicted_image_embedding.chunk(2) + predicted_image_embedding = predicted_image_embedding_uncond + guidance_scale * ( + predicted_image_embedding_text - predicted_image_embedding_uncond + ) + + if i + 1 == timesteps.shape[0]: + prev_timestep = None + else: + prev_timestep = timesteps[i + 1] + + latents = self.scheduler.step( + predicted_image_embedding, + timestep=t, + sample=latents, + generator=generator, + prev_timestep=prev_timestep, + ).prev_sample + + latents = self.prior.post_process_latents(latents) + + image_embeddings = latents + + # if negative prompt has been defined, we retrieve split the image embedding into two + if negative_prompt is None: + zero_embeds = self.get_zero_embed(latents.shape[0], device=latents.device) + else: + image_embeddings, zero_embeds = image_embeddings.chunk(2) + + if output_type not in ["pt", "np"]: + raise ValueError(f"Only the output types `pt` and `np` are supported not output_type={output_type}") + + if output_type == "np": + image_embeddings = image_embeddings.cpu().numpy() + zero_embeds = zero_embeds.cpu().numpy() + + if not return_dict: + return (image_embeddings, zero_embeds) + + return KandinskyPriorPipelineOutput(image_embeds=image_embeddings, negative_image_embeds=zero_embeds) diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 95a5562b367a..79e28a42f4c6 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -18,6 +18,9 @@ from ...models.dual_transformer_2d import DualTransformer2DModel from ...models.embeddings import ( GaussianFourierProjection, + ImageHintTimeEmbedding, + ImageProjection, + ImageTimeEmbedding, TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, @@ -409,7 +412,12 @@ def __init__( image_embed_dim=cross_attention_dim, cross_attention_dim=cross_attention_dim, ) - + elif encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 + self.encoder_hid_proj = ImageProjection( + image_embed_dim=encoder_hid_dim, + cross_attention_dim=cross_attention_dim, + ) elif encoder_hid_dim_type is not None: raise ValueError( f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." 
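The hunk above and the `addition_embed_type` hunks that follow wire CLIP image embeddings into the UNet along two paths: `encoder_hid_dim_type="image_proj"` projects them into the cross-attention states, while `addition_embed_type="image"` folds them into the time embedding. A minimal sketch of how a decoder UNet configured this way is driven is given below; the tiny sizes mirror the dummy test configuration added later in this patch and are illustrative assumptions, not real Kandinsky 2.2 hyper-parameters.

```py
# Sketch only (not part of the patch): drive a UNet conditioned purely on CLIP image
# embeddings, the way the Kandinsky 2.2 decoder does. All sizes below are assumptions
# copied from the dummy test config added later in this patch.
import torch
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel(
    in_channels=4,
    out_channels=8,  # mean and variance are predicted, hence 2 * in_channels
    block_out_channels=(32, 64),
    layers_per_block=1,
    down_block_types=("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
    up_block_types=("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
    mid_block_type="UNetMidBlock2DSimpleCrossAttn",
    cross_attention_dim=100,
    attention_head_dim=4,
    resnet_time_scale_shift="scale_shift",
    encoder_hid_dim=32,
    encoder_hid_dim_type="image_proj",  # image embeds -> cross-attention states
    addition_embed_type="image",        # image embeds -> additive time embedding
)

image_embeds = torch.randn(1, 32)   # stands in for a CLIP image embedding
latents = torch.randn(1, 4, 16, 16)

# No text hidden states are passed; the UNet builds its conditioning from image_embeds.
noise_pred = unet(
    latents,
    timestep=torch.tensor([10]),
    encoder_hidden_states=None,
    added_cond_kwargs={"image_embeds": image_embeds},
).sample
```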
@@ -465,7 +473,12 @@ def __init__( elif addition_embed_type == "text_time": self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift) self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) - + elif addition_embed_type == "image": + # Kandinsky 2.2 + self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) + elif addition_embed_type == "image_hint": + # Kandinsky 2.2 ControlNet + self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim) elif addition_embed_type is not None: raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") @@ -911,7 +924,7 @@ def forward( if self.config.addition_embed_type == "text": aug_emb = self.add_embedding(encoder_hidden_states) elif self.config.addition_embed_type == "text_image": - # Kadinsky 2.1 - style + # Kandinsky 2.1 - style if "image_embeds" not in added_cond_kwargs: raise ValueError( f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires" @@ -920,7 +933,6 @@ def forward( image_embs = added_cond_kwargs.get("image_embeds") text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states) - aug_emb = self.add_embedding(text_embs, image_embs) elif self.config.addition_embed_type == "text_time": if "text_embeds" not in added_cond_kwargs: @@ -941,6 +953,26 @@ def forward( add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) add_embeds = add_embeds.to(emb.dtype) aug_emb = self.add_embedding(add_embeds) + elif self.config.addition_embed_type == "image": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the" + " keyword argument `image_embeds` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + aug_emb = self.add_embedding(image_embs) + elif self.config.addition_embed_type == "image_hint": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires" + " the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`" + ) + image_embs = added_cond_kwargs.get("image_embeds") + hint = added_cond_kwargs.get("hint") + aug_emb, hint = self.add_embedding(image_embs, hint) + sample = torch.cat([sample, hint], dim=1) emb = emb + aug_emb if aug_emb is not None else emb @@ -959,7 +991,15 @@ def forward( image_embeds = added_cond_kwargs.get("image_embeds") encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds) - + elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj": + # Kandinsky 2.2 - style + if "image_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires" + " the keyword argument `image_embeds` to be passed in `added_conditions`" + ) + image_embeds = added_cond_kwargs.get("image_embeds") + encoder_hidden_states = self.encoder_hid_proj(image_embeds) # 2. 
pre-process sample = self.conv_in(sample) diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py index d44edcb1812a..d7f927658c8a 100644 --- a/src/diffusers/schedulers/scheduling_unclip.py +++ b/src/diffusers/schedulers/scheduling_unclip.py @@ -307,3 +307,27 @@ def step( return (pred_prev_sample,) return UnCLIPSchedulerOutput(prev_sample=pred_prev_sample, pred_original_sample=pred_original_sample) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.FloatTensor, + noise: torch.FloatTensor, + timesteps: torch.IntTensor, + ) -> torch.FloatTensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py index 0dbc8f1f6f99..6d39c0c67d9d 100644 --- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py +++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py @@ -227,6 +227,111 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch", "transformers"]) +class KandinskyV22ControlnetImg2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22ControlnetPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22Img2ImgPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22InpaintPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + 
@classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22Pipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22PriorEmb2EmbPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + +class KandinskyV22PriorPipeline(metaclass=DummyObject): + _backends = ["torch", "transformers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch", "transformers"]) + + @classmethod + def from_config(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch", "transformers"]) + + class LDMTextToImagePipeline(metaclass=DummyObject): _backends = ["torch", "transformers"] diff --git a/tests/pipelines/kandinsky_v22/__init__.py b/tests/pipelines/kandinsky_v22/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky.py b/tests/pipelines/kandinsky_v22/test_kandinsky.py new file mode 100644 index 000000000000..162c96d4b3e2 --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky.py @@ -0,0 +1,254 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
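+# The two test classes below exercise the new Kandinsky 2.2 text-to-image decoder:
+# KandinskyV22PipelineFastTests wires tiny dummy components (an image-conditioned
+# UNet2DConditionModel plus a MoVQ-style VQModel), runs two denoising steps on CPU
+# and checks a fixed output slice, while KandinskyV22PipelineIntegrationTests runs
+# the published fp16 prior and decoder checkpoints on GPU and compares the result
+# against a reference image.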
+ +import gc +import random +import unittest + +import numpy as np +import torch + +from diffusers import DDIMScheduler, KandinskyV22Pipeline, KandinskyV22PriorPipeline, UNet2DConditionModel, VQModel +from diffusers.utils import floats_tensor, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22PipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22Pipeline + params = [ + "image_embeds", + "negative_image_embeds", + ] + batch_params = ["image_embeds", "negative_image_embeds"] + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 4, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, + prediction_type="epsilon", + thresholding=False, + ) + + components = { + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to( + device + ) + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = 
torch.Generator(device=device).manual_seed(seed) + inputs = { + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.6237976, 1.0, 0.36441332, 1.0, 0.70639634, 0.29877186, 0.85652125, 0.5216843, 0.54454046] + ) + + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyV22PipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_text2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_text2img_cat_fp16.npy" + ) + + pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22Pipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + prompt = "red cat, 4k photo" + + generator = torch.Generator(device="cuda").manual_seed(0) + image_emb, zero_image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + negative_prompt="", + ).to_tuple() + + generator = torch.Generator(device="cuda").manual_seed(0) + output = pipeline( + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py new file mode 100644 index 000000000000..a50bdb50a47b --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet.py @@ -0,0 +1,272 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import random +import unittest + +import numpy as np +import torch + +from diffusers import ( + DDIMScheduler, + KandinskyV22ControlnetPipeline, + KandinskyV22PriorPipeline, + UNet2DConditionModel, + VQModel, +) +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22ControlnetPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22ControlnetPipeline + params = ["image_embeds", "negative_image_embeds", "hint"] + batch_params = ["image_embeds", "negative_image_embeds", "hint"] + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 8, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image_hint", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 32, 64, 64], + "down_block_types": [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "AttnDownEncoderBlock2D", + ], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + scheduler = DDIMScheduler( + num_train_timesteps=1000, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, + prediction_type="epsilon", + thresholding=False, + ) + + components = { + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), 
rng=random.Random(seed + 1)).to( + device + ) + + # create hint + hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "hint": hint, + "generator": generator, + "height": 64, + "width": 64, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_controlnet(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.6959826, 0.868279, 0.7558092, 0.68769467, 0.85805804, 0.65977496, 0.44885302, 0.5959111, 0.4251595] + ) + + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyV22ControlnetPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_controlnet(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_controlnet_robotcat_fp16.npy" + ) + + hint = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/hint_image_cat.png" + ) + hint = torch.from_numpy(np.array(hint)).float() / 255.0 + hint = hint.permute(2, 0, 1).unsqueeze(0) + + pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22ControlnetPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + pipeline.set_progress_bar_config(disable=None) + + prompt = "A robot, 4k photo" + + generator = torch.Generator(device="cuda").manual_seed(0) + image_emb, zero_image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + negative_prompt="", + ).to_tuple() + + generator = torch.Generator(device="cuda").manual_seed(0) + output = pipeline( + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + hint=hint, + generator=generator, + num_inference_steps=100, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py new file mode 100644 index 000000000000..9ff2936cbd72 --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_controlnet_img2img.py @@ -0,0 +1,290 @@ +# 
coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image + +from diffusers import ( + DDIMScheduler, + KandinskyV22ControlnetImg2ImgPipeline, + KandinskyV22PriorEmb2EmbPipeline, + UNet2DConditionModel, + VQModel, +) +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22ControlnetImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22ControlnetImg2ImgPipeline + params = ["image_embeds", "negative_image_embeds", "image", "hint"] + batch_params = ["image_embeds", "negative_image_embeds", "image", "hint"] + required_optional_params = [ + "generator", + "height", + "width", + "strength", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 8, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image_hint", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 32, 64, 64], + "down_block_types": [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "AttnDownEncoderBlock2D", + ], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": ["AttnUpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + 
return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + ddim_config = { + "num_train_timesteps": 1000, + "beta_schedule": "linear", + "beta_start": 0.00085, + "beta_end": 0.012, + "clip_sample": False, + "set_alpha_to_one": False, + "steps_offset": 0, + "prediction_type": "epsilon", + "thresholding": False, + } + + scheduler = DDIMScheduler(**ddim_config) + + components = { + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to( + device + ) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + # create hint + hint = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image": init_image, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "hint": hint, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 10, + "guidance_scale": 7.0, + "strength": 0.2, + "output_type": "np", + } + return inputs + + def test_kandinsky_controlnet_img2img(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.54985034, 0.55509365, 0.52561504, 0.5570494, 0.5593818, 0.5263979, 0.50285643, 0.5069846, 0.51196736] + ) + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyV22ControlnetImg2ImgPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_controlnet_img2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_controlnet_img2img_robotcat_fp16.npy" + ) + + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + init_image = init_image.resize((512, 512)) + + hint = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/hint_image_cat.png" + ) + hint = torch.from_numpy(np.array(hint)).float() / 255.0 + hint = hint.permute(2, 0, 1).unsqueeze(0) + + prompt = "A robot, 4k photo" + + pipe_prior = 
KandinskyV22PriorEmb2EmbPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22ControlnetImg2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-controlnet-depth", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + + pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + + image_emb, zero_image_emb = pipe_prior( + prompt, + image=init_image, + strength=0.85, + generator=generator, + negative_prompt="", + ).to_tuple() + + output = pipeline( + image=init_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + hint=hint, + generator=generator, + num_inference_steps=100, + height=512, + width=512, + strength=0.5, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (512, 512, 3) + + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py new file mode 100644 index 000000000000..069854325fd4 --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_img2img.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image + +from diffusers import ( + DDIMScheduler, + KandinskyV22Img2ImgPipeline, + KandinskyV22PriorPipeline, + UNet2DConditionModel, + VQModel, +) +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22Img2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22Img2ImgPipeline + params = ["image_embeds", "negative_image_embeds", "image"] + batch_params = [ + "image_embeds", + "negative_image_embeds", + "image", + ] + required_optional_params = [ + "generator", + "height", + "width", + "strength", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 4, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + ddim_config = { + "num_train_timesteps": 1000, + "beta_schedule": "linear", + "beta_start": 0.00085, + "beta_end": 0.012, + "clip_sample": False, + "set_alpha_to_one": False, + "steps_offset": 0, + "prediction_type": "epsilon", + "thresholding": False, + } + + scheduler = DDIMScheduler(**ddim_config) + + components = { + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), 
rng=random.Random(seed + 1)).to( + device + ) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image": init_image, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 10, + "guidance_scale": 7.0, + "strength": 0.2, + "output_type": "np", + } + return inputs + + def test_kandinsky_img2img(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.6199778, 0.63984406, 0.46145785, 0.62944984, 0.5622215, 0.47306132, 0.47441456, 0.4607606, 0.48719263] + ) + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + +@slow +@require_torch_gpu +class KandinskyV22Img2ImgPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_img2img(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_img2img_frog.npy" + ) + + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + prompt = "A red cartoon frog, 4k" + + pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22Img2ImgPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + + pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb, zero_image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + negative_prompt="", + ).to_tuple() + + output = pipeline( + image=init_image, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + height=768, + width=768, + strength=0.2, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (768, 768, 3) + + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py new file mode 100644 index 000000000000..9be3993acc6f --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_inpaint.py @@ -0,0 +1,287 @@ +# coding=utf-8 +# Copyright 2023 
HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from PIL import Image + +from diffusers import ( + DDIMScheduler, + KandinskyV22InpaintPipeline, + KandinskyV22PriorPipeline, + UNet2DConditionModel, + VQModel, +) +from diffusers.utils import floats_tensor, load_image, load_numpy, slow, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +enable_full_determinism() + + +class KandinskyV22InpaintPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22InpaintPipeline + params = ["image_embeds", "negative_image_embeds", "image", "mask_image"] + batch_params = [ + "image_embeds", + "negative_image_embeds", + "image", + "mask_image", + ] + required_optional_params = [ + "generator", + "height", + "width", + "latents", + "guidance_scale", + "num_inference_steps", + "return_dict", + "guidance_scale", + "num_images_per_prompt", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_unet(self): + torch.manual_seed(0) + + model_kwargs = { + "in_channels": 9, + # Out channels is double in channels because predicts mean and variance + "out_channels": 8, + "addition_embed_type": "image", + "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"), + "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"), + "mid_block_type": "UNetMidBlock2DSimpleCrossAttn", + "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2), + "layers_per_block": 1, + "encoder_hid_dim": self.text_embedder_hidden_size, + "encoder_hid_dim_type": "image_proj", + "cross_attention_dim": self.cross_attention_dim, + "attention_head_dim": 4, + "resnet_time_scale_shift": "scale_shift", + "class_embed_type": None, + } + + model = UNet2DConditionModel(**model_kwargs) + return model + + @property + def dummy_movq_kwargs(self): + return { + "block_out_channels": [32, 64], + "down_block_types": ["DownEncoderBlock2D", "AttnDownEncoderBlock2D"], + "in_channels": 3, + "latent_channels": 4, + "layers_per_block": 1, + "norm_num_groups": 8, + "norm_type": "spatial", + "num_vq_embeddings": 12, + "out_channels": 3, + "up_block_types": [ + "AttnUpDecoderBlock2D", + "UpDecoderBlock2D", + ], + "vq_embed_dim": 4, + } + + @property + def dummy_movq(self): + torch.manual_seed(0) + model = VQModel(**self.dummy_movq_kwargs) + return model + + def get_dummy_components(self): + unet = self.dummy_unet + movq = self.dummy_movq + + scheduler = DDIMScheduler( + 
num_train_timesteps=1000, + beta_schedule="linear", + beta_start=0.00085, + beta_end=0.012, + clip_sample=False, + set_alpha_to_one=False, + steps_offset=1, + prediction_type="epsilon", + thresholding=False, + ) + + components = { + "unet": unet, + "scheduler": scheduler, + "movq": movq, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed)).to(device) + negative_image_embeds = floats_tensor((1, self.text_embedder_hidden_size), rng=random.Random(seed + 1)).to( + device + ) + # create init_image + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + # create mask + mask = np.ones((64, 64), dtype=np.float32) + mask[:32, :32] = 0 + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image": init_image, + "mask_image": mask, + "image_embeds": image_embeds, + "negative_image_embeds": negative_image_embeds, + "generator": generator, + "height": 64, + "width": 64, + "num_inference_steps": 2, + "guidance_scale": 4.0, + "output_type": "np", + } + return inputs + + def test_kandinsky_inpaint(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -3:, -3:, -1] + image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1] + + print(f"image.shape {image.shape}") + + assert image.shape == (1, 64, 64, 3) + + expected_slice = np.array( + [0.50775903, 0.49527195, 0.48824543, 0.50192237, 0.48644906, 0.49373814, 0.4780598, 0.47234827, 0.48327848] + ) + + assert ( + np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_slice.flatten()}" + assert ( + np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + ), f" expected_slice {expected_slice}, but got {image_from_tuple_slice.flatten()}" + + def test_inference_batch_single_identical(self): + super().test_inference_batch_single_identical(expected_max_diff=3e-3) + + +@slow +@require_torch_gpu +class KandinskyV22InpaintPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_kandinsky_inpaint(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/kandinskyv22/kandinskyv22_inpaint_cat_with_hat_fp16.npy" + ) + + init_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png" + ) + mask = np.ones((768, 768), dtype=np.float32) + mask[:250, 250:-250] = 0 + + prompt = "a hat" + + pipe_prior = KandinskyV22PriorPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-prior", torch_dtype=torch.float16 + ) + pipe_prior.to(torch_device) + + pipeline = KandinskyV22InpaintPipeline.from_pretrained( + "kandinsky-community/kandinsky-2-2-decoder-inpaint", torch_dtype=torch.float16 + ) + pipeline = pipeline.to(torch_device) + 
pipeline.set_progress_bar_config(disable=None) + + generator = torch.Generator(device="cpu").manual_seed(0) + image_emb, zero_image_emb = pipe_prior( + prompt, + generator=generator, + num_inference_steps=5, + negative_prompt="", + ).to_tuple() + + output = pipeline( + image=init_image, + mask_image=mask, + image_embeds=image_emb, + negative_image_embeds=zero_image_emb, + generator=generator, + num_inference_steps=100, + height=768, + width=768, + output_type="np", + ) + + image = output.images[0] + + assert image.shape == (768, 768, 3) + + assert_mean_pixel_difference(image, expected_image) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py b/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py new file mode 100644 index 000000000000..1b8cefa91f4e --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_prior.py @@ -0,0 +1,236 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +import torch +from torch import nn +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from diffusers import KandinskyV22PriorPipeline, PriorTransformer, UnCLIPScheduler +from diffusers.utils import torch_device +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps + +from ..test_pipelines_common import PipelineTesterMixin + + +enable_full_determinism() + + +class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22PriorPipeline + params = ["prompt"] + batch_params = ["prompt", "negative_prompt"] + required_optional_params = [ + "num_images_per_prompt", + "generator", + "num_inference_steps", + "latents", + "negative_prompt", + "guidance_scale", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=self.text_embedder_hidden_size, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + return CLIPTextModelWithProjection(config) + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + "attention_head_dim": 12, + "embedding_dim": self.text_embedder_hidden_size, + "num_layers": 1, + } + + model = 
PriorTransformer(**model_kwargs) + # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0 + model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape)) + return model + + @property + def dummy_image_encoder(self): + torch.manual_seed(0) + config = CLIPVisionConfig( + hidden_size=self.text_embedder_hidden_size, + image_size=224, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=14, + ) + + model = CLIPVisionModelWithProjection(config) + return model + + @property + def dummy_image_processor(self): + image_processor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) + + return image_processor + + def get_dummy_components(self): + prior = self.dummy_prior + image_encoder = self.dummy_image_encoder + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + image_processor = self.dummy_image_processor + + scheduler = UnCLIPScheduler( + variance_type="fixed_small_log", + prediction_type="sample", + num_train_timesteps=1000, + clip_sample=True, + clip_sample_range=10.0, + ) + + components = { + "prior": prior, + "image_encoder": image_encoder, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "scheduler": scheduler, + "image_processor": image_processor, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "generator": generator, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_prior(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.image_embeds + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -10:] + image_from_tuple_slice = image_from_tuple[0, -10:] + + assert image.shape == (1, 32) + + expected_slice = np.array( + [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + @skip_mps + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + test_mean_pixel_difference = False + + self._test_inference_batch_single_identical( + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) + + @skip_mps + def test_attention_slicing_forward_pass(self): + test_max_difference = torch_device == "cpu" + test_mean_pixel_difference = False + + self._test_attention_slicing_forward_pass( + test_max_difference=test_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) diff --git a/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py b/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py new 
file mode 100644 index 000000000000..8e8caec181a1 --- /dev/null +++ b/tests/pipelines/kandinsky_v22/test_kandinsky_prior_emb2emb.py @@ -0,0 +1,257 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import unittest + +import numpy as np +import torch +from PIL import Image +from torch import nn +from transformers import ( + CLIPImageProcessor, + CLIPTextConfig, + CLIPTextModelWithProjection, + CLIPTokenizer, + CLIPVisionConfig, + CLIPVisionModelWithProjection, +) + +from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler +from diffusers.utils import floats_tensor, torch_device +from diffusers.utils.testing_utils import enable_full_determinism, skip_mps + +from ..test_pipelines_common import PipelineTesterMixin + + +enable_full_determinism() + + +class KandinskyV22PriorEmb2EmbPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = KandinskyV22PriorEmb2EmbPipeline + params = ["prompt", "image"] + batch_params = ["prompt", "image"] + required_optional_params = [ + "num_images_per_prompt", + "strength", + "generator", + "num_inference_steps", + "latents", + "negative_prompt", + "guidance_scale", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def block_out_channels_0(self): + return self.time_input_dim + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def cross_attention_dim(self): + return 100 + + @property + def dummy_tokenizer(self): + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=self.text_embedder_hidden_size, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + return CLIPTextModelWithProjection(config) + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + "attention_head_dim": 12, + "embedding_dim": self.text_embedder_hidden_size, + "num_layers": 1, + } + + model = PriorTransformer(**model_kwargs) + # clip_std and clip_mean is initialized to be 0 so PriorTransformer.post_process_latents will always return 0 - set clip_std to be 1 so it won't return 0 + model.clip_std = nn.Parameter(torch.ones(model.clip_std.shape)) + return model + + @property + def dummy_image_encoder(self): + torch.manual_seed(0) + config = CLIPVisionConfig( + hidden_size=self.text_embedder_hidden_size, + image_size=224, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=14, + ) + + model = 
CLIPVisionModelWithProjection(config) + return model + + @property + def dummy_image_processor(self): + image_processor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) + + return image_processor + + def get_dummy_components(self): + prior = self.dummy_prior + image_encoder = self.dummy_image_encoder + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + image_processor = self.dummy_image_processor + + scheduler = UnCLIPScheduler( + variance_type="fixed_small_log", + prediction_type="sample", + num_train_timesteps=1000, + clip_sample=True, + clip_sample_range=10.0, + ) + + components = { + "prior": prior, + "image_encoder": image_encoder, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "scheduler": scheduler, + "image_processor": image_processor, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + + image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + image = image.cpu().permute(0, 2, 3, 1)[0] + init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((256, 256)) + + inputs = { + "prompt": "horse", + "image": init_image, + "strength": 0.5, + "generator": generator, + "guidance_scale": 4.0, + "num_inference_steps": 2, + "output_type": "np", + } + return inputs + + def test_kandinsky_prior_emb2emb(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.image_embeds + + image_from_tuple = pipe( + **self.get_dummy_inputs(device), + return_dict=False, + )[0] + + image_slice = image[0, -10:] + image_from_tuple_slice = image_from_tuple[0, -10:] + + assert image.shape == (1, 32) + + expected_slice = np.array( + [ + 0.1071284, + 1.3330271, + 0.61260223, + -0.6691065, + -0.3846852, + -1.0303661, + 0.22716111, + 0.03348901, + 0.30040675, + -0.24805029, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2 + + @skip_mps + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + test_mean_pixel_difference = False + + self._test_inference_batch_single_identical( + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) + + @skip_mps + def test_attention_slicing_forward_pass(self): + test_max_difference = torch_device == "cpu" + test_mean_pixel_difference = False + + self._test_attention_slicing_forward_pass( + test_max_difference=test_max_difference, + test_mean_pixel_difference=test_mean_pixel_difference, + ) From 45f6d52b109604d6754ffefa9e289acd1df92994 Mon Sep 17 00:00:00 2001 From: YiYi Xu Date: Thu, 6 Jul 2023 03:20:42 -1000 Subject: [PATCH 185/199] Add Shap-E (#3742) * refactor prior_transformer adding conversion script add pipeline add step_index from pipeline, + remove permute add zero pad token remove copy from statement for betas_for_alpha_bar function * add * add * update conversion script for renderer model * refactor camera 
a little bit * clean up * style * fix copies * Update src/diffusers/schedulers/scheduling_heun_discrete.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py Co-authored-by: Patrick von Platen * alpha_transform_type * remove step_index argument * remove get_sigmas_karras * remove _yiyi_sigma_to_t * move the rescale prompt_embeds from prior_transformer to pipeline * replace baddbmm with einsum to match origial repo * Revert "replace baddbmm with einsum to match origial repo" This reverts commit 3f6b435d65dad3e5514cad2f5dd9e4419ca78e0b. * add step_index to scale_model_input * Revert "move the rescale prompt_embeds from prior_transformer to pipeline" This reverts commit 5b5a8e6be918fefd114a2945ed89d8e8fa8be21b. * move rescale from prior_transformer to pipeline * correct step_index in scale_model_input * remove print lines * refactor prior - reduce arguments * make style * add prior_image * arg embedding_proj_norm -> norm_embedding_proj * add pre-norm for proj_embedding * move rescale prompt from pipeline to _encode_prompt * add img2img pipeline * style * copies * Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen * Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen * Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen * Update src/diffusers/models/prior_transformer.py add arg: encoder_hid_proj Co-authored-by: Patrick von Platen * Update src/diffusers/models/prior_transformer.py add new config: norm_in_type Co-authored-by: Patrick von Platen * Update src/diffusers/models/prior_transformer.py add new config: added_emb_type Co-authored-by: Patrick von Platen * Update src/diffusers/models/prior_transformer.py rename out_dim -> clip_embed_dim Co-authored-by: Patrick von Platen * Update src/diffusers/models/prior_transformer.py rename config: out_dim -> clip_embed_dim Co-authored-by: Patrick von Platen * Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen * Update src/diffusers/models/prior_transformer.py Co-authored-by: Patrick von Platen * finish refactor prior_tranformer * make style * refactor renderer * fix * make style * refactor img2img * remove params_proj * add test * add upcast_softmax to prior_transformer * enable num_images_per_prompt, add save_gif utility * add * add fast test * make style * add slow test * style * add test for img2img * refactor * enable batching * style * refactor scheduler * update test * style * attempt to solve batch related tests timeout * add doc * Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py Co-authored-by: Patrick von Platen * Update src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py Co-authored-by: Patrick von Platen * hardcode rendering related config * update betas_for_alpha_bar on ddpm_scheduler * fix copies * fix * export_to_gif * style * second attempt to speed up batching tests * add doc page to index * Remove intermediate clipping * 3rd attempt to speed up batching tests * Remvoe time index * simplify scheduler * Fix more * Fix more * fix more * make style * fix schedulers * fix some more tests * finish * add one more test * Apply suggestions from code review Co-authored-by: Sayak Paul Co-authored-by: Pedro Cuenca Co-authored-by: Patrick von Platen * style * apply feedbacks * style * fix copies * add one example * style * add example for img2img * fix doc * fix more doc 
strings * size -> frame_size * style * update doc * style * fix on doc * update repo name * improve the usage example in shap-e img2img * add usage examples in the shap-e docs. * consolidate examples. * minor fix. * update doc * Apply suggestions from code review * Apply suggestions from code review * remove upcast * Make sure background is white * Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py * Apply suggestions from code review * Finish * Apply suggestions from code review * Update src/diffusers/pipelines/shap_e/pipeline_shap_e.py * Make style --------- Co-authored-by: yiyixuxu Co-authored-by: Patrick von Platen Co-authored-by: Sayak Paul Co-authored-by: Pedro Cuenca --- docs/source/en/_toctree.yml | 2 + docs/source/en/api/pipelines/shap_e.mdx | 139 ++++ scripts/convert_shap_e_to_diffusers.py | 594 +++++++++++++++ src/diffusers/__init__.py | 2 + src/diffusers/models/prior_transformer.py | 149 +++- src/diffusers/pipelines/__init__.py | 1 + src/diffusers/pipelines/shap_e/__init__.py | 27 + src/diffusers/pipelines/shap_e/camera.py | 147 ++++ .../pipelines/shap_e/pipeline_shap_e.py | 390 ++++++++++ .../shap_e/pipeline_shap_e_img2img.py | 349 +++++++++ src/diffusers/pipelines/shap_e/renderer.py | 709 ++++++++++++++++++ src/diffusers/schedulers/scheduling_ddim.py | 23 +- .../schedulers/scheduling_ddim_inverse.py | 23 +- .../schedulers/scheduling_ddim_parallel.py | 23 +- src/diffusers/schedulers/scheduling_ddpm.py | 23 +- .../schedulers/scheduling_ddpm_parallel.py | 23 +- .../schedulers/scheduling_deis_multistep.py | 23 +- .../scheduling_dpmsolver_multistep.py | 23 +- .../scheduling_dpmsolver_multistep_inverse.py | 23 +- .../schedulers/scheduling_dpmsolver_sde.py | 44 +- .../scheduling_dpmsolver_singlestep.py | 23 +- .../scheduling_euler_ancestral_discrete.py | 23 +- .../schedulers/scheduling_euler_discrete.py | 23 +- .../schedulers/scheduling_heun_discrete.py | 63 +- .../scheduling_k_dpm_2_ancestral_discrete.py | 44 +- .../schedulers/scheduling_k_dpm_2_discrete.py | 44 +- .../schedulers/scheduling_lms_discrete.py | 23 +- src/diffusers/schedulers/scheduling_pndm.py | 23 +- .../schedulers/scheduling_repaint.py | 23 +- src/diffusers/schedulers/scheduling_unclip.py | 23 +- src/diffusers/utils/__init__.py | 2 +- .../dummy_torch_and_transformers_objects.py | 30 + src/diffusers/utils/testing_utils.py | 15 + tests/pipelines/shap_e/__init__.py | 0 tests/pipelines/shap_e/test_shap_e.py | 265 +++++++ tests/pipelines/shap_e/test_shap_e_img2img.py | 281 +++++++ tests/schedulers/test_scheduler_heun.py | 8 +- 37 files changed, 3534 insertions(+), 116 deletions(-) create mode 100644 docs/source/en/api/pipelines/shap_e.mdx create mode 100644 scripts/convert_shap_e_to_diffusers.py create mode 100644 src/diffusers/pipelines/shap_e/__init__.py create mode 100644 src/diffusers/pipelines/shap_e/camera.py create mode 100644 src/diffusers/pipelines/shap_e/pipeline_shap_e.py create mode 100644 src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py create mode 100644 src/diffusers/pipelines/shap_e/renderer.py create mode 100644 tests/pipelines/shap_e/__init__.py create mode 100644 tests/pipelines/shap_e/test_shap_e.py create mode 100644 tests/pipelines/shap_e/test_shap_e_img2img.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index db9e72a4ea20..470d8c5c189d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -226,6 +226,8 @@ title: Self-Attention Guidance - local: api/pipelines/semantic_stable_diffusion title: Semantic Guidance + - local: 
api/pipelines/shap_e + title: Shap-E - local: api/pipelines/spectrogram_diffusion title: Spectrogram Diffusion - sections:
diff --git a/docs/source/en/api/pipelines/shap_e.mdx b/docs/source/en/api/pipelines/shap_e.mdx
new file mode 100644
index 000000000000..fcb32da31bca
--- /dev/null
+++ b/docs/source/en/api/pipelines/shap_e.mdx
@@ -0,0 +1,139 @@
+
+
+# Shap-E
+
+## Overview
+
+
+The Shap-E model was proposed in [Shap-E: Generating Conditional 3D Implicit Functions](https://arxiv.org/abs/2305.02463) by Alex Nichol and Heewoo Jun from [OpenAI](https://github.com/openai).
+
+The abstract of the paper is the following:
+
+*We present Shap-E, a conditional generative model for 3D assets. Unlike recent work on 3D generative models which produce a single output representation, Shap-E directly generates the parameters of implicit functions that can be rendered as both textured meshes and neural radiance fields. We train Shap-E in two stages: first, we train an encoder that deterministically maps 3D assets into the parameters of an implicit function; second, we train a conditional diffusion model on outputs of the encoder. When trained on a large dataset of paired 3D and text data, our resulting models are capable of generating complex and diverse 3D assets in a matter of seconds. When compared to Point-E, an explicit generative model over point clouds, Shap-E converges faster and reaches comparable or better sample quality despite modeling a higher-dimensional, multi-representation output space.*
+
+The original codebase can be found [here](https://github.com/openai/shap-e).
+
+## Available Pipelines:
+
+| Pipeline | Tasks |
+|---|---|
+| [pipeline_shap_e.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/shap_e/pipeline_shap_e.py) | *Text-to-Image Generation* |
+| [pipeline_shap_e_img2img.py](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py) | *Image-to-Image Generation* |
+
+## Available checkpoints
+
+* [`openai/shap-e`](https://huggingface.co/openai/shap-e)
+* [`openai/shap-e-img2img`](https://huggingface.co/openai/shap-e-img2img)
+
+## Usage Examples
+
+In the following, we will walk you through some examples of how to use Shap-E pipelines to create 3D objects in GIF format.
+
+### Text-to-3D image generation
+
+We can use [`ShapEPipeline`] to create a 3D object based on a text prompt. In this example, we will make a birthday cupcake for the :firecracker: diffusers library's first birthday. The workflow for using the Shap-E text-to-image pipeline is the same as for other text-to-image pipelines in diffusers.
+
+```python
+import torch
+
+from diffusers import DiffusionPipeline
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+repo = "openai/shap-e"
+pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16)
+pipe = pipe.to(device)
+
+guidance_scale = 15.0
+prompt = ["A firecracker", "A birthday cupcake"]
+
+images = pipe(
+    prompt,
+    guidance_scale=guidance_scale,
+    num_inference_steps=64,
+    frame_size=256,
+).images
+```
+
+The output of [`ShapEPipeline`] is a list of lists of image frames. Each list of frames can be used to create a 3D object. Let's use the `export_to_gif` utility function in diffusers to make a 3D cupcake!
+
+```python
+from diffusers.utils import export_to_gif
+
+export_to_gif(images[0], "firecracker_3d.gif")
+export_to_gif(images[1], "cake_3d.gif")
+```
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/firecracker_out.gif)
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/cake_out.gif)
+
+
+### Image-to-Image generation
+
+You can use [`ShapEImg2ImgPipeline`] along with other text-to-image pipelines in diffusers and turn your 2D generation into 3D.
+
+In this example, we will first generate a cheeseburger with a simple prompt, "A cheeseburger, white background".
+
+```python
+from diffusers import DiffusionPipeline
+import torch
+
+pipe_prior = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16)
+pipe_prior.to("cuda")
+
+t2i_pipe = DiffusionPipeline.from_pretrained("kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16)
+t2i_pipe.to("cuda")
+
+prompt = "A cheeseburger, white background"
+
+image_embeds, negative_image_embeds = pipe_prior(prompt, guidance_scale=1.0).to_tuple()
+image = t2i_pipe(
+    prompt,
+    image_embeds=image_embeds,
+    negative_image_embeds=negative_image_embeds,
+).images[0]
+
+image.save("burger.png")
+```
+
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_in.png)
+
+We will then use the Shap-E image-to-image pipeline to turn it into a 3D cheeseburger :)
+
+```python
+from PIL import Image
+from diffusers.utils import export_to_gif
+
+repo = "openai/shap-e-img2img"
+pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16)
+pipe = pipe.to("cuda")
+
+guidance_scale = 3.0
+image = Image.open("burger.png").resize((256, 256))
+
+images = pipe(
+    image,
+    guidance_scale=guidance_scale,
+    num_inference_steps=64,
+    frame_size=256,
+).images
+
+gif_path = export_to_gif(images[0], "burger_3d.gif")
+```
+![img](https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/shap_e/burger_out.gif)
+
+## ShapEPipeline
+[[autodoc]] ShapEPipeline
+ - all
+ - __call__
+
+## ShapEImg2ImgPipeline
+[[autodoc]] ShapEImg2ImgPipeline
+ - all
+ - __call__
\ No newline at end of file
diff --git a/scripts/convert_shap_e_to_diffusers.py b/scripts/convert_shap_e_to_diffusers.py
new file mode 100644
index 000000000000..d92db176f422
--- /dev/null
+++ b/scripts/convert_shap_e_to_diffusers.py
@@ -0,0 +1,594 @@
+import argparse
+import tempfile
+
+import torch
+from accelerate import load_checkpoint_and_dispatch
+
+from diffusers.models.prior_transformer import PriorTransformer
+from diffusers.pipelines.shap_e import ShapERenderer
+
+
+"""
+Example - From the diffusers root directory:
+
+Download weights:
+```sh
+$ wget "https://openaipublic.azureedge.net/main/shap-e/text_cond.pt"
+```
+
+Convert the model:
+```sh
+$ python scripts/convert_shap_e_to_diffusers.py \
+ --prior_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/text_cond.pt \
+ --prior_image_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/image_cond.pt \
+ --transmitter_checkpoint_path /home/yiyi_huggingface_co/shap-e/shap_e_model_cache/transmitter.pt\
+ --dump_path /home/yiyi_huggingface_co/model_repo/shap-e/renderer\
+ --debug renderer
+```
+"""
+
+
+# prior
+
+PRIOR_ORIGINAL_PREFIX = "wrapped"
+
+PRIOR_CONFIG = {
+    "num_attention_heads": 16,
+    "attention_head_dim": 1024 // 16,
+    "num_layers": 24,
+    "embedding_dim": 1024,
+    "num_embeddings": 1024,
+
"additional_embeddings": 0, + "time_embed_act_fn": "gelu", + "norm_in_type": "layer", + "encoder_hid_proj_type": None, + "added_emb_type": None, + "time_embed_dim": 1024 * 4, + "embedding_proj_dim": 768, + "clip_embed_dim": 1024 * 2, +} + + +def prior_model_from_original_config(): + model = PriorTransformer(**PRIOR_CONFIG) + + return model + + +def prior_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # .time_embed.c_fc -> .time_embedding.linear_1 + diffusers_checkpoint.update( + { + "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_fc.weight"], + "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_fc.bias"], + } + ) + + # .time_embed.c_proj -> .time_embedding.linear_2 + diffusers_checkpoint.update( + { + "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_proj.weight"], + "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.time_embed.c_proj.bias"], + } + ) + + # .input_proj -> .proj_in + diffusers_checkpoint.update( + { + "proj_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.input_proj.weight"], + "proj_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.input_proj.bias"], + } + ) + + # .clip_emb -> .embedding_proj + diffusers_checkpoint.update( + { + "embedding_proj.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_embed.weight"], + "embedding_proj.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.clip_embed.bias"], + } + ) + + # .pos_emb -> .positional_embedding + diffusers_checkpoint.update({"positional_embedding": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.pos_emb"][None, :]}) + + # .ln_pre -> .norm_in + diffusers_checkpoint.update( + { + "norm_in.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_pre.weight"], + "norm_in.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_pre.bias"], + } + ) + + # .backbone.resblocks. -> .transformer_blocks. 
+ for idx in range(len(model.transformer_blocks)): + diffusers_transformer_prefix = f"transformer_blocks.{idx}" + original_transformer_prefix = f"{PRIOR_ORIGINAL_PREFIX}.backbone.resblocks.{idx}" + + # .attn -> .attn1 + diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1" + original_attention_prefix = f"{original_transformer_prefix}.attn" + diffusers_checkpoint.update( + prior_attention_to_diffusers( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + original_attention_prefix=original_attention_prefix, + attention_head_dim=model.attention_head_dim, + ) + ) + + # .mlp -> .ff + diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff" + original_ff_prefix = f"{original_transformer_prefix}.mlp" + diffusers_checkpoint.update( + prior_ff_to_diffusers( + checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix + ) + ) + + # .ln_1 -> .norm1 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[ + f"{original_transformer_prefix}.ln_1.weight" + ], + f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"], + } + ) + + # .ln_2 -> .norm3 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[ + f"{original_transformer_prefix}.ln_2.weight" + ], + f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"], + } + ) + + # .ln_post -> .norm_out + diffusers_checkpoint.update( + { + "norm_out.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_post.weight"], + "norm_out.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.ln_post.bias"], + } + ) + + # .output_proj -> .proj_to_clip_embeddings + diffusers_checkpoint.update( + { + "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.output_proj.weight"], + "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_ORIGINAL_PREFIX}.output_proj.bias"], + } + ) + + return diffusers_checkpoint + + +def prior_attention_to_diffusers( + checkpoint, *, diffusers_attention_prefix, original_attention_prefix, attention_head_dim +): + diffusers_checkpoint = {} + + # .c_qkv -> .{to_q, to_k, to_v} + [q_weight, k_weight, v_weight], [q_bias, k_bias, v_bias] = split_attentions( + weight=checkpoint[f"{original_attention_prefix}.c_qkv.weight"], + bias=checkpoint[f"{original_attention_prefix}.c_qkv.bias"], + split=3, + chunk_size=attention_head_dim, + ) + + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_q.weight": q_weight, + f"{diffusers_attention_prefix}.to_q.bias": q_bias, + f"{diffusers_attention_prefix}.to_k.weight": k_weight, + f"{diffusers_attention_prefix}.to_k.bias": k_bias, + f"{diffusers_attention_prefix}.to_v.weight": v_weight, + f"{diffusers_attention_prefix}.to_v.bias": v_bias, + } + ) + + # .c_proj -> .to_out.0 + diffusers_checkpoint.update( + { + f"{diffusers_attention_prefix}.to_out.0.weight": checkpoint[f"{original_attention_prefix}.c_proj.weight"], + f"{diffusers_attention_prefix}.to_out.0.bias": checkpoint[f"{original_attention_prefix}.c_proj.bias"], + } + ) + + return diffusers_checkpoint + + +def prior_ff_to_diffusers(checkpoint, *, diffusers_ff_prefix, original_ff_prefix): + diffusers_checkpoint = { + # .c_fc -> .net.0.proj + f"{diffusers_ff_prefix}.net.{0}.proj.weight": checkpoint[f"{original_ff_prefix}.c_fc.weight"], + f"{diffusers_ff_prefix}.net.{0}.proj.bias": checkpoint[f"{original_ff_prefix}.c_fc.bias"], + # .c_proj -> .net.2 + f"{diffusers_ff_prefix}.net.{2}.weight": 
checkpoint[f"{original_ff_prefix}.c_proj.weight"], + f"{diffusers_ff_prefix}.net.{2}.bias": checkpoint[f"{original_ff_prefix}.c_proj.bias"], + } + + return diffusers_checkpoint + + +# done prior + + +# prior_image (only slightly different from prior) + + +PRIOR_IMAGE_ORIGINAL_PREFIX = "wrapped" + +# Uses default arguments +PRIOR_IMAGE_CONFIG = { + "num_attention_heads": 8, + "attention_head_dim": 1024 // 8, + "num_layers": 24, + "embedding_dim": 1024, + "num_embeddings": 1024, + "additional_embeddings": 0, + "time_embed_act_fn": "gelu", + "norm_in_type": "layer", + "embedding_proj_norm_type": "layer", + "encoder_hid_proj_type": None, + "added_emb_type": None, + "time_embed_dim": 1024 * 4, + "embedding_proj_dim": 1024, + "clip_embed_dim": 1024 * 2, +} + + +def prior_image_model_from_original_config(): + model = PriorTransformer(**PRIOR_IMAGE_CONFIG) + + return model + + +def prior_image_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + + # .time_embed.c_fc -> .time_embedding.linear_1 + diffusers_checkpoint.update( + { + "time_embedding.linear_1.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_fc.weight"], + "time_embedding.linear_1.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_fc.bias"], + } + ) + + # .time_embed.c_proj -> .time_embedding.linear_2 + diffusers_checkpoint.update( + { + "time_embedding.linear_2.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_proj.weight"], + "time_embedding.linear_2.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.time_embed.c_proj.bias"], + } + ) + + # .input_proj -> .proj_in + diffusers_checkpoint.update( + { + "proj_in.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.input_proj.weight"], + "proj_in.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.input_proj.bias"], + } + ) + + # .clip_embed.0 -> .embedding_proj_norm + diffusers_checkpoint.update( + { + "embedding_proj_norm.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.0.weight"], + "embedding_proj_norm.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.0.bias"], + } + ) + + # ..clip_embed.1 -> .embedding_proj + diffusers_checkpoint.update( + { + "embedding_proj.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.1.weight"], + "embedding_proj.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.clip_embed.1.bias"], + } + ) + + # .pos_emb -> .positional_embedding + diffusers_checkpoint.update( + {"positional_embedding": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.pos_emb"][None, :]} + ) + + # .ln_pre -> .norm_in + diffusers_checkpoint.update( + { + "norm_in.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_pre.weight"], + "norm_in.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_pre.bias"], + } + ) + + # .backbone.resblocks. -> .transformer_blocks. 
+ for idx in range(len(model.transformer_blocks)): + diffusers_transformer_prefix = f"transformer_blocks.{idx}" + original_transformer_prefix = f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.backbone.resblocks.{idx}" + + # .attn -> .attn1 + diffusers_attention_prefix = f"{diffusers_transformer_prefix}.attn1" + original_attention_prefix = f"{original_transformer_prefix}.attn" + diffusers_checkpoint.update( + prior_attention_to_diffusers( + checkpoint, + diffusers_attention_prefix=diffusers_attention_prefix, + original_attention_prefix=original_attention_prefix, + attention_head_dim=model.attention_head_dim, + ) + ) + + # .mlp -> .ff + diffusers_ff_prefix = f"{diffusers_transformer_prefix}.ff" + original_ff_prefix = f"{original_transformer_prefix}.mlp" + diffusers_checkpoint.update( + prior_ff_to_diffusers( + checkpoint, diffusers_ff_prefix=diffusers_ff_prefix, original_ff_prefix=original_ff_prefix + ) + ) + + # .ln_1 -> .norm1 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm1.weight": checkpoint[ + f"{original_transformer_prefix}.ln_1.weight" + ], + f"{diffusers_transformer_prefix}.norm1.bias": checkpoint[f"{original_transformer_prefix}.ln_1.bias"], + } + ) + + # .ln_2 -> .norm3 + diffusers_checkpoint.update( + { + f"{diffusers_transformer_prefix}.norm3.weight": checkpoint[ + f"{original_transformer_prefix}.ln_2.weight" + ], + f"{diffusers_transformer_prefix}.norm3.bias": checkpoint[f"{original_transformer_prefix}.ln_2.bias"], + } + ) + + # .ln_post -> .norm_out + diffusers_checkpoint.update( + { + "norm_out.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_post.weight"], + "norm_out.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.ln_post.bias"], + } + ) + + # .output_proj -> .proj_to_clip_embeddings + diffusers_checkpoint.update( + { + "proj_to_clip_embeddings.weight": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.output_proj.weight"], + "proj_to_clip_embeddings.bias": checkpoint[f"{PRIOR_IMAGE_ORIGINAL_PREFIX}.output_proj.bias"], + } + ) + + return diffusers_checkpoint + + +# done prior_image + + +# renderer + +RENDERER_CONFIG = {} + + +def renderer_model_from_original_config(): + model = ShapERenderer(**RENDERER_CONFIG) + + return model + + +RENDERER_MLP_ORIGINAL_PREFIX = "renderer.nerstf" + +RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX = "encoder.params_proj" + + +def renderer_model_original_checkpoint_to_diffusers_checkpoint(model, checkpoint): + diffusers_checkpoint = {} + diffusers_checkpoint.update( + {f"mlp.{k}": checkpoint[f"{RENDERER_MLP_ORIGINAL_PREFIX}.{k}"] for k in model.mlp.state_dict().keys()} + ) + + diffusers_checkpoint.update( + { + f"params_proj.{k}": checkpoint[f"{RENDERER_PARAMS_PROJ_ORIGINAL_PREFIX}.{k}"] + for k in model.params_proj.state_dict().keys() + } + ) + + diffusers_checkpoint.update({"void.background": torch.tensor([0.0, 0.0, 0.0], dtype=torch.float32)}) + + return diffusers_checkpoint + + +# done renderer + + +# TODO maybe document and/or can do more efficiently (build indices in for loop and extract once for each split?) 
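Picking up the TODO above: `split_attentions`, defined next, assumes the fused `c_qkv` rows are laid out head by head as `head_dim` rows of q, then k, then v, and deals successive `chunk_size`-row blocks round-robin into `split` buckets. A hedged, toy-sized sketch of the same computation via reshaping (the sizes are illustrative, not Shap-E's):

```py
import torch

n_heads, head_dim = 2, 3
d = n_heads * head_dim

# toy fused c_qkv weight: each row's value encodes its original row index
fused = torch.arange(3 * d, dtype=torch.float32).unsqueeze(1).repeat(1, d)

# what `split_attentions(weight=fused, bias=..., split=3, chunk_size=head_dim)` computes:
# walk the rows in chunks of `head_dim` and deal them out round-robin to q, k, v
q, k, v = fused.reshape(n_heads, 3, head_dim, d).unbind(dim=1)
q, k, v = (t.reshape(d, d) for t in (q, k, v))

# q holds row-chunks 0 and 3 (rows 0-2 and 9-11), k chunks 1 and 4, v chunks 2 and 5
assert q[:, 0].tolist() == [0.0, 1.0, 2.0, 9.0, 10.0, 11.0]
```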
+def split_attentions(*, weight, bias, split, chunk_size):
+    weights = [None] * split
+    biases = [None] * split
+
+    weights_biases_idx = 0
+
+    for starting_row_index in range(0, weight.shape[0], chunk_size):
+        row_indices = torch.arange(starting_row_index, starting_row_index + chunk_size)
+
+        weight_rows = weight[row_indices, :]
+        bias_rows = bias[row_indices]
+
+        if weights[weights_biases_idx] is None:
+            weights[weights_biases_idx] = weight_rows
+            biases[weights_biases_idx] = bias_rows
+        else:
+            weights[weights_biases_idx] = torch.concat([weights[weights_biases_idx], weight_rows])
+            biases[weights_biases_idx] = torch.concat([biases[weights_biases_idx], bias_rows])
+
+        weights_biases_idx = (weights_biases_idx + 1) % split
+
+    return weights, biases
+
+
+# done unet utils
+
+
+# Driver functions
+
+
+def prior(*, args, checkpoint_map_location):
+    print("loading prior")
+
+    prior_checkpoint = torch.load(args.prior_checkpoint_path, map_location=checkpoint_map_location)
+
+    prior_model = prior_model_from_original_config()
+
+    prior_diffusers_checkpoint = prior_original_checkpoint_to_diffusers_checkpoint(prior_model, prior_checkpoint)
+
+    del prior_checkpoint
+
+    load_prior_checkpoint_to_model(prior_diffusers_checkpoint, prior_model)
+
+    print("done loading prior")
+
+    return prior_model
+
+
+def prior_image(*, args, checkpoint_map_location):
+    print("loading prior_image")
+
+    print(f"load checkpoint from {args.prior_image_checkpoint_path}")
+    prior_checkpoint = torch.load(args.prior_image_checkpoint_path, map_location=checkpoint_map_location)
+
+    prior_model = prior_image_model_from_original_config()
+
+    prior_diffusers_checkpoint = prior_image_original_checkpoint_to_diffusers_checkpoint(prior_model, prior_checkpoint)
+
+    del prior_checkpoint
+
+    load_prior_checkpoint_to_model(prior_diffusers_checkpoint, prior_model)
+
+    print("done loading prior_image")
+
+    return prior_model
+
+
+def renderer(*, args, checkpoint_map_location):
+    print("loading renderer")
+
+    renderer_checkpoint = torch.load(args.transmitter_checkpoint_path, map_location=checkpoint_map_location)
+
+    renderer_model = renderer_model_from_original_config()
+
+    renderer_diffusers_checkpoint = renderer_model_original_checkpoint_to_diffusers_checkpoint(
+        renderer_model, renderer_checkpoint
+    )
+
+    del renderer_checkpoint
+
+    load_checkpoint_to_model(renderer_diffusers_checkpoint, renderer_model, strict=True)
+
+    print("done loading renderer")
+
+    return renderer_model
+
+
+# prior model will expect clip_mean and clip_std, which are missing from the state_dict
+PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"]
+
+
+def load_prior_checkpoint_to_model(checkpoint, model):
+    with tempfile.NamedTemporaryFile() as file:
+        torch.save(checkpoint, file.name)
+        del checkpoint
+        missing_keys, unexpected_keys = model.load_state_dict(torch.load(file.name), strict=False)
+        missing_keys = list(set(missing_keys) - set(PRIOR_EXPECTED_MISSING_KEYS))
+
+        if len(unexpected_keys) > 0:
+            raise ValueError(f"Unexpected keys when loading prior model: {unexpected_keys}")
+
+        if len(missing_keys) > 0:
+            raise ValueError(f"Missing keys when loading prior model: {missing_keys}")
+
+
+def load_checkpoint_to_model(checkpoint, model, strict=False):
+    with tempfile.NamedTemporaryFile() as file:
+        torch.save(checkpoint, file.name)
+        del checkpoint
+        if strict:
+            model.load_state_dict(torch.load(file.name), strict=True)
+        else:
+            load_checkpoint_and_dispatch(model,
file.name, device_map="auto") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.") + + parser.add_argument( + "--prior_checkpoint_path", + default=None, + type=str, + required=False, + help="Path to the prior checkpoint to convert.", + ) + + parser.add_argument( + "--prior_image_checkpoint_path", + default=None, + type=str, + required=False, + help="Path to the prior_image checkpoint to convert.", + ) + + parser.add_argument( + "--transmitter_checkpoint_path", + default=None, + type=str, + required=False, + help="Path to the transmitter checkpoint to convert.", + ) + + parser.add_argument( + "--checkpoint_load_device", + default="cpu", + type=str, + required=False, + help="The device passed to `map_location` when loading checkpoints.", + ) + + parser.add_argument( + "--debug", + default=None, + type=str, + required=False, + help="Only run a specific stage of the convert script. Used for debugging", + ) + + args = parser.parse_args() + + print(f"loading checkpoints to {args.checkpoint_load_device}") + + checkpoint_map_location = torch.device(args.checkpoint_load_device) + + if args.debug is not None: + print(f"debug: only executing {args.debug}") + + if args.debug is None: + print("YiYi TO-DO") + elif args.debug == "prior": + prior_model = prior(args=args, checkpoint_map_location=checkpoint_map_location) + prior_model.save_pretrained(args.dump_path) + elif args.debug == "prior_image": + prior_model = prior_image(args=args, checkpoint_map_location=checkpoint_map_location) + prior_model.save_pretrained(args.dump_path) + elif args.debug == "renderer": + renderer_model = renderer(args=args, checkpoint_map_location=checkpoint_map_location) + renderer_model.save_pretrained(args.dump_path) + else: + raise ValueError(f"unknown debug value : {args.debug}") diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index f425dc13ec2c..a7fc9a36f271 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -149,6 +149,8 @@ LDMTextToImagePipeline, PaintByExamplePipeline, SemanticStableDiffusionPipeline, + ShapEImg2ImgPipeline, + ShapEPipeline, StableDiffusionAttendAndExcitePipeline, StableDiffusionControlNetImg2ImgPipeline, StableDiffusionControlNetInpaintPipeline, diff --git a/src/diffusers/models/prior_transformer.py b/src/diffusers/models/prior_transformer.py index 47785a93e939..9f3c61dd7561 100644 --- a/src/diffusers/models/prior_transformer.py +++ b/src/diffusers/models/prior_transformer.py @@ -34,14 +34,33 @@ class PriorTransformer(ModelMixin, ConfigMixin): num_attention_heads (`int`, *optional*, defaults to 32): The number of heads to use for multi-head attention. attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head. num_layers (`int`, *optional*, defaults to 20): The number of layers of Transformer blocks to use. - embedding_dim (`int`, *optional*, defaults to 768): - The dimension of the CLIP embeddings. Image embeddings and text embeddings are both the same dimension. - num_embeddings (`int`, *optional*, defaults to 77): The max number of CLIP embeddings allowed (the - length of the prompt after it has been tokenized). 
+        embedding_dim (`int`, *optional*, defaults to 768): The dimension of the model input `hidden_states`
+        num_embeddings (`int`, *optional*, defaults to 77):
+            The number of embeddings of the model input `hidden_states`
         additional_embeddings (`int`, *optional*, defaults to 4): The number of additional tokens appended to the
             projected `hidden_states`. The actual length of the used `hidden_states` is `num_embeddings +
             additional_embeddings`.
         dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        time_embed_act_fn (`str`, *optional*, defaults to `"silu"`):
+            The activation function to use to create timestep embeddings.
+        norm_in_type (`str`, *optional*, defaults to `None`): The normalization layer to apply on the hidden states
+            before passing them to the Transformer blocks. Set it to `None` if normalization is not needed.
+        embedding_proj_norm_type (`str`, *optional*, defaults to `None`):
+            The normalization layer to apply on the input `proj_embedding`. Set it to `None` if normalization is not
+            needed.
+        encoder_hid_proj_type (`str`, *optional*, defaults to `"linear"`):
+            The projection layer to apply on the input `encoder_hidden_states`. Set it to `None` if
+            `encoder_hidden_states` is `None`.
+        added_emb_type (`str`, *optional*, defaults to `"prd"`): Additional embeddings to condition the model.
+            Choose from `"prd"` or `None`. If `"prd"`, a token indicating the (quantized) dot product between the
+            text embedding and the image embedding is prepended, as proposed in the unCLIP paper
+            https://arxiv.org/abs/2204.06125. If it is `None`, no additional embeddings will be prepended.
+        time_embed_dim (`int`, *optional*, defaults to `None`): The dimension of timestep embeddings.
+            If `None`, will be set to `num_attention_heads * attention_head_dim`.
+        embedding_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `proj_embedding`. If `None`, will be set to `embedding_dim`.
+        clip_embed_dim (`int`, *optional*, defaults to `None`):
+            The dimension of the output. If `None`, will be set to `embedding_dim`.
""" @register_to_config @@ -54,6 +73,14 @@ def __init__( num_embeddings=77, additional_embeddings=4, dropout: float = 0.0, + time_embed_act_fn: str = "silu", + norm_in_type: Optional[str] = None, # layer + embedding_proj_norm_type: Optional[str] = None, # layer + encoder_hid_proj_type: Optional[str] = "linear", # linear + added_emb_type: Optional[str] = "prd", # prd + time_embed_dim: Optional[int] = None, + embedding_proj_dim: Optional[int] = None, + clip_embed_dim: Optional[int] = None, ): super().__init__() self.num_attention_heads = num_attention_heads @@ -61,17 +88,41 @@ def __init__( inner_dim = num_attention_heads * attention_head_dim self.additional_embeddings = additional_embeddings + time_embed_dim = time_embed_dim or inner_dim + embedding_proj_dim = embedding_proj_dim or embedding_dim + clip_embed_dim = clip_embed_dim or embedding_dim + self.time_proj = Timesteps(inner_dim, True, 0) - self.time_embedding = TimestepEmbedding(inner_dim, inner_dim) + self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, out_dim=inner_dim, act_fn=time_embed_act_fn) self.proj_in = nn.Linear(embedding_dim, inner_dim) - self.embedding_proj = nn.Linear(embedding_dim, inner_dim) - self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim) + if embedding_proj_norm_type is None: + self.embedding_proj_norm = None + elif embedding_proj_norm_type == "layer": + self.embedding_proj_norm = nn.LayerNorm(embedding_proj_dim) + else: + raise ValueError(f"unsupported embedding_proj_norm_type: {embedding_proj_norm_type}") + + self.embedding_proj = nn.Linear(embedding_proj_dim, inner_dim) + + if encoder_hid_proj_type is None: + self.encoder_hidden_states_proj = None + elif encoder_hid_proj_type == "linear": + self.encoder_hidden_states_proj = nn.Linear(embedding_dim, inner_dim) + else: + raise ValueError(f"unsupported encoder_hid_proj_type: {encoder_hid_proj_type}") self.positional_embedding = nn.Parameter(torch.zeros(1, num_embeddings + additional_embeddings, inner_dim)) - self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim)) + if added_emb_type == "prd": + self.prd_embedding = nn.Parameter(torch.zeros(1, 1, inner_dim)) + elif added_emb_type is None: + self.prd_embedding = None + else: + raise ValueError( + f"`added_emb_type`: {added_emb_type} is not supported. Make sure to choose one of `'prd'` or `None`." + ) self.transformer_blocks = nn.ModuleList( [ @@ -87,8 +138,16 @@ def __init__( ] ) + if norm_in_type == "layer": + self.norm_in = nn.LayerNorm(inner_dim) + elif norm_in_type is None: + self.norm_in = None + else: + raise ValueError(f"Unsupported norm_in_type: {norm_in_type}.") + self.norm_out = nn.LayerNorm(inner_dim) - self.proj_to_clip_embeddings = nn.Linear(inner_dim, embedding_dim) + + self.proj_to_clip_embeddings = nn.Linear(inner_dim, clip_embed_dim) causal_attention_mask = torch.full( [num_embeddings + additional_embeddings, num_embeddings + additional_embeddings], -10000.0 @@ -97,8 +156,8 @@ def __init__( causal_attention_mask = causal_attention_mask[None, ...] 
self.register_buffer("causal_attention_mask", causal_attention_mask, persistent=False) - self.clip_mean = nn.Parameter(torch.zeros(1, embedding_dim)) - self.clip_std = nn.Parameter(torch.zeros(1, embedding_dim)) + self.clip_mean = nn.Parameter(torch.zeros(1, clip_embed_dim)) + self.clip_std = nn.Parameter(torch.zeros(1, clip_embed_dim)) @property # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors @@ -172,7 +231,7 @@ def forward( hidden_states, timestep: Union[torch.Tensor, float, int], proj_embedding: torch.FloatTensor, - encoder_hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.BoolTensor] = None, return_dict: bool = True, ): @@ -217,23 +276,61 @@ def forward( timesteps_projected = timesteps_projected.to(dtype=self.dtype) time_embeddings = self.time_embedding(timesteps_projected) + if self.embedding_proj_norm is not None: + proj_embedding = self.embedding_proj_norm(proj_embedding) + proj_embeddings = self.embedding_proj(proj_embedding) - encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) + if self.encoder_hidden_states_proj is not None and encoder_hidden_states is not None: + encoder_hidden_states = self.encoder_hidden_states_proj(encoder_hidden_states) + elif self.encoder_hidden_states_proj is not None and encoder_hidden_states is None: + raise ValueError("`encoder_hidden_states_proj` requires `encoder_hidden_states` to be set") + hidden_states = self.proj_in(hidden_states) - prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1) + positional_embeddings = self.positional_embedding.to(hidden_states.dtype) + additional_embeds = [] + additional_embeddings_len = 0 + + if encoder_hidden_states is not None: + additional_embeds.append(encoder_hidden_states) + additional_embeddings_len += encoder_hidden_states.shape[1] + + if len(proj_embeddings.shape) == 2: + proj_embeddings = proj_embeddings[:, None, :] + + if len(hidden_states.shape) == 2: + hidden_states = hidden_states[:, None, :] + + additional_embeds = additional_embeds + [ + proj_embeddings, + time_embeddings[:, None, :], + hidden_states, + ] + + if self.prd_embedding is not None: + prd_embedding = self.prd_embedding.to(hidden_states.dtype).expand(batch_size, -1, -1) + additional_embeds.append(prd_embedding) + hidden_states = torch.cat( - [ - encoder_hidden_states, - proj_embeddings[:, None, :], - time_embeddings[:, None, :], - hidden_states[:, None, :], - prd_embedding, - ], + additional_embeds, dim=1, ) + # Allow positional_embedding to not include the `addtional_embeddings` and instead pad it with zeros for these additional tokens + additional_embeddings_len = additional_embeddings_len + proj_embeddings.shape[1] + 1 + if positional_embeddings.shape[1] < hidden_states.shape[1]: + positional_embeddings = F.pad( + positional_embeddings, + ( + 0, + 0, + additional_embeddings_len, + self.prd_embedding.shape[1] if self.prd_embedding is not None else 0, + ), + value=0.0, + ) + hidden_states = hidden_states + positional_embeddings if attention_mask is not None: @@ -242,11 +339,19 @@ def forward( attention_mask = (attention_mask[:, None, :] + self.causal_attention_mask).to(hidden_states.dtype) attention_mask = attention_mask.repeat_interleave(self.config.num_attention_heads, dim=0) + if self.norm_in is not None: + hidden_states = self.norm_in(hidden_states) + for block in self.transformer_blocks: hidden_states = block(hidden_states, attention_mask=attention_mask) hidden_states 
= self.norm_out(hidden_states) - hidden_states = hidden_states[:, -1] + + if self.prd_embedding is not None: + hidden_states = hidden_states[:, -1] + else: + hidden_states = hidden_states[:, additional_embeddings_len:] + predicted_image_embedding = self.proj_to_clip_embeddings(hidden_states) if not return_dict: diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py index b8bee3299aff..c3968406ed90 100644 --- a/src/diffusers/pipelines/__init__.py +++ b/src/diffusers/pipelines/__init__.py @@ -77,6 +77,7 @@ from .latent_diffusion import LDMTextToImagePipeline from .paint_by_example import PaintByExamplePipeline from .semantic_stable_diffusion import SemanticStableDiffusionPipeline + from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline from .stable_diffusion import ( CycleDiffusionPipeline, StableDiffusionAttendAndExcitePipeline, diff --git a/src/diffusers/pipelines/shap_e/__init__.py b/src/diffusers/pipelines/shap_e/__init__.py new file mode 100644 index 000000000000..04aa1f2f6d78 --- /dev/null +++ b/src/diffusers/pipelines/shap_e/__init__.py @@ -0,0 +1,27 @@ +from ...utils import ( + OptionalDependencyNotAvailable, + is_torch_available, + is_transformers_available, + is_transformers_version, +) + + +try: + if not (is_transformers_available() and is_torch_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from ...utils.dummy_torch_and_transformers_objects import ShapEPipeline +else: + from .camera import create_pan_cameras + from .pipeline_shap_e import ShapEPipeline + from .pipeline_shap_e_img2img import ShapEImg2ImgPipeline + from .renderer import ( + BoundingBoxVolume, + ImportanceRaySampler, + MLPNeRFModelOutput, + MLPNeRSTFModel, + ShapEParamsProjModel, + ShapERenderer, + StratifiedRaySampler, + VoidNeRFModel, + ) diff --git a/src/diffusers/pipelines/shap_e/camera.py b/src/diffusers/pipelines/shap_e/camera.py new file mode 100644 index 000000000000..7ef0d6607022 --- /dev/null +++ b/src/diffusers/pipelines/shap_e/camera.py @@ -0,0 +1,147 @@ +# Copyright 2023 Open AI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
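Before moving on to the camera utilities, a short hedged sketch of what the `PriorTransformer` changes above enable: instantiating the model with the new Shap-E-style options and calling it without `encoder_hidden_states`. The sizes are illustrative (borrowed from the image-prior config in the conversion script), `num_layers` is shrunk to keep the sketch cheap, and this assumes a diffusers build that already contains this patch.

```py
import torch
from diffusers import PriorTransformer

prior = PriorTransformer(
    num_attention_heads=8,
    attention_head_dim=128,
    num_layers=2,  # the converted checkpoints use 24; kept small here
    embedding_dim=1024,
    num_embeddings=1024,
    additional_embeddings=0,
    time_embed_act_fn="gelu",
    norm_in_type="layer",
    encoder_hid_proj_type=None,  # no encoder_hidden_states projection
    added_emb_type=None,  # no prd token
    time_embed_dim=1024 * 4,
    embedding_proj_dim=1024,
    clip_embed_dim=1024 * 2,
)

sample = torch.randn(1, 1024, 1024)  # (batch, num_embeddings, embedding_dim)
emb = torch.randn(1, 1024)  # proj_embedding
out = prior(sample, timestep=1.0, proj_embedding=emb).predicted_image_embedding
print(out.shape)  # expected: torch.Size([1, 1024, 2048]) -- clip_embed_dim on the last axis
```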
+ +from dataclasses import dataclass +from typing import Tuple + +import numpy as np +import torch + + +@dataclass +class DifferentiableProjectiveCamera: + """ + Implements a batch, differentiable, standard pinhole camera + """ + + origin: torch.Tensor # [batch_size x 3] + x: torch.Tensor # [batch_size x 3] + y: torch.Tensor # [batch_size x 3] + z: torch.Tensor # [batch_size x 3] + width: int + height: int + x_fov: float + y_fov: float + shape: Tuple[int] + + def __post_init__(self): + assert self.x.shape[0] == self.y.shape[0] == self.z.shape[0] == self.origin.shape[0] + assert self.x.shape[1] == self.y.shape[1] == self.z.shape[1] == self.origin.shape[1] == 3 + assert len(self.x.shape) == len(self.y.shape) == len(self.z.shape) == len(self.origin.shape) == 2 + + def resolution(self): + return torch.from_numpy(np.array([self.width, self.height], dtype=np.float32)) + + def fov(self): + return torch.from_numpy(np.array([self.x_fov, self.y_fov], dtype=np.float32)) + + def get_image_coords(self) -> torch.Tensor: + """ + :return: coords of shape (width * height, 2) + """ + pixel_indices = torch.arange(self.height * self.width) + coords = torch.stack( + [ + pixel_indices % self.width, + torch.div(pixel_indices, self.width, rounding_mode="trunc"), + ], + axis=1, + ) + return coords + + @property + def camera_rays(self): + batch_size, *inner_shape = self.shape + inner_batch_size = int(np.prod(inner_shape)) + + coords = self.get_image_coords() + coords = torch.broadcast_to(coords.unsqueeze(0), [batch_size * inner_batch_size, *coords.shape]) + rays = self.get_camera_rays(coords) + + rays = rays.view(batch_size, inner_batch_size * self.height * self.width, 2, 3) + + return rays + + def get_camera_rays(self, coords: torch.Tensor) -> torch.Tensor: + batch_size, *shape, n_coords = coords.shape + assert n_coords == 2 + assert batch_size == self.origin.shape[0] + + flat = coords.view(batch_size, -1, 2) + + res = self.resolution() + fov = self.fov() + + fracs = (flat.float() / (res - 1)) * 2 - 1 + fracs = fracs * torch.tan(fov / 2) + + fracs = fracs.view(batch_size, -1, 2) + directions = ( + self.z.view(batch_size, 1, 3) + + self.x.view(batch_size, 1, 3) * fracs[:, :, :1] + + self.y.view(batch_size, 1, 3) * fracs[:, :, 1:] + ) + directions = directions / directions.norm(dim=-1, keepdim=True) + rays = torch.stack( + [ + torch.broadcast_to(self.origin.view(batch_size, 1, 3), [batch_size, directions.shape[1], 3]), + directions, + ], + dim=2, + ) + return rays.view(batch_size, *shape, 2, 3) + + def resize_image(self, width: int, height: int) -> "DifferentiableProjectiveCamera": + """ + Creates a new camera for the resized view assuming the aspect ratio does not change. + """ + assert width * self.height == height * self.width, "The aspect ratio should not change." 
+        return DifferentiableProjectiveCamera(
+            origin=self.origin,
+            x=self.x,
+            y=self.y,
+            z=self.z,
+            width=width,
+            height=height,
+            x_fov=self.x_fov,
+            y_fov=self.y_fov,
+            shape=self.shape,  # `shape` has no default, so it must be forwarded
+        )
+
+
+def create_pan_cameras(size: int) -> DifferentiableProjectiveCamera:
+    origins = []
+    xs = []
+    ys = []
+    zs = []
+    for theta in np.linspace(0, 2 * np.pi, num=20):
+        z = np.array([np.sin(theta), np.cos(theta), -0.5])
+        z /= np.sqrt(np.sum(z**2))
+        origin = -z * 4
+        x = np.array([np.cos(theta), -np.sin(theta), 0.0])
+        y = np.cross(z, x)
+        origins.append(origin)
+        xs.append(x)
+        ys.append(y)
+        zs.append(z)
+    return DifferentiableProjectiveCamera(
+        origin=torch.from_numpy(np.stack(origins, axis=0)).float(),
+        x=torch.from_numpy(np.stack(xs, axis=0)).float(),
+        y=torch.from_numpy(np.stack(ys, axis=0)).float(),
+        z=torch.from_numpy(np.stack(zs, axis=0)).float(),
+        width=size,
+        height=size,
+        x_fov=0.7,
+        y_fov=0.7,
+        shape=(1, len(xs)),
+    )
diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py
new file mode 100644
index 000000000000..5d96fc7bb9f4
--- /dev/null
+++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e.py
@@ -0,0 +1,390 @@
+# Copyright 2023 Open AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import CLIPTextModelWithProjection, CLIPTokenizer
+
+from ...models import PriorTransformer
+from ...pipelines import DiffusionPipeline
+from ...schedulers import HeunDiscreteScheduler
+from ...utils import (
+    BaseOutput,
+    is_accelerate_available,
+    is_accelerate_version,
+    logging,
+    randn_tensor,
+    replace_example_docstring,
+)
+from .renderer import ShapERenderer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+        >>> from diffusers.utils import export_to_gif
+
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        >>> repo = "openai/shap-e"
+        >>> pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16)
+        >>> pipe = pipe.to(device)
+
+        >>> guidance_scale = 15.0
+        >>> prompt = "a shark"
+
+        >>> images = pipe(
+        ...     prompt,
+        ...     guidance_scale=guidance_scale,
+        ...     num_inference_steps=64,
+        ...     frame_size=256,
+        ... ).images
+
+        >>> gif_path = export_to_gif(images[0], "shark_3d.gif")
+        ```
+"""
+
+
+@dataclass
+class ShapEPipelineOutput(BaseOutput):
+    """
+    Output class for ShapEPipeline.
+
+    Args:
+        images (`torch.FloatTensor`):
+            A list of images for 3D rendering.
+    """
+
+    images: Union[List[List[PIL.Image.Image]], List[List[np.ndarray]]]
+
+
+class ShapEPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating the latent representation of a 3D asset and rendering it with the NeRF method, using
+    Shap-E.
+
+    This model inherits from [`DiffusionPipeline`].
+    Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+        text_encoder ([`CLIPTextModelWithProjection`]):
+            Frozen text-encoder.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        scheduler ([`HeunDiscreteScheduler`]):
+            A scheduler to be used in combination with the `prior` to generate the image embedding.
+        renderer ([`ShapERenderer`]):
+            The Shap-E renderer projects the generated latents into parameters of an MLP that is used to create 3D
+            objects with the NeRF rendering method.
+    """
+
+    def __init__(
+        self,
+        prior: PriorTransformer,
+        text_encoder: CLIPTextModelWithProjection,
+        tokenizer: CLIPTokenizer,
+        scheduler: HeunDiscreteScheduler,
+        renderer: ShapERenderer,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            prior=prior,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            scheduler=scheduler,
+            renderer=renderer,
+        )
+
+    # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents
+    def prepare_latents(self, shape, dtype, device, generator, latents, scheduler):
+        if latents is None:
+            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+
+        latents = latents * scheduler.init_noise_sigma
+        return latents
+
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's
+        models have their state dicts saved to CPU and then are moved to a `torch.device('meta')` and loaded to GPU
+        only when their specific submodule has its `forward` method called.
+        """
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        models = [self.text_encoder, self.prior]
+        for cpu_offloaded_model in models:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+
+    def enable_model_cpu_offload(self, gpu_id=0):
+        r"""
+        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
+        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
+        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
+        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `prior`.
+        """
+        if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"):
+            from accelerate import cpu_offload_with_hook
+        else:
+            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")
+
+        device = torch.device(f"cuda:{gpu_id}")
+
+        if self.device.type != "cpu":
+            self.to("cpu", silence_dtype_warnings=True)
+            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+        hook = None
+        for cpu_offloaded_model in [self.text_encoder, self.prior, self.renderer]:
+            _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+        # this pipeline does not register a safety checker, so guard the attribute access
+        if getattr(self, "safety_checker", None) is not None:
+            _, hook = cpu_offload_with_hook(self.safety_checker, device, prev_module_hook=hook)
+
+        # We'll offload the last model manually.
+        self.final_offload_hook = hook
+
+    @property
+    def _execution_device(self):
+        r"""
+        Returns the device on which the pipeline's models will be executed. After calling
+        `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module
+        hooks.
+        """
+        if self.device != torch.device("meta") or not hasattr(self.text_encoder, "_hf_hook"):
+            return self.device
+        for module in self.text_encoder.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+
+    def _encode_prompt(
+        self,
+        prompt,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
+    ):
+        len(prompt) if isinstance(prompt, list) else 1
+
+        # YiYi Notes: set pad_token_id to be 0, not sure why I can't set in the config file
+        self.tokenizer.pad_token_id = 0
+        # get prompt text embeddings
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+
+        text_encoder_output = self.text_encoder(text_input_ids.to(device))
+        prompt_embeds = text_encoder_output.text_embeds
+
+        prompt_embeds = prompt_embeds.repeat_interleave(num_images_per_prompt, dim=0)
+        # in Shap-E the prompt_embeds are normalized here and rescaled again later
+        prompt_embeds = prompt_embeds / torch.linalg.norm(prompt_embeds, dim=-1, keepdim=True)
+
+        if do_classifier_free_guidance:
+            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
+
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
+
+        # Rescale the features to have unit variance
+        prompt_embeds = math.sqrt(prompt_embeds.shape[1]) * prompt_embeds
+
+        return prompt_embeds
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: str,
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        guidance_scale: float = 4.0,
+        frame_size: int = 64,
+        output_type: Optional[str] = "pil",  # pil, np, latent
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`):
+                The prompt or prompts to guide the image generation.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a
+                latents tensor will be generated by sampling using the supplied random `generator`.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. A higher guidance scale encourages the model to generate images that are closely linked to the
+                text `prompt`, usually at the expense of lower image quality.
+            frame_size (`int`, *optional*, defaults to 64):
+                The width and height of each image frame of the generated 3D output.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"latent"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ShapEPipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`ShapEPipelineOutput`] or `tuple`
+        """
+
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+        prompt_embeds = self._encode_prompt(prompt, device, num_images_per_prompt, do_classifier_free_guidance)
+
+        # prior
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        num_embeddings = self.prior.config.num_embeddings
+        embedding_dim = self.prior.config.embedding_dim
+
+        latents = self.prepare_latents(
+            (batch_size, num_embeddings * embedding_dim),
+            prompt_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+
+        # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim
+        latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim)
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            noise_pred = self.prior(
+                scaled_model_input,
+                timestep=t,
+                proj_embedding=prompt_embeds,
+            ).predicted_image_embedding
+
+            # remove the variance
+            noise_pred, _ = noise_pred.split(
+                scaled_model_input.shape[2], dim=2
+            )  # batch_size, num_embeddings, embedding_dim
+
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+
+            latents = self.scheduler.step(
+                noise_pred,
+                timestep=t,
+                sample=latents,
+            ).prev_sample
+
+        if output_type == "latent":
+            return ShapEPipelineOutput(images=latents)
+
+        images = []
+        for i, latent in enumerate(latents):
+            image = self.renderer.decode(
+                latent[None, :],
+                device,
+                size=frame_size,
+                ray_batch_size=4096,
+                n_coarse_samples=64,
+                n_fine_samples=128,
+            )
+            images.append(image)
+
+        images = torch.stack(images)
+
+        if output_type not in ["np", "pil"]:
+            raise ValueError(f"Only the output types `pil` and `np` are supported, not output_type={output_type}")
+
+        images = images.cpu().numpy()
+
+        if output_type == "pil":
+            images = [self.numpy_to_pil(image) for image in images]
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (images,)
+
+        return ShapEPipelineOutput(images=images)
diff --git a/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
new file mode 100644
index 000000000000..b99b808e5953
--- /dev/null
+++ b/src/diffusers/pipelines/shap_e/pipeline_shap_e_img2img.py
@@ -0,0 +1,349 @@
+# Copyright 2023 Open AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import numpy as np
+import PIL
+import torch
+from transformers import CLIPImageProcessor, CLIPVisionModel
+
+from ...models import PriorTransformer
+from ...pipelines import DiffusionPipeline
+from ...schedulers import HeunDiscreteScheduler
+from ...utils import (
+    BaseOutput,
+    is_accelerate_available,
+    logging,
+    randn_tensor,
+    replace_example_docstring,
+)
+from .renderer import ShapERenderer
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```py
+        >>> from PIL import Image
+        >>> import torch
+        >>> from diffusers import DiffusionPipeline
+        >>> from diffusers.utils import export_to_gif, load_image
+
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+        >>> repo = "openai/shap-e-img2img"
+        >>> pipe = DiffusionPipeline.from_pretrained(repo, torch_dtype=torch.float16)
+        >>> pipe = pipe.to(device)
+
+        >>> guidance_scale = 3.0
+        >>> image_url = "https://hf.co/datasets/diffusers/docs-images/resolve/main/shap-e/corgi.png"
+        >>> image = load_image(image_url).convert("RGB")
+
+        >>> images = pipe(
+        ...     image,
+        ...     guidance_scale=guidance_scale,
+        ...     num_inference_steps=64,
+        ...     frame_size=256,
+        ... ).images
+
+        >>> gif_path = export_to_gif(images[0], "corgi_3d.gif")
+        ```
+"""
+
+
+@dataclass
+class ShapEPipelineOutput(BaseOutput):
+    """
+    Output class for [`ShapEImg2ImgPipeline`].
+
+    Args:
+        images (`torch.FloatTensor`):
+            A list of images for 3D rendering.
+    """
+
+    images: Union[PIL.Image.Image, np.ndarray]
+
+
+class ShapEImg2ImgPipeline(DiffusionPipeline):
+    """
+    Pipeline for generating the latent representation of a 3D asset and rendering it with the NeRF method, using
+    Shap-E, conditioned on an input image.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+
+    Args:
+        prior ([`PriorTransformer`]):
+            The canonical unCLIP prior to approximate the latent from the image embedding.
+        image_encoder ([`CLIPVisionModel`]):
+            Frozen image-encoder.
+        image_processor (`CLIPImageProcessor`):
+            A `CLIPImageProcessor` to process the input images.
+        scheduler ([`HeunDiscreteScheduler`]):
+            A scheduler to be used in combination with the `prior` to generate the image embedding.
+ renderer ([`ShapERenderer`]): + Shap-E renderer projects the generated latents into parameters of a MLP that's used to create 3D objects + with the NeRF rendering method + """ + + def __init__( + self, + prior: PriorTransformer, + image_encoder: CLIPVisionModel, + image_processor: CLIPImageProcessor, + scheduler: HeunDiscreteScheduler, + renderer: ShapERenderer, + ): + super().__init__() + + self.register_modules( + prior=prior, + image_encoder=image_encoder, + image_processor=image_processor, + scheduler=scheduler, + renderer=renderer, + ) + + # Copied from diffusers.pipelines.unclip.pipeline_unclip.UnCLIPPipeline.prepare_latents + def prepare_latents(self, shape, dtype, device, generator, latents, scheduler): + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + if latents.shape != shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") + latents = latents.to(device) + + latents = latents * scheduler.init_noise_sigma + return latents + + def enable_sequential_cpu_offload(self, gpu_id=0): + r""" + Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, the pipeline's + models have their state dicts saved to CPU and then are moved to a `torch.device('meta') and loaded to GPU only + when their specific submodule has its `forward` method called. + """ + if is_accelerate_available(): + from accelerate import cpu_offload + else: + raise ImportError("Please install accelerate via `pip install accelerate`") + + device = torch.device(f"cuda:{gpu_id}") + + models = [self.image_encoder, self.prior] + for cpu_offloaded_model in models: + if cpu_offloaded_model is not None: + cpu_offload(cpu_offloaded_model, device) + + @property + def _execution_device(self): + r""" + Returns the device on which the pipeline's models will be executed. After calling + `pipeline.enable_sequential_cpu_offload()` the execution device can only be inferred from Accelerate's module + hooks. + """ + if self.device != torch.device("meta") or not hasattr(self.image_encoder, "_hf_hook"): + return self.device + for module in self.image_encoder.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + return self.device + + def _encode_image( + self, + image, + device, + num_images_per_prompt, + do_classifier_free_guidance, + ): + if isinstance(image, List) and isinstance(image[0], torch.Tensor): + image = torch.cat(image, axis=0) if image[0].ndim == 4 else torch.stack(image, axis=0) + + if not isinstance(image, torch.Tensor): + image = self.image_processor(image, return_tensors="pt").pixel_values[0].unsqueeze(0) + + image = image.to(dtype=self.image_encoder.dtype, device=device) + + image_embeds = self.image_encoder(image)["last_hidden_state"] + image_embeds = image_embeds[:, 1:, :].contiguous() # batch_size, dim, 256 + + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + + if do_classifier_free_guidance: + negative_image_embeds = torch.zeros_like(image_embeds) + + # For classifier free guidance, we need to do two forward passes. 
+            # Here we concatenate the unconditional and image embeddings into a single batch
+            # to avoid doing two forward passes
+            image_embeds = torch.cat([negative_image_embeds, image_embeds])
+
+        return image_embeds
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        image: Union[PIL.Image.Image, List[PIL.Image.Image]],
+        num_images_per_prompt: int = 1,
+        num_inference_steps: int = 25,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        guidance_scale: float = 4.0,
+        frame_size: int = 64,
+        output_type: Optional[str] = "pil",  # pil, np, latent
+        return_dict: bool = True,
+    ):
+        """
+        Function invoked when calling the pipeline for generation.
+
+        Args:
+            image (`PIL.Image.Image` or `List[PIL.Image.Image]`):
+                The image or images to guide the 3D generation.
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            num_inference_steps (`int`, *optional*, defaults to 25):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a
+                latents tensor will be generated by sampling using the supplied random `generator`.
+            guidance_scale (`float`, *optional*, defaults to 4.0):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. A higher guidance scale encourages the model to generate outputs that are closely linked to the
+                input `image`, usually at the expense of lower image quality.
+            frame_size (`int`, *optional*, defaults to 64):
+                The width and height of each image frame of the generated 3D output.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `"pil"` (`PIL.Image.Image`), `"np"`
+                (`np.array`) or `"latent"` (`torch.Tensor`).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`ShapEPipelineOutput`] instead of a plain tuple.
+
+        Examples:
+
+        Returns:
+            [`ShapEPipelineOutput`] or `tuple`
+        """
+
+        if isinstance(image, PIL.Image.Image):
+            batch_size = 1
+        elif isinstance(image, torch.Tensor):
+            batch_size = image.shape[0]
+        elif isinstance(image, list) and isinstance(image[0], (torch.Tensor, PIL.Image.Image)):
+            batch_size = len(image)
+        else:
+            raise ValueError(
+                f"`image` has to be of type `PIL.Image.Image`, `torch.Tensor`, `List[PIL.Image.Image]` or `List[torch.Tensor]` but is {type(image)}"
+            )
+
+        device = self._execution_device
+
+        batch_size = batch_size * num_images_per_prompt
+
+        do_classifier_free_guidance = guidance_scale > 1.0
+        image_embeds = self._encode_image(image, device, num_images_per_prompt, do_classifier_free_guidance)
+
+        # prior
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+
+        num_embeddings = self.prior.config.num_embeddings
+        embedding_dim = self.prior.config.embedding_dim
+
+        latents = self.prepare_latents(
+            (batch_size, num_embeddings * embedding_dim),
+            image_embeds.dtype,
+            device,
+            generator,
+            latents,
+            self.scheduler,
+        )
+
+        # YiYi notes: for testing only to match ldm, we can directly create a latents with desired shape: batch_size, num_embeddings, embedding_dim
+        latents = latents.reshape(latents.shape[0], num_embeddings, embedding_dim)
+
+        for i, t in enumerate(self.progress_bar(timesteps)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            scaled_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+            noise_pred = self.prior(
+                scaled_model_input,
+                timestep=t,
+                proj_embedding=image_embeds,
+            ).predicted_image_embedding
+
+            # remove the variance
+            noise_pred, _ = noise_pred.split(
+                scaled_model_input.shape[2], dim=2
+            )  # batch_size, num_embeddings, embedding_dim
+
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred - noise_pred_uncond)
+
+            latents = self.scheduler.step(
+                noise_pred,
+                timestep=t,
+                sample=latents,
+            ).prev_sample
+
+        if output_type == "latent":
+            return ShapEPipelineOutput(images=latents)
+
+        images = []
+        for i, latent in enumerate(latents):
+            image = self.renderer.decode(
+                latent[None, :],
+                device,
+                size=frame_size,
+                ray_batch_size=4096,
+                n_coarse_samples=64,
+                n_fine_samples=128,
+            )
+
+            images.append(image)
+
+        images = torch.stack(images)
+
+        if output_type not in ["np", "pil"]:
+            raise ValueError(f"Only the output types `pil` and `np` are supported, not output_type={output_type}")
+
+        images = images.cpu().numpy()
+
+        if output_type == "pil":
+            images = [self.numpy_to_pil(image) for image in images]
+
+        # Offload last model to CPU
+        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
+            self.final_offload_hook.offload()
+
+        if not return_dict:
+            return (images,)
+
+        return ShapEPipelineOutput(images=images)
diff --git a/src/diffusers/pipelines/shap_e/renderer.py b/src/diffusers/pipelines/shap_e/renderer.py
new file mode 100644
index 000000000000..8b075e671f63
--- /dev/null
+++ b/src/diffusers/pipelines/shap_e/renderer.py
@@ -0,0 +1,709 @@
+# Copyright 2023 Open AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +from ...configuration_utils import ConfigMixin, register_to_config +from ...models import ModelMixin +from ...utils import BaseOutput +from .camera import create_pan_cameras + + +def sample_pmf(pmf: torch.Tensor, n_samples: int) -> torch.Tensor: + r""" + Sample from the given discrete probability distribution with replacement. + + The i-th bin is assumed to have mass pmf[i]. + + Args: + pmf: [batch_size, *shape, n_samples, 1] where (pmf.sum(dim=-2) == 1).all() + n_samples: number of samples + + Return: + indices sampled with replacement + """ + + *shape, support_size, last_dim = pmf.shape + assert last_dim == 1 + + cdf = torch.cumsum(pmf.view(-1, support_size), dim=1) + inds = torch.searchsorted(cdf, torch.rand(cdf.shape[0], n_samples, device=cdf.device)) + + return inds.view(*shape, n_samples, 1).clamp(0, support_size - 1) + + +def posenc_nerf(x: torch.Tensor, min_deg: int = 0, max_deg: int = 15) -> torch.Tensor: + """ + Concatenate x and its positional encodings, following NeRF. + + Reference: https://arxiv.org/pdf/2210.04628.pdf + """ + if min_deg == max_deg: + return x + + scales = 2.0 ** torch.arange(min_deg, max_deg, dtype=x.dtype, device=x.device) + *shape, dim = x.shape + xb = (x.reshape(-1, 1, dim) * scales.view(1, -1, 1)).reshape(*shape, -1) + assert xb.shape[-1] == dim * (max_deg - min_deg) + emb = torch.cat([xb, xb + math.pi / 2.0], axis=-1).sin() + return torch.cat([x, emb], dim=-1) + + +def encode_position(position): + return posenc_nerf(position, min_deg=0, max_deg=15) + + +def encode_direction(position, direction=None): + if direction is None: + return torch.zeros_like(posenc_nerf(position, min_deg=0, max_deg=8)) + else: + return posenc_nerf(direction, min_deg=0, max_deg=8) + + +def _sanitize_name(x: str) -> str: + return x.replace(".", "__") + + +def integrate_samples(volume_range, ts, density, channels): + r""" + Function integrating the model output. + + Args: + volume_range: Specifies the integral range [t0, t1] + ts: timesteps + density: torch.Tensor [batch_size, *shape, n_samples, 1] + channels: torch.Tensor [batch_size, *shape, n_samples, n_channels] + returns: + channels: integrated rgb output weights: torch.Tensor [batch_size, *shape, n_samples, 1] (density + *transmittance)[i] weight for each rgb output at [..., i, :]. transmittance: transmittance of this volume + ) + """ + + # 1. Calculate the weights + _, _, dt = volume_range.partition(ts) + ddensity = density * dt + + mass = torch.cumsum(ddensity, dim=-2) + transmittance = torch.exp(-mass[..., -1, :]) + + alphas = 1.0 - torch.exp(-ddensity) + Ts = torch.exp(torch.cat([torch.zeros_like(mass[..., :1, :]), -mass[..., :-1, :]], dim=-2)) + # This is the probability of light hitting and reflecting off of + # something at depth [..., i, :]. + weights = alphas * Ts + + # 2. 
Integrate channels + channels = torch.sum(channels * weights, dim=-2) + + return channels, weights, transmittance + + +class VoidNeRFModel(nn.Module): + """ + Implements the default empty space model where all queries are rendered as background. + """ + + def __init__(self, background, channel_scale=255.0): + super().__init__() + background = nn.Parameter(torch.from_numpy(np.array(background)).to(dtype=torch.float32) / channel_scale) + + self.register_buffer("background", background) + + def forward(self, position): + background = self.background[None].to(position.device) + + shape = position.shape[:-1] + ones = [1] * (len(shape) - 1) + n_channels = background.shape[-1] + background = torch.broadcast_to(background.view(background.shape[0], *ones, n_channels), [*shape, n_channels]) + + return background + + +@dataclass +class VolumeRange: + t0: torch.Tensor + t1: torch.Tensor + intersected: torch.Tensor + + def __post_init__(self): + assert self.t0.shape == self.t1.shape == self.intersected.shape + + def partition(self, ts): + """ + Partitions t0 and t1 into n_samples intervals. + + Args: + ts: [batch_size, *shape, n_samples, 1] + + Return: + + lower: [batch_size, *shape, n_samples, 1] upper: [batch_size, *shape, n_samples, 1] delta: [batch_size, + *shape, n_samples, 1] + + where + ts \\in [lower, upper] deltas = upper - lower + """ + + mids = (ts[..., 1:, :] + ts[..., :-1, :]) * 0.5 + lower = torch.cat([self.t0[..., None, :], mids], dim=-2) + upper = torch.cat([mids, self.t1[..., None, :]], dim=-2) + delta = upper - lower + assert lower.shape == upper.shape == delta.shape == ts.shape + return lower, upper, delta + + +class BoundingBoxVolume(nn.Module): + """ + Axis-aligned bounding box defined by the two opposite corners. + """ + + def __init__( + self, + *, + bbox_min, + bbox_max, + min_dist: float = 0.0, + min_t_range: float = 1e-3, + ): + """ + Args: + bbox_min: the left/bottommost corner of the bounding box + bbox_max: the other corner of the bounding box + min_dist: all rays should start at least this distance away from the origin. + """ + super().__init__() + + self.min_dist = min_dist + self.min_t_range = min_t_range + + self.bbox_min = torch.tensor(bbox_min) + self.bbox_max = torch.tensor(bbox_max) + self.bbox = torch.stack([self.bbox_min, self.bbox_max]) + assert self.bbox.shape == (2, 3) + assert min_dist >= 0.0 + assert min_t_range > 0.0 + + def intersect( + self, + origin: torch.Tensor, + direction: torch.Tensor, + t0_lower: Optional[torch.Tensor] = None, + epsilon=1e-6, + ): + """ + Args: + origin: [batch_size, *shape, 3] + direction: [batch_size, *shape, 3] + t0_lower: Optional [batch_size, *shape, 1] lower bound of t0 when intersecting this volume. + params: Optional meta parameters in case Volume is parametric + epsilon: to stabilize calculations + + Return: + A tuple of (t0, t1, intersected) where each has a shape [batch_size, *shape, 1]. If a ray intersects with + the volume, `o + td` is in the volume for all t in [t0, t1]. If the volume is bounded, t1 is guaranteed to + be on the boundary of the volume. + """ + + batch_size, *shape, _ = origin.shape + ones = [1] * len(shape) + bbox = self.bbox.view(1, *ones, 2, 3).to(origin.device) + + def _safe_divide(a, b, epsilon=1e-6): + return a / torch.where(b < 0, b - epsilon, b + epsilon) + + ts = _safe_divide(bbox - origin[..., None, :], direction[..., None, :], epsilon=epsilon) + + # Cases to think about: + # + # 1. t1 <= t0: the ray does not pass through the AABB. + # 2. 
t0 < t1 <= 0: the ray intersects but the BB is behind the origin. + # 3. t0 <= 0 <= t1: the ray starts from inside the BB + # 4. 0 <= t0 < t1: the ray is not inside and intersects with the BB twice. + # + # 1 and 4 are clearly handled from t0 < t1 below. + # Making t0 at least min_dist (>= 0) takes care of 2 and 3. + t0 = ts.min(dim=-2).values.max(dim=-1, keepdim=True).values.clamp(self.min_dist) + t1 = ts.max(dim=-2).values.min(dim=-1, keepdim=True).values + assert t0.shape == t1.shape == (batch_size, *shape, 1) + if t0_lower is not None: + assert t0.shape == t0_lower.shape + t0 = torch.maximum(t0, t0_lower) + + intersected = t0 + self.min_t_range < t1 + t0 = torch.where(intersected, t0, torch.zeros_like(t0)) + t1 = torch.where(intersected, t1, torch.ones_like(t1)) + + return VolumeRange(t0=t0, t1=t1, intersected=intersected) + + +class StratifiedRaySampler(nn.Module): + """ + Instead of fixed intervals, a sample is drawn uniformly at random from each interval. + """ + + def __init__(self, depth_mode: str = "linear"): + """ + :param depth_mode: linear samples ts linearly in depth. harmonic ensures + closer points are sampled more densely. + """ + self.depth_mode = depth_mode + assert self.depth_mode in ("linear", "geometric", "harmonic") + + def sample( + self, + t0: torch.Tensor, + t1: torch.Tensor, + n_samples: int, + epsilon: float = 1e-3, + ) -> torch.Tensor: + """ + Args: + t0: start time has shape [batch_size, *shape, 1] + t1: finish time has shape [batch_size, *shape, 1] + n_samples: number of ts to sample + Return: + sampled ts of shape [batch_size, *shape, n_samples, 1] + """ + ones = [1] * (len(t0.shape) - 1) + ts = torch.linspace(0, 1, n_samples).view(*ones, n_samples).to(t0.dtype).to(t0.device) + + if self.depth_mode == "linear": + ts = t0 * (1.0 - ts) + t1 * ts + elif self.depth_mode == "geometric": + ts = (t0.clamp(epsilon).log() * (1.0 - ts) + t1.clamp(epsilon).log() * ts).exp() + elif self.depth_mode == "harmonic": + # The original NeRF recommends this interpolation scheme for + # spherical scenes, but there could be some weird edge cases when + # the observer crosses from the inner to outer volume. + ts = 1.0 / (1.0 / t0.clamp(epsilon) * (1.0 - ts) + 1.0 / t1.clamp(epsilon) * ts) + + mids = 0.5 * (ts[..., 1:] + ts[..., :-1]) + upper = torch.cat([mids, t1], dim=-1) + lower = torch.cat([t0, mids], dim=-1) + # yiyi notes: add a random seed here for testing, don't forget to remove + torch.manual_seed(0) + t_rand = torch.rand_like(ts) + + ts = lower + (upper - lower) * t_rand + return ts.unsqueeze(-1) + + +class ImportanceRaySampler(nn.Module): + """ + Given the initial estimate of densities, this samples more from regions/bins expected to have objects. + """ + + def __init__( + self, + volume_range: VolumeRange, + ts: torch.Tensor, + weights: torch.Tensor, + blur_pool: bool = False, + alpha: float = 1e-5, + ): + """ + Args: + volume_range: the range in which a ray intersects the given volume. + ts: earlier samples from the coarse rendering step + weights: discretized version of density * transmittance + blur_pool: if true, use 2-tap max + 2-tap blur filter from mip-NeRF. + alpha: small value to add to weights. 
+ """ + self.volume_range = volume_range + self.ts = ts.clone().detach() + self.weights = weights.clone().detach() + self.blur_pool = blur_pool + self.alpha = alpha + + @torch.no_grad() + def sample(self, t0: torch.Tensor, t1: torch.Tensor, n_samples: int) -> torch.Tensor: + """ + Args: + t0: start time has shape [batch_size, *shape, 1] + t1: finish time has shape [batch_size, *shape, 1] + n_samples: number of ts to sample + Return: + sampled ts of shape [batch_size, *shape, n_samples, 1] + """ + lower, upper, _ = self.volume_range.partition(self.ts) + + batch_size, *shape, n_coarse_samples, _ = self.ts.shape + + weights = self.weights + if self.blur_pool: + padded = torch.cat([weights[..., :1, :], weights, weights[..., -1:, :]], dim=-2) + maxes = torch.maximum(padded[..., :-1, :], padded[..., 1:, :]) + weights = 0.5 * (maxes[..., :-1, :] + maxes[..., 1:, :]) + weights = weights + self.alpha + pmf = weights / weights.sum(dim=-2, keepdim=True) + inds = sample_pmf(pmf, n_samples) + assert inds.shape == (batch_size, *shape, n_samples, 1) + assert (inds >= 0).all() and (inds < n_coarse_samples).all() + + t_rand = torch.rand(inds.shape, device=inds.device) + lower_ = torch.gather(lower, -2, inds) + upper_ = torch.gather(upper, -2, inds) + + ts = lower_ + (upper_ - lower_) * t_rand + ts = torch.sort(ts, dim=-2).values + return ts + + +@dataclass +class MLPNeRFModelOutput(BaseOutput): + density: torch.Tensor + signed_distance: torch.Tensor + channels: torch.Tensor + ts: torch.Tensor + + +class MLPNeRSTFModel(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + d_hidden: int = 256, + n_output: int = 12, + n_hidden_layers: int = 6, + act_fn: str = "swish", + insert_direction_at: int = 4, + ): + super().__init__() + + # Instantiate the MLP + + # Find out the dimension of encoded position and direction + dummy = torch.eye(1, 3) + d_posenc_pos = encode_position(position=dummy).shape[-1] + d_posenc_dir = encode_direction(position=dummy).shape[-1] + + mlp_widths = [d_hidden] * n_hidden_layers + input_widths = [d_posenc_pos] + mlp_widths + output_widths = mlp_widths + [n_output] + + if insert_direction_at is not None: + input_widths[insert_direction_at] += d_posenc_dir + + self.mlp = nn.ModuleList([nn.Linear(d_in, d_out) for d_in, d_out in zip(input_widths, output_widths)]) + + if act_fn == "swish": + # self.activation = swish + # yiyi testing: + self.activation = lambda x: F.silu(x) + else: + raise ValueError(f"Unsupported activation function {act_fn}") + + self.sdf_activation = torch.tanh + self.density_activation = torch.nn.functional.relu + self.channel_activation = torch.sigmoid + + def map_indices_to_keys(self, output): + h_map = { + "sdf": (0, 1), + "density_coarse": (1, 2), + "density_fine": (2, 3), + "stf": (3, 6), + "nerf_coarse": (6, 9), + "nerf_fine": (9, 12), + } + + mapped_output = {k: output[..., start:end] for k, (start, end) in h_map.items()} + + return mapped_output + + def forward(self, *, position, direction, ts, nerf_level="coarse"): + h = encode_position(position) + + h_preact = h + h_directionless = None + for i, layer in enumerate(self.mlp): + if i == self.config.insert_direction_at: # 4 in the config + h_directionless = h_preact + h_direction = encode_direction(position, direction=direction) + h = torch.cat([h, h_direction], dim=-1) + + h = layer(h) + + h_preact = h + + if i < len(self.mlp) - 1: + h = self.activation(h) + + h_final = h + if h_directionless is None: + h_directionless = h_preact + + activation = self.map_indices_to_keys(h_final) + + if 
nerf_level == "coarse": + h_density = activation["density_coarse"] + h_channels = activation["nerf_coarse"] + else: + h_density = activation["density_fine"] + h_channels = activation["nerf_fine"] + + density = self.density_activation(h_density) + signed_distance = self.sdf_activation(activation["sdf"]) + channels = self.channel_activation(h_channels) + + # yiyi notes: I think signed_distance is not used + return MLPNeRFModelOutput(density=density, signed_distance=signed_distance, channels=channels, ts=ts) + + +class ChannelsProj(nn.Module): + def __init__( + self, + *, + vectors: int, + channels: int, + d_latent: int, + ): + super().__init__() + self.proj = nn.Linear(d_latent, vectors * channels) + self.norm = nn.LayerNorm(channels) + self.d_latent = d_latent + self.vectors = vectors + self.channels = channels + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x_bvd = x + w_vcd = self.proj.weight.view(self.vectors, self.channels, self.d_latent) + b_vc = self.proj.bias.view(1, self.vectors, self.channels) + h = torch.einsum("bvd,vcd->bvc", x_bvd, w_vcd) + h = self.norm(h) + + h = h + b_vc + return h + + +class ShapEParamsProjModel(ModelMixin, ConfigMixin): + """ + project the latent representation of a 3D asset to obtain weights of a multi-layer perceptron (MLP). + + For more details, see the original paper: + """ + + @register_to_config + def __init__( + self, + *, + param_names: Tuple[str] = ( + "nerstf.mlp.0.weight", + "nerstf.mlp.1.weight", + "nerstf.mlp.2.weight", + "nerstf.mlp.3.weight", + ), + param_shapes: Tuple[Tuple[int]] = ( + (256, 93), + (256, 256), + (256, 256), + (256, 256), + ), + d_latent: int = 1024, + ): + super().__init__() + + # check inputs + if len(param_names) != len(param_shapes): + raise ValueError("Must provide same number of `param_names` as `param_shapes`") + self.projections = nn.ModuleDict({}) + for k, (vectors, channels) in zip(param_names, param_shapes): + self.projections[_sanitize_name(k)] = ChannelsProj( + vectors=vectors, + channels=channels, + d_latent=d_latent, + ) + + def forward(self, x: torch.Tensor): + out = {} + start = 0 + for k, shape in zip(self.config.param_names, self.config.param_shapes): + vectors, _ = shape + end = start + vectors + x_bvd = x[:, start:end] + out[k] = self.projections[_sanitize_name(k)](x_bvd).reshape(len(x), *shape) + start = end + return out + + +class ShapERenderer(ModelMixin, ConfigMixin): + @register_to_config + def __init__( + self, + *, + param_names: Tuple[str] = ( + "nerstf.mlp.0.weight", + "nerstf.mlp.1.weight", + "nerstf.mlp.2.weight", + "nerstf.mlp.3.weight", + ), + param_shapes: Tuple[Tuple[int]] = ( + (256, 93), + (256, 256), + (256, 256), + (256, 256), + ), + d_latent: int = 1024, + d_hidden: int = 256, + n_output: int = 12, + n_hidden_layers: int = 6, + act_fn: str = "swish", + insert_direction_at: int = 4, + background: Tuple[float] = ( + 255.0, + 255.0, + 255.0, + ), + ): + super().__init__() + + self.params_proj = ShapEParamsProjModel( + param_names=param_names, + param_shapes=param_shapes, + d_latent=d_latent, + ) + self.mlp = MLPNeRSTFModel(d_hidden, n_output, n_hidden_layers, act_fn, insert_direction_at) + self.void = VoidNeRFModel(background=background, channel_scale=255.0) + self.volume = BoundingBoxVolume(bbox_max=[1.0, 1.0, 1.0], bbox_min=[-1.0, -1.0, -1.0]) + + @torch.no_grad() + def render_rays(self, rays, sampler, n_samples, prev_model_out=None, render_with_direction=False): + """ + Perform volumetric rendering over a partition of possible t's in the union of rendering volumes (written 
below + with some abuse of notations) + + C(r) := sum( + transmittance(t[i]) * integrate( + lambda t: density(t) * channels(t) * transmittance(t), [t[i], t[i + 1]], + ) for i in range(len(parts)) + ) + transmittance(t[-1]) * void_model(t[-1]).channels + + where + + 1) transmittance(s) := exp(-integrate(density, [t[0], s])) calculates the probability of light passing through + the volume specified by [t[0], s]. (transmittance of 1 means light can pass freely) 2) density and channels are + obtained by evaluating the appropriate part.model at time t. 3) [t[i], t[i + 1]] is defined as the range of t + where the ray intersects (parts[i].volume \\ union(part.volume for part in parts[:i])) at the surface of the + shell (if bounded). If the ray does not intersect, the integral over this segment is evaluated as 0 and + transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1], + math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). + + args: + rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples: + number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including + + :return: A tuple of + - `channels` + - A importance samplers for additional fine-grained rendering + - raw model output + """ + origin, direction = rays[..., 0, :], rays[..., 1, :] + + # Integrate over [t[i], t[i + 1]] + + # 1 Intersect the rays with the current volume and sample ts to integrate along. + vrange = self.volume.intersect(origin, direction, t0_lower=None) + ts = sampler.sample(vrange.t0, vrange.t1, n_samples) + ts = ts.to(rays.dtype) + + if prev_model_out is not None: + # Append the previous ts now before fprop because previous + # rendering used a different model and we can't reuse the output. + ts = torch.sort(torch.cat([ts, prev_model_out.ts], dim=-2), dim=-2).values + + batch_size, *_shape, _t0_dim = vrange.t0.shape + _, *ts_shape, _ts_dim = ts.shape + + # 2. Get the points along the ray and query the model + directions = torch.broadcast_to(direction.unsqueeze(-2), [batch_size, *ts_shape, 3]) + positions = origin.unsqueeze(-2) + ts * directions + + directions = directions.to(self.mlp.dtype) + positions = positions.to(self.mlp.dtype) + + optional_directions = directions if render_with_direction else None + + model_out = self.mlp( + position=positions, + direction=optional_directions, + ts=ts, + nerf_level="coarse" if prev_model_out is None else "fine", + ) + + # 3. Integrate the model results + channels, weights, transmittance = integrate_samples( + vrange, model_out.ts, model_out.density, model_out.channels + ) + + # 4. Clean up results that do not intersect with the volume. + transmittance = torch.where(vrange.intersected, transmittance, torch.ones_like(transmittance)) + channels = torch.where(vrange.intersected, channels, torch.zeros_like(channels)) + # 5. integration to infinity (e.g. [t[-1], math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty). 
+ channels = channels + transmittance * self.void(origin) + + weighted_sampler = ImportanceRaySampler(vrange, ts=model_out.ts, weights=weights) + + return channels, weighted_sampler, model_out + + @torch.no_grad() + def decode( + self, + latents, + device, + size: int = 64, + ray_batch_size: int = 4096, + n_coarse_samples=64, + n_fine_samples=128, + ): + # project the the paramters from the generated latents + projected_params = self.params_proj(latents) + + # update the mlp layers of the renderer + for name, param in self.mlp.state_dict().items(): + if f"nerstf.{name}" in projected_params.keys(): + param.copy_(projected_params[f"nerstf.{name}"].squeeze(0)) + + # create cameras object + camera = create_pan_cameras(size) + rays = camera.camera_rays + rays = rays.to(device) + n_batches = rays.shape[1] // ray_batch_size + + coarse_sampler = StratifiedRaySampler() + + images = [] + + for idx in range(n_batches): + rays_batch = rays[:, idx * ray_batch_size : (idx + 1) * ray_batch_size] + + # render rays with coarse, stratified samples. + _, fine_sampler, coarse_model_out = self.render_rays(rays_batch, coarse_sampler, n_coarse_samples) + # Then, render with additional importance-weighted ray samples. + channels, _, _ = self.render_rays( + rays_batch, fine_sampler, n_fine_samples, prev_model_out=coarse_model_out + ) + + images.append(channels) + + images = torch.cat(images, dim=1) + images = images.view(*camera.shape, camera.height, camera.width, -1).squeeze(0) + + return images diff --git a/src/diffusers/schedulers/scheduling_ddim.py b/src/diffusers/schedulers/scheduling_ddim.py index 99602d14038b..a93255ca600e 100644 --- a/src/diffusers/schedulers/scheduling_ddim.py +++ b/src/diffusers/schedulers/scheduling_ddim.py @@ -47,7 +47,11 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -60,19 +64,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_ddim_inverse.py b/src/diffusers/schedulers/scheduling_ddim_inverse.py index 2c9fc036a027..c04aabe035b5 100644 --- a/src/diffusers/schedulers/scheduling_ddim_inverse.py +++ b/src/diffusers/schedulers/scheduling_ddim_inverse.py @@ -46,7 +46,11 @@ class DDIMSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -59,19 +63,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_ddim_parallel.py b/src/diffusers/schedulers/scheduling_ddim_parallel.py index 8875aa73208b..db3ea0e1cca5 100644 --- a/src/diffusers/schedulers/scheduling_ddim_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddim_parallel.py @@ -47,7 +47,11 @@ class DDIMParallelSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
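
Since the same `betas_for_alpha_bar` change is copied into every scheduler touched by this patch, the effect of the new `exp` transform is worth spelling out once. The following is a standalone sketch that mirrors the hunks above without importing diffusers; N is arbitrary:

    import math

    import torch

    # The "exp" transform uses alpha_bar(t) = exp(-12 t). The ratio
    # alpha_bar(t2) / alpha_bar(t1) depends only on t2 - t1 = 1 / N, so the
    # resulting schedule is a constant beta (the max_beta clamp never triggers here).
    N = 1024

    def alpha_bar_fn(t):
        return math.exp(t * -12.0)

    betas = torch.tensor(
        [min(1 - alpha_bar_fn((i + 1) / N) / alpha_bar_fn(i / N), 0.999) for i in range(N)]
    )
    assert torch.allclose(betas, torch.full_like(betas, 1 - math.exp(-12.0 / N)))
    print(betas[0].item())  # about 0.0117 for N = 1024
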
@@ -60,19 +64,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_ddpm.py b/src/diffusers/schedulers/scheduling_ddpm.py index ddf27d409d88..a1b7d7aaa9c2 100644 --- a/src/diffusers/schedulers/scheduling_ddpm.py +++ b/src/diffusers/schedulers/scheduling_ddpm.py @@ -44,7 +44,11 @@ class DDPMSchedulerOutput(BaseOutput): pred_original_sample: Optional[torch.FloatTensor] = None -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -57,19 +61,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_ddpm_parallel.py b/src/diffusers/schedulers/scheduling_ddpm_parallel.py index e4d858efde8f..a92e175877d2 100644 --- a/src/diffusers/schedulers/scheduling_ddpm_parallel.py +++ b/src/diffusers/schedulers/scheduling_ddpm_parallel.py @@ -46,7 +46,11 @@ class DDPMParallelSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -59,19 +63,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_deis_multistep.py b/src/diffusers/schedulers/scheduling_deis_multistep.py index c504fb19231a..36947294922b 100644 --- a/src/diffusers/schedulers/scheduling_deis_multistep.py +++ b/src/diffusers/schedulers/scheduling_deis_multistep.py @@ -26,7 +26,11 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
@@ -39,19 +43,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py index 528b7b838b1c..d7516fa601e1 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py @@ -26,7 +26,11 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -39,19 +43,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py index b424ebbff262..a6736b354419 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_multistep_inverse.py @@ -26,7 +26,11 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -39,19 +43,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py index da8b71788b75..a31e97b69651 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_sde.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_sde.py @@ -13,6 +13,7 @@ # limitations under the License. 
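
As a sanity check on the discretization shared by all of these hunks, the per-step betas telescope back to the continuous alpha_bar curve, up to normalization by alpha_bar(0) and the max_beta clamp on the final step. A small verification sketch using the cosine transform; N is arbitrary:

    import math

    import torch

    N = 1000
    max_beta = 0.999

    def alpha_bar(t):
        return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2

    betas = torch.tensor(
        [min(1 - alpha_bar((i + 1) / N) / alpha_bar(i / N), max_beta) for i in range(N)],
        dtype=torch.float64,
    )
    alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
    target = torch.tensor(
        [alpha_bar((i + 1) / N) / alpha_bar(0) for i in range(N)], dtype=torch.float64
    )

    # Only the last, clamped step deviates from the continuous curve.
    print(torch.allclose(alphas_cumprod[:-1], target[:-1]))
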
import math +from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -76,7 +77,11 @@ def __call__(self, sigma, sigma_next): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -89,19 +94,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) @@ -190,10 +206,16 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - if self.state_in_first_order: - pos = -1 + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + if len(self._index_counter) == 0: + pos = 1 if len(indices) > 1 else 0 else: - pos = 0 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + pos = self._index_counter[timestep_int] + return indices[pos].item() @property @@ -292,6 +314,10 @@ def set_timesteps( self.sample = None self.mid_point_sigma = None + # for exp beta schedules, such as the one for `pipeline_shap_e.py` + # we need an index counter + self._index_counter = defaultdict(int) + def _second_order_timesteps(self, sigmas, log_sigmas): def sigma_fn(_t): return np.exp(-_t) @@ -373,6 +399,10 @@ def step( """ step_index = self.index_for_timestep(timestep) + # advance index counter by 1 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + self._index_counter[timestep_int] += 1 + # Create a noise sampler if it hasn't been created yet if self.noise_sampler is None: min_sigma, max_sigma = self.sigmas[self.sigmas > 0].min(), self.sigmas.max() diff --git a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py index 721dd5e5bb85..93975a27fc6e 100644 --- a/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py +++ b/src/diffusers/schedulers/scheduling_dpmsolver_singlestep.py @@ -29,7 +29,11 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -42,19 +46,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py index 6b8c2f1a8a28..065f657032e6 100644 --- a/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_ancestral_discrete.py @@ -47,7 +47,11 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -60,19 +64,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_euler_discrete.py b/src/diffusers/schedulers/scheduling_euler_discrete.py index fc52c50ebc7f..cb126d4b953c 100644 --- a/src/diffusers/schedulers/scheduling_euler_discrete.py +++ b/src/diffusers/schedulers/scheduling_euler_discrete.py @@ -47,7 +47,11 @@ class EulerDiscreteSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -60,19 +64,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_heun_discrete.py b/src/diffusers/schedulers/scheduling_heun_discrete.py index 28f29067a544..5f694fd60fc9 100644 --- a/src/diffusers/schedulers/scheduling_heun_discrete.py +++ b/src/diffusers/schedulers/scheduling_heun_discrete.py @@ -13,6 +13,7 @@ # limitations under the License. 
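
The `index_for_timestep` rework above, repeated below for the Heun and KDPM2 schedulers, exists because second-order schedulers list most timesteps twice (one entry per model evaluation), so a fixed first-or-last lookup can reuse or skip a sigma. A toy sketch of the counter-based lookup with made-up timesteps; the real schedulers advance the counter inside `step`:

    from collections import defaultdict

    import torch

    # Heun-style schedule: the first timestep occurs once, later ones twice.
    schedule_timesteps = torch.tensor([999, 500, 500, 1, 1])
    index_counter = defaultdict(int)

    def index_for_timestep(timestep):
        indices = (schedule_timesteps == timestep).nonzero()
        if len(index_counter) == 0:
            # Very first step: take the second occurrence (or the only one) so that
            # starting mid-schedule, as in image-to-image, does not skip a sigma.
            pos = 1 if len(indices) > 1 else 0
        else:
            pos = index_counter[timestep]
        return indices[pos].item()

    for t in schedule_timesteps.tolist():
        print(t, index_for_timestep(t))  # sigma indices 0, 1, 2, 3, 4
        index_counter[t] += 1
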
import math +from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -23,7 +24,11 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -36,19 +41,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) @@ -74,6 +90,10 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin): prediction type of the scheduler function, one of `epsilon` (predicting the noise of the diffusion process), `sample` (directly predicting the noisy sample`) or `v_prediction` (see section 2.4 https://imagen.research.google/video/paper.pdf). + clip_sample (`bool`, default `True`): + option to clip predicted sample for numerical stability. + clip_sample_range (`float`, default `1.0`): + the maximum magnitude for sample clipping. Valid only when `clip_sample=True`. use_karras_sigmas (`bool`, *optional*, defaults to `False`): This parameter controls whether to use Karras sigmas (Karras et al. (2022) scheme) for step sizes in the noise schedule during the sampling process. 
If True, the sigmas will be determined according to a sequence @@ -100,6 +120,8 @@ def __init__( trained_betas: Optional[Union[np.ndarray, List[float]]] = None, prediction_type: str = "epsilon", use_karras_sigmas: Optional[bool] = False, + clip_sample: Optional[bool] = False, + clip_sample_range: float = 1.0, timestep_spacing: str = "linspace", steps_offset: int = 0, ): @@ -114,7 +136,9 @@ def __init__( ) elif beta_schedule == "squaredcos_cap_v2": # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) + self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="cosine") + elif beta_schedule == "exp": + self.betas = betas_for_alpha_bar(num_train_timesteps, alpha_transform_type="exp") else: raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}") @@ -131,10 +155,16 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - if self.state_in_first_order: - pos = -1 + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. for image-to-image) + if len(self._index_counter) == 0: + pos = 1 if len(indices) > 1 else 0 else: - pos = 0 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + pos = self._index_counter[timestep_int] + return indices[pos].item() @property @@ -207,7 +237,7 @@ def set_timesteps( log_sigmas = np.log(sigmas) sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas) - if self.use_karras_sigmas: + if self.config.use_karras_sigmas: sigmas = self._convert_to_karras(in_sigmas=sigmas, num_inference_steps=self.num_inference_steps) timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas]) @@ -228,6 +258,10 @@ def set_timesteps( self.prev_derivative = None self.dt = None + # for exp beta schedules, such as the one for `pipeline_shap_e.py` + # we need an index counter + self._index_counter = defaultdict(int) + # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t def _sigma_to_t(self, sigma, log_sigmas): # get log sigma @@ -292,6 +326,10 @@ def step( """ step_index = self.index_for_timestep(timestep) + # advance index counter by 1 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + self._index_counter[timestep_int] += 1 + if self.state_in_first_order: sigma = self.sigmas[step_index] sigma_next = self.sigmas[step_index + 1] @@ -316,12 +354,17 @@ def step( sample / (sigma_input**2 + 1) ) elif self.config.prediction_type == "sample": - raise NotImplementedError("prediction_type not implemented yet: sample") + pred_original_sample = model_output else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`" ) + if self.config.clip_sample: + pred_original_sample = pred_original_sample.clamp( + -self.config.clip_sample_range, self.config.clip_sample_range + ) + if self.state_in_first_order: # 2. 
Convert to an ODE derivative for 1st order derivative = (sample - pred_original_sample) / sigma_hat diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py index d4a35ab82502..bdf9379b9b90 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_ancestral_discrete.py @@ -13,6 +13,7 @@ # limitations under the License. import math +from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -24,7 +25,11 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -37,19 +42,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) @@ -130,10 +146,16 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - if self.state_in_first_order: - pos = -1 + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + if len(self._index_counter) == 0: + pos = 1 if len(indices) > 1 else 0 else: - pos = 0 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + pos = self._index_counter[timestep_int] + return indices[pos].item() @property @@ -245,6 +267,10 @@ def set_timesteps( self.sample = None + # for exp beta schedules, such as the one for `pipeline_shap_e.py` + # we need an index counter + self._index_counter = defaultdict(int) + def sigma_to_t(self, sigma): # get log sigma log_sigma = sigma.log() @@ -295,6 +321,10 @@ def step( """ step_index = self.index_for_timestep(timestep) + # advance index counter by 1 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + self._index_counter[timestep_int] += 1 + if self.state_in_first_order: sigma = self.sigmas[step_index] sigma_interpol = self.sigmas_interpol[step_index] diff --git a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py index 39079fde10d2..a6a1b4e6640d 100644 --- a/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py +++ b/src/diffusers/schedulers/scheduling_k_dpm_2_discrete.py @@ -13,6 +13,7 @@ # limitations under the License. import math +from collections import defaultdict from typing import List, Optional, Tuple, Union import numpy as np @@ -23,7 +24,11 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor: +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -36,19 +41,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999) -> torch.Tensor num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) @@ -129,10 +145,16 @@ def index_for_timestep(self, timestep, schedule_timesteps=None): indices = (schedule_timesteps == timestep).nonzero() - if self.state_in_first_order: - pos = -1 + # The sigma index that is taken for the **very** first `step` + # is always the second index (or the last index if there is only 1) + # This way we can ensure we don't accidentally skip a sigma in + # case we start in the middle of the denoising schedule (e.g. 
for image-to-image) + if len(self._index_counter) == 0: + pos = 1 if len(indices) > 1 else 0 else: - pos = 0 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + pos = self._index_counter[timestep_int] + return indices[pos].item() @property @@ -234,6 +256,10 @@ def set_timesteps( self.sample = None + # for exp beta schedules, such as the one for `pipeline_shap_e.py` + # we need an index counter + self._index_counter = defaultdict(int) + def sigma_to_t(self, sigma): # get log sigma log_sigma = sigma.log() @@ -283,6 +309,10 @@ def step( """ step_index = self.index_for_timestep(timestep) + # advance index counter by 1 + timestep_int = timestep.cpu().item() if torch.is_tensor(timestep) else timestep + self._index_counter[timestep_int] += 1 + if self.state_in_first_order: sigma = self.sigmas[step_index] sigma_interpol = self.sigmas_interpol[step_index + 1] diff --git a/src/diffusers/schedulers/scheduling_lms_discrete.py b/src/diffusers/schedulers/scheduling_lms_discrete.py index 1256660b843c..d58d4ce45bd1 100644 --- a/src/diffusers/schedulers/scheduling_lms_discrete.py +++ b/src/diffusers/schedulers/scheduling_lms_discrete.py @@ -45,7 +45,11 @@ class LMSDiscreteSchedulerOutput(BaseOutput): # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. @@ -58,19 +62,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): num_diffusion_timesteps (`int`): the number of betas to produce. max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` Returns: betas (`np.ndarray`): the betas used by the scheduler to step the model outputs """ + if alpha_transform_type == "cosine": - def alpha_bar(time_step): - return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2 + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}") betas = [] for i in range(num_diffusion_timesteps): t1 = i / num_diffusion_timesteps t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) return torch.tensor(betas, dtype=torch.float32) diff --git a/src/diffusers/schedulers/scheduling_pndm.py b/src/diffusers/schedulers/scheduling_pndm.py index 70ee1301129c..794eb3674c1b 100644 --- a/src/diffusers/schedulers/scheduling_pndm.py +++ b/src/diffusers/schedulers/scheduling_pndm.py @@ -25,7 +25,11 @@ # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999): +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): """ Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1]. 
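
Taken together with the Heun hunks a bit further up (the `exp` beta schedule, the `sample` prediction type, and the new clipping options), the scheduler can be configured for an exp-style model. A usage sketch, assuming a diffusers build that already includes these hunks; the values are illustrative and not a real model's configuration:

    from diffusers import HeunDiscreteScheduler

    scheduler = HeunDiscreteScheduler(
        num_train_timesteps=1024,
        beta_schedule="exp",       # routes to betas_for_alpha_bar(..., alpha_transform_type="exp")
        prediction_type="sample",  # the model output is used directly as pred_original_sample
        clip_sample=True,
        clip_sample_range=1.0,
    )
    scheduler.set_timesteps(64)
    print(scheduler.timesteps[:4])
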
@@ -38,19 +42,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
         num_diffusion_timesteps (`int`): the number of betas to produce.
         max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities.
+        alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`

     Returns:
         betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
     """
+    if alpha_transform_type == "cosine":

-    def alpha_bar(time_step):
-        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

     betas = []
     for i in range(num_diffusion_timesteps):
         t1 = i / num_diffusion_timesteps
         t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
     return torch.tensor(betas, dtype=torch.float32)

diff --git a/src/diffusers/schedulers/scheduling_repaint.py b/src/diffusers/schedulers/scheduling_repaint.py
index f2f97b38f3d3..41e7450d2df6 100644
--- a/src/diffusers/schedulers/scheduling_repaint.py
+++ b/src/diffusers/schedulers/scheduling_repaint.py
@@ -43,7 +43,11 @@ class RePaintSchedulerOutput(BaseOutput):


 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
-def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1].
@@ -56,19 +60,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
         num_diffusion_timesteps (`int`): the number of betas to produce.
         max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities.
+        alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`

     Returns:
         betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
     """
+    if alpha_transform_type == "cosine":

-    def alpha_bar(time_step):
-        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

     betas = []
     for i in range(num_diffusion_timesteps):
         t1 = i / num_diffusion_timesteps
         t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
     return torch.tensor(betas, dtype=torch.float32)

diff --git a/src/diffusers/schedulers/scheduling_unclip.py b/src/diffusers/schedulers/scheduling_unclip.py
index d7f927658c8a..fd23e48bad00 100644
--- a/src/diffusers/schedulers/scheduling_unclip.py
+++ b/src/diffusers/schedulers/scheduling_unclip.py
@@ -44,7 +44,11 @@ class UnCLIPSchedulerOutput(BaseOutput):


 # Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
-def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+def betas_for_alpha_bar(
+    num_diffusion_timesteps,
+    max_beta=0.999,
+    alpha_transform_type="cosine",
+):
     """
     Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of (1-beta) over time from t = [0,1].
@@ -57,19 +61,30 @@ def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
         num_diffusion_timesteps (`int`): the number of betas to produce.
         max_beta (`float`): the maximum beta to use; use values lower than 1 to prevent singularities.
+        alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
+                     Choose from `cosine` or `exp`

     Returns:
         betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
     """
+    if alpha_transform_type == "cosine":

-    def alpha_bar(time_step):
-        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+        def alpha_bar_fn(t):
+            return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    elif alpha_transform_type == "exp":
+
+        def alpha_bar_fn(t):
+            return math.exp(t * -12.0)
+
+    else:
+        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")

     betas = []
     for i in range(num_diffusion_timesteps):
         t1 = i / num_diffusion_timesteps
         t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+        betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta))
     return torch.tensor(betas, dtype=torch.float32)

diff --git a/src/diffusers/utils/__init__.py b/src/diffusers/utils/__init__.py
index bdb7f020a0aa..7449df99ba80 100644
--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -104,7 +104,7 @@
 )

 from .torch_utils import maybe_allow_in_graph
-from .testing_utils import export_to_video
+from .testing_utils import export_to_gif, export_to_video


 logger = get_logger(__name__)

diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 6d39c0c67d9d..164206d776fa 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -377,6 +377,36 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])


+class ShapEImg2ImgPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class ShapEPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class StableDiffusionAttendAndExcitePipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]

diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
index dcb80169de74..64eb3ac925e9 100644
--- a/src/diffusers/utils/testing_utils.py
+++ b/src/diffusers/utils/testing_utils.py
@@ -300,6 +300,21 @@ def preprocess_image(image: PIL.Image, batch_size: int):
     return 2.0 * image - 1.0


+def export_to_gif(image: List[PIL.Image.Image], output_gif_path: str = None) -> str:
+    if output_gif_path is None:
+        output_gif_path = tempfile.NamedTemporaryFile(suffix=".gif").name
+
+    image[0].save(
+        output_gif_path,
+        save_all=True,
+        append_images=image[1:],
+        optimize=False,
+        duration=100,
+        loop=0,
+    )
+    return output_gif_path
+
+
 def export_to_video(video_frames: List[np.ndarray], output_video_path: str = None) -> str:
     if is_opencv_available():
         import cv2
diff --git a/tests/pipelines/shap_e/__init__.py b/tests/pipelines/shap_e/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/shap_e/test_shap_e.py
b/tests/pipelines/shap_e/test_shap_e.py new file mode 100644 index 000000000000..d095dd9d49b9 --- /dev/null +++ b/tests/pipelines/shap_e/test_shap_e.py @@ -0,0 +1,265 @@ +# Copyright 2023 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import unittest + +import numpy as np +import torch +from transformers import CLIPTextConfig, CLIPTextModelWithProjection, CLIPTokenizer + +from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEPipeline +from diffusers.pipelines.shap_e import ShapERenderer +from diffusers.utils import load_numpy, slow +from diffusers.utils.testing_utils import require_torch_gpu, torch_device + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = ShapEPipeline + params = ["prompt"] + batch_params = ["prompt"] + required_optional_params = [ + "num_images_per_prompt", + "num_inference_steps", + "generator", + "latents", + "guidance_scale", + "frame_size", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def renderer_dim(self): + return 8 + + @property + def dummy_tokenizer(self): + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + return tokenizer + + @property + def dummy_text_encoder(self): + torch.manual_seed(0) + config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=self.text_embedder_hidden_size, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + return CLIPTextModelWithProjection(config) + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + "attention_head_dim": 16, + "embedding_dim": self.time_input_dim, + "num_embeddings": 32, + "embedding_proj_dim": self.text_embedder_hidden_size, + "time_embed_dim": self.time_embed_dim, + "num_layers": 1, + "clip_embed_dim": self.time_input_dim * 2, + "additional_embeddings": 0, + "time_embed_act_fn": "gelu", + "norm_in_type": "layer", + "encoder_hid_proj_type": None, + "added_emb_type": None, + } + + model = PriorTransformer(**model_kwargs) + return model + + @property + def dummy_renderer(self): + torch.manual_seed(0) + + model_kwargs = { + "param_shapes": ( + (self.renderer_dim, 93), + (self.renderer_dim, 8), + (self.renderer_dim, 8), + (self.renderer_dim, 8), + ), + "d_latent": self.time_input_dim, + "d_hidden": self.renderer_dim, + "n_output": 12, + "background": ( + 0.1, + 0.1, + 0.1, + ), + } + model = ShapERenderer(**model_kwargs) + return model + + def get_dummy_components(self): + prior = self.dummy_prior + text_encoder = self.dummy_text_encoder + tokenizer = self.dummy_tokenizer + 
renderer = self.dummy_renderer + + scheduler = HeunDiscreteScheduler( + beta_schedule="exp", + num_train_timesteps=1024, + prediction_type="sample", + use_karras_sigmas=True, + clip_sample=True, + clip_sample_range=1.0, + ) + components = { + "prior": prior, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "renderer": renderer, + "scheduler": scheduler, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "horse", + "generator": generator, + "num_inference_steps": 1, + "frame_size": 32, + "output_type": "np", + } + return inputs + + def test_shap_e(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images[0] + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (20, 32, 32, 3) + + expected_slice = np.array( + [ + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_inference_batch_consistent(self): + # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches + self._test_inference_batch_consistent(batch_sizes=[1, 2]) + + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + + self._test_inference_batch_single_identical( + batch_size=2, + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + ) + + def test_num_images_per_prompt(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + batch_size = 1 + num_images_per_prompt = 2 + + inputs = self.get_dummy_inputs(torch_device) + + for key in inputs.keys(): + if key in self.batch_params: + inputs[key] = batch_size * [inputs[key]] + + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0] + + assert images.shape[0] == batch_size * num_images_per_prompt + + +@slow +@require_torch_gpu +class ShapEPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_shap_e(self): + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/shap_e/test_shap_e_np_out.npy" + ) + pipe = ShapEPipeline.from_pretrained("openai/shap-e") + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=torch_device).manual_seed(0) + + images = pipe( + "a shark", + generator=generator, + guidance_scale=15.0, + num_inference_steps=64, + frame_size=64, + output_type="np", + ).images[0] + + assert images.shape == (20, 64, 64, 3) + + assert_mean_pixel_difference(images, expected_image) diff --git a/tests/pipelines/shap_e/test_shap_e_img2img.py b/tests/pipelines/shap_e/test_shap_e_img2img.py new file mode 100644 index 000000000000..f6638a994fdd --- /dev/null +++ b/tests/pipelines/shap_e/test_shap_e_img2img.py @@ -0,0 +1,281 @@ +# Copyright 2023 HuggingFace Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random +import unittest + +import numpy as np +import torch +from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel + +from diffusers import HeunDiscreteScheduler, PriorTransformer, ShapEImg2ImgPipeline +from diffusers.pipelines.shap_e import ShapERenderer +from diffusers.utils import floats_tensor, load_image, load_numpy, slow +from diffusers.utils.testing_utils import require_torch_gpu, torch_device + +from ..test_pipelines_common import PipelineTesterMixin, assert_mean_pixel_difference + + +class ShapEImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCase): + pipeline_class = ShapEImg2ImgPipeline + params = ["image"] + batch_params = ["image"] + required_optional_params = [ + "num_images_per_prompt", + "num_inference_steps", + "generator", + "latents", + "guidance_scale", + "frame_size", + "output_type", + "return_dict", + ] + test_xformers_attention = False + + @property + def text_embedder_hidden_size(self): + return 32 + + @property + def time_input_dim(self): + return 32 + + @property + def time_embed_dim(self): + return self.time_input_dim * 4 + + @property + def renderer_dim(self): + return 8 + + @property + def dummy_image_encoder(self): + torch.manual_seed(0) + config = CLIPVisionConfig( + hidden_size=self.text_embedder_hidden_size, + image_size=64, + projection_dim=self.text_embedder_hidden_size, + intermediate_size=37, + num_attention_heads=4, + num_channels=3, + num_hidden_layers=5, + patch_size=1, + ) + + model = CLIPVisionModel(config) + return model + + @property + def dummy_image_processor(self): + image_processor = CLIPImageProcessor( + crop_size=224, + do_center_crop=True, + do_normalize=True, + do_resize=True, + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + resample=3, + size=224, + ) + + return image_processor + + @property + def dummy_prior(self): + torch.manual_seed(0) + + model_kwargs = { + "num_attention_heads": 2, + "attention_head_dim": 16, + "embedding_dim": self.time_input_dim, + "num_embeddings": 32, + "embedding_proj_dim": self.text_embedder_hidden_size, + "time_embed_dim": self.time_embed_dim, + "num_layers": 1, + "clip_embed_dim": self.time_input_dim * 2, + "additional_embeddings": 0, + "time_embed_act_fn": "gelu", + "norm_in_type": "layer", + "embedding_proj_norm_type": "layer", + "encoder_hid_proj_type": None, + "added_emb_type": None, + } + + model = PriorTransformer(**model_kwargs) + return model + + @property + def dummy_renderer(self): + torch.manual_seed(0) + + model_kwargs = { + "param_shapes": ( + (self.renderer_dim, 93), + (self.renderer_dim, 8), + (self.renderer_dim, 8), + (self.renderer_dim, 8), + ), + "d_latent": self.time_input_dim, + "d_hidden": self.renderer_dim, + "n_output": 12, + "background": ( + 0.1, + 0.1, + 0.1, + ), + } + model = ShapERenderer(**model_kwargs) + return model + + def get_dummy_components(self): + prior = self.dummy_prior + image_encoder = self.dummy_image_encoder + 
image_processor = self.dummy_image_processor + renderer = self.dummy_renderer + + scheduler = HeunDiscreteScheduler( + beta_schedule="exp", + num_train_timesteps=1024, + prediction_type="sample", + use_karras_sigmas=True, + clip_sample=True, + clip_sample_range=1.0, + ) + components = { + "prior": prior, + "image_encoder": image_encoder, + "image_processor": image_processor, + "renderer": renderer, + "scheduler": scheduler, + } + + return components + + def get_dummy_inputs(self, device, seed=0): + input_image = floats_tensor((1, 3, 64, 64), rng=random.Random(seed)).to(device) + + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "image": input_image, + "generator": generator, + "num_inference_steps": 1, + "frame_size": 32, + "output_type": "np", + } + return inputs + + def test_shap_e(self): + device = "cpu" + + components = self.get_dummy_components() + + pipe = self.pipeline_class(**components) + pipe = pipe.to(device) + + pipe.set_progress_bar_config(disable=None) + + output = pipe(**self.get_dummy_inputs(device)) + image = output.images[0] + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (20, 32, 32, 3) + + expected_slice = np.array( + [ + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + 0.00039216, + ] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + def test_inference_batch_consistent(self): + # NOTE: Larger batch sizes cause this test to timeout, only test on smaller batches + self._test_inference_batch_consistent(batch_sizes=[1, 2]) + + def test_inference_batch_single_identical(self): + test_max_difference = torch_device == "cpu" + relax_max_difference = True + self._test_inference_batch_single_identical( + batch_size=2, + test_max_difference=test_max_difference, + relax_max_difference=relax_max_difference, + ) + + def test_num_images_per_prompt(self): + components = self.get_dummy_components() + pipe = self.pipeline_class(**components) + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + batch_size = 1 + num_images_per_prompt = 2 + + inputs = self.get_dummy_inputs(torch_device) + + for key in inputs.keys(): + if key in self.batch_params: + inputs[key] = batch_size * [inputs[key]] + + images = pipe(**inputs, num_images_per_prompt=num_images_per_prompt)[0] + + assert images.shape[0] == batch_size * num_images_per_prompt + + +@slow +@require_torch_gpu +class ShapEImg2ImgPipelineIntegrationTests(unittest.TestCase): + def tearDown(self): + # clean up the VRAM after each test + super().tearDown() + gc.collect() + torch.cuda.empty_cache() + + def test_shap_e_img2img(self): + input_image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/shap_e/corgi.png" + ) + expected_image = load_numpy( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/shap_e/test_shap_e_img2img_out.npy" + ) + pipe = ShapEImg2ImgPipeline.from_pretrained("openai/shap-e-img2img") + pipe = pipe.to(torch_device) + pipe.set_progress_bar_config(disable=None) + + generator = torch.Generator(device=torch_device).manual_seed(0) + + images = pipe( + input_image, + generator=generator, + guidance_scale=3.0, + num_inference_steps=64, + frame_size=64, + output_type="np", + ).images[0] + + assert images.shape == (20, 64, 64, 3) + + assert_mean_pixel_difference(images, expected_image) diff --git 
a/tests/schedulers/test_scheduler_heun.py b/tests/schedulers/test_scheduler_heun.py
index 2fd50425938f..ae0fe26b11ba 100644
--- a/tests/schedulers/test_scheduler_heun.py
+++ b/tests/schedulers/test_scheduler_heun.py
@@ -30,11 +30,15 @@ def test_betas(self):
             self.check_over_configs(beta_start=beta_start, beta_end=beta_end)

     def test_schedules(self):
-        for schedule in ["linear", "scaled_linear"]:
+        for schedule in ["linear", "scaled_linear", "exp"]:
             self.check_over_configs(beta_schedule=schedule)

+    def test_clip_sample(self):
+        for clip_sample_range in [1.0, 2.0, 3.0]:
+            self.check_over_configs(clip_sample_range=clip_sample_range, clip_sample=True)
+
     def test_prediction_type(self):
-        for prediction_type in ["epsilon", "v_prediction"]:
+        for prediction_type in ["epsilon", "v_prediction", "sample"]:
             self.check_over_configs(prediction_type=prediction_type)

     def test_full_loop_no_noise(self):

From 8bf80fc8d8aade3bd3fca5054d05b65488fbbf8f Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Thu, 6 Jul 2023 17:51:40 +0200
Subject: [PATCH 186/199] disable num attention heads (#3969)

* disable num attention heads

* finish
---
 src/diffusers/models/unet_2d_condition.py                      | 5 +++++
 src/diffusers/models/unet_2d_condition_flax.py                 | 5 +++++
 src/diffusers/models/unet_3d_condition.py                      | 5 +++++
 .../pipelines/versatile_diffusion/modeling_text_unet.py        | 8 ++++++++
 4 files changed, 23 insertions(+)

diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py
index 1f1d69c6042e..dee71bead0f9 100644
--- a/src/diffusers/models/unet_2d_condition.py
+++ b/src/diffusers/models/unet_2d_condition.py
@@ -211,6 +211,11 @@ def __init__(

         self.sample_size = sample_size

+        if num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
         # If `num_attention_heads` is not defined (which is the case for most models)
         # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
         # The reason for this behavior is to correct for incorrectly named variables that were introduced
diff --git a/src/diffusers/models/unet_2d_condition_flax.py b/src/diffusers/models/unet_2d_condition_flax.py
index 352b0b1b5e10..de39bc75d2e3 100644
--- a/src/diffusers/models/unet_2d_condition_flax.py
+++ b/src/diffusers/models/unet_2d_condition_flax.py
@@ -133,6 +133,11 @@ def setup(self):
         block_out_channels = self.block_out_channels
         time_embed_dim = block_out_channels[0] * 4

+        if self.num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
         # If `num_attention_heads` is not defined (which is the case for most models)
         # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
# The reason for this behavior is to correct for incorrectly named variables that were introduced diff --git a/src/diffusers/models/unet_3d_condition.py b/src/diffusers/models/unet_3d_condition.py index ee4d0d7cab98..ff2a8f1179ef 100644 --- a/src/diffusers/models/unet_3d_condition.py +++ b/src/diffusers/models/unet_3d_condition.py @@ -114,6 +114,11 @@ def __init__( self.sample_size = sample_size + if num_attention_heads is not None: + raise NotImplementedError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19." + ) + # If `num_attention_heads` is not defined (which is the case for most models) # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. # The reason for this behavior is to correct for incorrectly named variables that were introduced diff --git a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py index 79e28a42f4c6..0a2fad6aee1a 100644 --- a/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py +++ b/src/diffusers/pipelines/versatile_diffusion/modeling_text_unet.py @@ -307,6 +307,14 @@ def __init__( self.sample_size = sample_size + if num_attention_heads is not None: + raise ValueError( + "At the moment it is not possible to define the number of attention heads via `num_attention_heads`" + " because of a naming issue as described in" + " https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing" + " `num_attention_heads` will only be supported in diffusers v0.19." + ) + # If `num_attention_heads` is not defined (which is the case for most models) # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. 
# The reason for this behavior is to correct for incorrectly named variables that were introduced From 187ea539aed54872675cd27f6a58b27084a173ab Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 18:11:20 +0200 Subject: [PATCH 187/199] Improve SD XL (#3968) * improve sd xl * correct more * finish * make style * fix more --- .github/workflows/push_tests.yml | 1 + .github/workflows/push_tests_fast.yml | 2 +- docs/source/en/_toctree.yml | 2 + docs/source/en/api/loaders.mdx | 4 +- .../pipelines/stable_diffusion/img2img.mdx | 2 +- .../pipelines/stable_diffusion/text2img.mdx | 2 +- .../en/using-diffusers/other-formats.mdx | 2 +- .../en/using-diffusers/using_safetensors.mdx | 4 +- examples/community/lpw_stable_diffusion.py | 4 +- src/diffusers/loaders.py | 16 ++-- .../alt_diffusion/pipeline_alt_diffusion.py | 2 +- .../pipeline_alt_diffusion_img2img.py | 8 +- .../stable_diffusion/convert_from_ckpt.py | 87 +++++++++++++------ .../pipeline_stable_diffusion.py | 6 +- .../pipeline_stable_diffusion_img2img.py | 8 +- ...ipeline_stable_diffusion_inpaint_legacy.py | 6 +- .../pipeline_stable_diffusion_ldm3d.py | 8 +- .../pipeline_stable_diffusion_paradigms.py | 8 +- .../pipeline_stable_diffusion_xl.py | 13 +-- .../pipeline_stable_diffusion_xl_img2img.py | 18 ++-- .../stable_diffusion/test_stable_diffusion.py | 6 +- 21 files changed, 133 insertions(+), 76 deletions(-) diff --git a/.github/workflows/push_tests.yml b/.github/workflows/push_tests.yml index 567cd5f5b0d4..5ec8dbdc4026 100644 --- a/.github/workflows/push_tests.yml +++ b/.github/workflows/push_tests.yml @@ -61,6 +61,7 @@ jobs: - name: Install dependencies run: | + apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] - name: Environment diff --git a/.github/workflows/push_tests_fast.yml b/.github/workflows/push_tests_fast.yml index adf4fc8a87bc..acd59ef80dc7 100644 --- a/.github/workflows/push_tests_fast.yml +++ b/.github/workflows/push_tests_fast.yml @@ -60,7 +60,7 @@ jobs: - name: Install dependencies run: | - apt-get update && apt-get install libsndfile1-dev -y + apt-get update && apt-get install libsndfile1-dev libgl1 -y python -m pip install -e .[quality,test] - name: Environment diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 470d8c5c189d..ad1c7c0aabc2 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -247,6 +247,8 @@ title: Safe Stable Diffusion - local: api/pipelines/stable_diffusion/stable_diffusion_2 title: Stable Diffusion 2 + - local: api/pipelines/stable_diffusion/stable_diffusion_xl + title: Stable Diffusion XL - local: api/pipelines/stable_diffusion/latent_upscale title: Stable-Diffusion-Latent-Upscaler - local: api/pipelines/stable_diffusion/upscale diff --git a/docs/source/en/api/loaders.mdx b/docs/source/en/api/loaders.mdx index a236a6c70b6c..57891d23dec7 100644 --- a/docs/source/en/api/loaders.mdx +++ b/docs/source/en/api/loaders.mdx @@ -32,6 +32,6 @@ Adapters (textual inversion, LoRA, hypernetworks) allow you to modify a diffusio [[autodoc]] loaders.LoraLoaderMixin -## FromCkptMixin +## FromSingleFileMixin -[[autodoc]] loaders.FromCkptMixin +[[autodoc]] loaders.FromSingleFileMixin diff --git a/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx b/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx index 7959c588608b..c70f9ac9dcb7 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/img2img.mdx @@ -31,7 +31,7 @@ proposed by 
Chenlin Meng, Yutong He, Yang Song, Jiaming Song, Jiajun Wu, Jun-Yan - enable_xformers_memory_efficient_attention - disable_xformers_memory_efficient_attention - load_textual_inversion - - from_ckpt + - from_single_file - load_lora_weights - save_lora_weights diff --git a/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx b/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx index ce78434fdbaa..0e3f51117555 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/text2img.mdx @@ -40,7 +40,7 @@ Available Checkpoints are: - enable_vae_tiling - disable_vae_tiling - load_textual_inversion - - from_ckpt + - from_single_file - load_lora_weights - save_lora_weights diff --git a/docs/source/en/using-diffusers/other-formats.mdx b/docs/source/en/using-diffusers/other-formats.mdx index 2aeb9f3ae204..b58d00fce180 100644 --- a/docs/source/en/using-diffusers/other-formats.mdx +++ b/docs/source/en/using-diffusers/other-formats.mdx @@ -26,7 +26,7 @@ This guide will show you how to convert other Stable Diffusion formats to be com ## PyTorch .ckpt -The checkpoint - or `.ckpt` - format is commonly used to store and save models. The `.ckpt` file contains the entire model and is typically several GBs in size. While you can load and use a `.ckpt` file directly with the [`~StableDiffusionPipeline.from_ckpt`] method, it is generally better to convert the `.ckpt` file to 🤗 Diffusers so both formats are available. +The checkpoint - or `.ckpt` - format is commonly used to store and save models. The `.ckpt` file contains the entire model and is typically several GBs in size. While you can load and use a `.ckpt` file directly with the [`~StableDiffusionPipeline.from_single_file`] method, it is generally better to convert the `.ckpt` file to 🤗 Diffusers so both formats are available. There are two options for converting a `.ckpt` file; use a Space to convert the checkpoint or convert the `.ckpt` file with a script. diff --git a/docs/source/en/using-diffusers/using_safetensors.mdx b/docs/source/en/using-diffusers/using_safetensors.mdx index c312ab597075..a7bc0a7c9c1c 100644 --- a/docs/source/en/using-diffusers/using_safetensors.mdx +++ b/docs/source/en/using-diffusers/using_safetensors.mdx @@ -21,12 +21,12 @@ from diffusers import DiffusionPipeline pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", use_safetensors=True) ``` -However, model weights are not necessarily stored in separate subfolders like in the example above. Sometimes, all the weights are stored in a single `.safetensors` file. In this case, if the weights are Stable Diffusion weights, you can load the file directly with the [`~diffusers.loaders.FromCkptMixin.from_ckpt`] method: +However, model weights are not necessarily stored in separate subfolders like in the example above. Sometimes, all the weights are stored in a single `.safetensors` file. 
In this case, if the weights are Stable Diffusion weights, you can load the file directly with the [`~diffusers.loaders.FromSingleFileMixin.from_single_file`] method: ```py from diffusers import StableDiffusionPipeline -pipeline = StableDiffusionPipeline.from_ckpt( +pipeline = StableDiffusionPipeline.from_single_file( "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors" ) ``` diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py index 56fb903c7106..2970aae4b169 100644 --- a/examples/community/lpw_stable_diffusion.py +++ b/examples/community/lpw_stable_diffusion.py @@ -11,7 +11,7 @@ from diffusers import DiffusionPipeline from diffusers.configuration_utils import FrozenDict from diffusers.image_processor import VaeImageProcessor -from diffusers.loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from diffusers.models import AutoencoderKL, UNet2DConditionModel from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker from diffusers.schedulers import KarrasDiffusionSchedulers @@ -410,7 +410,7 @@ def preprocess_mask(mask, batch_size, scale_factor=8): class StableDiffusionLongPromptWeightingPipeline( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-to-image generation using Stable Diffusion without tokens length limit, and support parsing diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index 1bdd33fa80cb..a0be20c54361 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -1276,13 +1276,19 @@ def _convert_kohya_lora_to_diffusers(self, state_dict): return new_state_dict, network_alpha -class FromCkptMixin: +class FromSingleFileMixin: """ Load model weights saved in the `.ckpt` format into a [`DiffusionPipeline`]. """ @classmethod - def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): + def from_ckpt(cls, *args, **kwargs): + deprecation_message = "The function `from_ckpt` is deprecated in favor of `from_single_file` and will be removed in diffusers v.0.21. Please make sure to use `StableDiffusionPipeline.from_single_file(...)` instead." + deprecate("from_ckpt", "0.21.0", deprecation_message, standard_warn=False) + return cls.from_single_file(*args, **kwargs) + + @classmethod + def from_single_file(cls, pretrained_model_link_or_path, **kwargs): r""" Instantiate a [`DiffusionPipeline`] from pretrained pipeline weights saved in the `.ckpt` format. The pipeline is set in evaluation mode (`model.eval()`) by default. @@ -1361,16 +1367,16 @@ def from_ckpt(cls, pretrained_model_link_or_path, **kwargs): >>> from diffusers import StableDiffusionPipeline >>> # Download pipeline from huggingface.co and cache. - >>> pipeline = StableDiffusionPipeline.from_ckpt( + >>> pipeline = StableDiffusionPipeline.from_single_file( ... "https://huggingface.co/WarriorMama777/OrangeMixs/blob/main/Models/AbyssOrangeMix/AbyssOrangeMix.safetensors" ... 
) >>> # Download pipeline from local file >>> # file is downloaded under ./v1-5-pruned-emaonly.ckpt - >>> pipeline = StableDiffusionPipeline.from_ckpt("./v1-5-pruned-emaonly") + >>> pipeline = StableDiffusionPipeline.from_single_file("./v1-5-pruned-emaonly") >>> # Enable float16 and move to GPU - >>> pipeline = StableDiffusionPipeline.from_ckpt( + >>> pipeline = StableDiffusionPipeline.from_single_file( ... "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt", ... torch_dtype=torch.float16, ... ) diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py index b79e4f72144b..5a4746d24e06 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion.py @@ -77,7 +77,7 @@ class AltDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraL In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] diff --git a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py index 5903f97aca36..21c1f0591a44 100644 --- a/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py +++ b/src/diffusers/pipelines/alt_diffusion/pipeline_alt_diffusion_img2img.py @@ -26,7 +26,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import PIL_INTERPOLATION, deprecate, logging, randn_tensor, replace_example_docstring @@ -95,7 +95,9 @@ def preprocess(image): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline with Stable->Alt, CLIPTextModel->RobertaSeriesModelWithTransformation, CLIPTokenizer->XLMRobertaTokenizer, AltDiffusionSafetyChecker->StableDiffusionSafetyChecker -class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin): +class AltDiffusionImg2ImgPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): r""" Pipeline for text-guided image to image generation using Alt Diffusion. 
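Every pipeline that previously exposed `from_ckpt` keeps it as a thin deprecated alias around the new name. A rough usage sketch — the local checkpoint path below is a placeholder, not a file shipped with this patch:

```py
import torch

from diffusers import StableDiffusionPipeline

# new entry point: load a whole pipeline from a single .ckpt/.safetensors file
pipe = StableDiffusionPipeline.from_single_file(
    "./v1-5-pruned-emaonly.ckpt",  # placeholder path
    torch_dtype=torch.float16,
)

# the old name still works until diffusers v0.21, but it emits a deprecation
# warning and simply forwards to from_single_file
pipe = StableDiffusionPipeline.from_ckpt("./v1-5-pruned-emaonly.ckpt", torch_dtype=torch.float16)
```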
@@ -105,7 +107,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin
     In addition the pipeline inherits the following loading methods:
         - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`]
         - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`]
-        - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`]
+        - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`]

     as well as the following saving methods:
         - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`]
diff --git a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
index 99cfcb806795..ef2333a18db2 100644
--- a/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
+++ b/src/diffusers/pipelines/stable_diffusion/convert_from_ckpt.py
@@ -233,7 +233,7 @@ def create_unet_diffusers_config(original_config, image_size: int, controlnet=Fa
     if controlnet:
         unet_params = original_config.model.params.control_stage_config.params
     else:
-        if original_config.model.params.unet_config is not None:
+        if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None:
             unet_params = original_config.model.params.unet_config.params
         else:
             unet_params = original_config.model.params.network_config.params
@@ -1139,7 +1139,7 @@ def download_from_original_stable_diffusion_ckpt(
     return: A StableDiffusionPipeline object representing the passed-in `.ckpt`/`.safetensors` file.
     """
-    # import pipelines here to avoid circular import error when using from_ckpt method
+    # import pipelines here to avoid circular import error when using from_single_file method
     from diffusers import (
         LDMTextToImagePipeline,
         PaintByExamplePipeline,
@@ -1192,23 +1192,45 @@ def download_from_original_stable_diffusion_ckpt(
         checkpoint = checkpoint["state_dict"]

     if original_config_file is None:
-        key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
+        key_name_v2_1 = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
+        key_name_sd_xl_base = "conditioner.embedders.1.model.transformer.resblocks.9.mlp.c_proj.bias"
+        key_name_sd_xl_refiner = "conditioner.embedders.0.model.transformer.resblocks.9.mlp.c_proj.bias"

         # model_type = "v1"
         config_url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml"

-        if key_name in checkpoint and checkpoint[key_name].shape[-1] == 1024:
+        if key_name_v2_1 in checkpoint and checkpoint[key_name_v2_1].shape[-1] == 1024:
             # model_type = "v2"
             config_url = "https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"

             if global_step == 110000:
                 # v2.1 needs to upcast attention
                 upcast_attention = True
+        elif key_name_sd_xl_base in checkpoint:
+            # only base xl has two text embedders
+            config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_base.yaml"
+        elif key_name_sd_xl_refiner in checkpoint:
+            # only refiner xl has an embedder and one text embedder
+            config_url = "https://raw.githubusercontent.com/Stability-AI/generative-models/main/configs/inference/sd_xl_refiner.yaml"

         original_config_file = BytesIO(requests.get(config_url).content)

     original_config = OmegaConf.load(original_config_file)

+    # Convert the text model.
+ if ( + model_type is None + and "cond_stage_config" in original_config.model.params + and original_config.model.params.cond_stage_config is not None + ): + model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] + logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") + elif model_type is None and original_config.model.params.network_config is not None: + if original_config.model.params.network_config.params.context_dim == 2048: + model_type = "SDXL" + else: + model_type = "SDXL-Refiner" + if num_in_channels is not None: original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels @@ -1238,20 +1260,39 @@ def download_from_original_stable_diffusion_ckpt( checkpoint, original_config, checkpoint_path, image_size, upcast_attention, extract_ema ) - num_train_timesteps = original_config.model.params.timesteps or 1000 - beta_start = original_config.model.params.linear_start or 0.02 - beta_end = original_config.model.params.linear_end or 0.085 - - scheduler = DDIMScheduler( - beta_end=beta_end, - beta_schedule="scaled_linear", - beta_start=beta_start, - num_train_timesteps=num_train_timesteps, - steps_offset=1, - clip_sample=False, - set_alpha_to_one=False, - prediction_type=prediction_type, - ) + num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000 + + if model_type in ["SDXL", "SDXL-Refiner"]: + image_size = 1024 + scheduler_dict = { + "beta_schedule": "scaled_linear", + "beta_start": 0.00085, + "beta_end": 0.012, + "interpolation_type": "linear", + "num_train_timesteps": num_train_timesteps, + "prediction_type": "epsilon", + "sample_max_value": 1.0, + "set_alpha_to_one": False, + "skip_prk_steps": True, + "steps_offset": 1, + "timestep_spacing": "leading", + } + scheduler = EulerDiscreteScheduler.from_config(scheduler_dict) + scheduler_type = "euler" + vae_path = "stabilityai/sdxl-vae" + else: + beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02 + beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085 + scheduler = DDIMScheduler( + beta_end=beta_end, + beta_schedule="scaled_linear", + beta_start=beta_start, + num_train_timesteps=num_train_timesteps, + steps_offset=1, + clip_sample=False, + set_alpha_to_one=False, + prediction_type=prediction_type, + ) # make sure scheduler works correctly with DDIM scheduler.register_to_config(clip_sample=False) @@ -1294,16 +1335,6 @@ def download_from_original_stable_diffusion_ckpt( else: vae = AutoencoderKL.from_pretrained(vae_path) - # Convert the text model. 
- if model_type is None and original_config.model.params.cond_stage_config is not None: - model_type = original_config.model.params.cond_stage_config.target.split(".")[-1] - logger.debug(f"no `model_type` given, `model_type` inferred as: {model_type}") - elif model_type is None and original_config.model.params.network_config is not None: - if original_config.model.params.network_config.params.context_dim == 2048: - model_type = "SDXL" - else: - model_type = "SDXL-Refiner" - if model_type == "FrozenOpenCLIPEmbedder": text_model = convert_open_clip_checkpoint(checkpoint) tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer") diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 8368668ebea7..9ad4d404fdbe 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -22,7 +22,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -69,7 +69,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): +class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. @@ -79,7 +79,7 @@ class StableDiffusionPipeline(DiffusionPipeline, TextualInversionLoaderMixin, Lo In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py index e9e91b646ed5..f8874ba2cfae 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -24,7 +24,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -98,7 +98,9 @@ def preprocess(image): return image -class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): +class StableDiffusionImg2ImgPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): r""" Pipeline for text-guided image to image generation using Stable Diffusion. 
@@ -108,7 +110,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline, TextualInversionLoaderMi In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py index 55d571ab0998..483f27ae3966 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint_legacy.py @@ -24,7 +24,7 @@ from ...configuration_utils import FrozenDict from ...image_processor import VaeImageProcessor -from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -85,7 +85,7 @@ def preprocess_mask(mask, batch_size, scale_factor=8): class StableDiffusionInpaintPipelineLegacy( - DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin ): r""" Pipeline for text-guided image inpainting using Stable Diffusion. *This is an experimental feature*. @@ -96,7 +96,7 @@ class StableDiffusionInpaintPipelineLegacy( In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py index 2df9c46f0be3..85f628ca8229 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py @@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessorLDM3D -from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -74,7 +74,9 @@ class LDM3DPipelineOutput(BaseOutput): nsfw_content_detected: Optional[List[bool]] -class StableDiffusionLDM3DPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): +class StableDiffusionLDM3DPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): r""" Pipeline for text-to-image and 3d generation using LDM3D. 
LDM3D: Latent Diffusion Model for 3D: https://arxiv.org/abs/2305.10853 @@ -85,7 +87,7 @@ class StableDiffusionLDM3DPipeline(DiffusionPipeline, TextualInversionLoaderMixi In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] diff --git a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py index 33549ebb0edb..2239e3853a8e 100644 --- a/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py +++ b/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_paradigms.py @@ -19,7 +19,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import FromCkptMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...schedulers import KarrasDiffusionSchedulers from ...utils import ( @@ -59,7 +59,9 @@ """ -class StableDiffusionParadigmsPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromCkptMixin): +class StableDiffusionParadigmsPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): r""" Parallelized version of StableDiffusionPipeline, based on the paper https://arxiv.org/abs/2305.16317 This pipeline parallelizes the denoising steps to generate a single image faster (more akin to model parallelism). @@ -72,7 +74,7 @@ class StableDiffusionParadigmsPipeline(DiffusionPipeline, TextualInversionLoader In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index c50381c2eb23..142aac94b99d 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -19,7 +19,7 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -73,7 +73,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class StableDiffusionXLPipeline(DiffusionPipeline): +class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. 
@@ -83,7 +83,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline): In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] @@ -541,9 +541,9 @@ def __call__( callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, - original_size: Tuple[int, int] = (1024, 1024), + original_size: Optional[Tuple[int, int]] = None, crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Tuple[int, int] = (1024, 1024), + target_size: Optional[Tuple[int, int]] = None, ): r""" Function invoked when calling the pipeline for generation. @@ -629,6 +629,9 @@ def __call__( height = height or self.unet.config.sample_size * self.vae_scale_factor width = width or self.unet.config.sample_size * self.vae_scale_factor + original_size = original_size or (height, width) + target_size = target_size or (height, width) + # 1. Check inputs. Raise error if not correct self.check_inputs( prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds diff --git a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 329a626ada2e..f699e2331027 100644 --- a/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -21,7 +21,7 @@ from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer from ...image_processor import VaeImageProcessor -from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin +from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin from ...models import AutoencoderKL, UNet2DConditionModel from ...models.attention_processor import ( AttnProcessor2_0, @@ -78,7 +78,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): return noise_cfg -class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline): +class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin): r""" Pipeline for text-to-image generation using Stable Diffusion. 
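The img2img variant applies the same fallback, but derives the output resolution from the latents and (as fixed in the hunks below) computes the VAE scale factor from the config rather than hard-coding it to 8. A condensed sketch of the combined logic — the attribute math follows the diff, while the standalone helper and the sample values are illustrative only:

```py
from typing import Optional, Tuple

import torch


def default_size_conditioning(
    latents: torch.Tensor,
    vae_block_out_channels: Tuple[int, ...],
    original_size: Optional[Tuple[int, int]] = None,
    target_size: Optional[Tuple[int, int]] = None,
):
    # the scale factor now comes from the VAE config instead of the hard-coded 8
    vae_scale_factor = 2 ** (len(vae_block_out_channels) - 1)

    # latents are (batch, channels, height, width) in latent space
    height, width = latents.shape[-2:]
    height = height * vae_scale_factor
    width = width * vae_scale_factor

    # fall back to the actual output resolution instead of a fixed (1024, 1024)
    original_size = original_size or (height, width)
    target_size = target_size or (height, width)
    return original_size, target_size


# 128x128 latents with a 4-level VAE (scale factor 8) condition on (1024, 1024)
print(default_size_conditioning(torch.zeros(1, 4, 128, 128), (128, 256, 512, 512)))
```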
@@ -88,7 +88,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline): In addition the pipeline inherits the following loading methods: - *Textual-Inversion*: [`loaders.TextualInversionLoaderMixin.load_textual_inversion`] - *LoRA*: [`loaders.LoraLoaderMixin.load_lora_weights`] - - *Ckpt*: [`loaders.FromCkptMixin.from_ckpt`] + - *Ckpt*: [`loaders.FromSingleFileMixin.from_single_file`] as well as the following saving methods: - *LoRA*: [`loaders.LoraLoaderMixin.save_lora_weights`] @@ -136,7 +136,6 @@ def __init__( self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt) self.register_to_config(requires_aesthetics_score=requires_aesthetics_score) self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) - self.vae_scale_factor = 8 self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) self.watermark = StableDiffusionXLWatermarker() @@ -631,9 +630,9 @@ def __call__( callback_steps: int = 1, cross_attention_kwargs: Optional[Dict[str, Any]] = None, guidance_rescale: float = 0.0, - original_size: Tuple[int, int] = (1024, 1024), + original_size: Tuple[int, int] = None, crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Tuple[int, int] = (1024, 1024), + target_size: Tuple[int, int] = None, aesthetic_score: float = 6.0, negative_aesthetic_score: float = 2.5, ): @@ -778,6 +777,13 @@ def __call__( # 7. Prepare extra step kwargs. extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + height, width = latents.shape[-2:] + height = height * self.vae_scale_factor + width = width * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = target_size or (height, width) + # 8. Prepare added time ids & embeddings add_text_embeds = pooled_prompt_embeds add_time_ids, add_neg_time_ids = self._get_add_time_ids( diff --git a/tests/pipelines/stable_diffusion/test_stable_diffusion.py b/tests/pipelines/stable_diffusion/test_stable_diffusion.py index 7daf3fcda4a2..a10462a345c1 100644 --- a/tests/pipelines/stable_diffusion/test_stable_diffusion.py +++ b/tests/pipelines/stable_diffusion/test_stable_diffusion.py @@ -1029,7 +1029,7 @@ def test_download_from_hub(self): ] for ckpt_path in ckpt_paths: - pipe = StableDiffusionPipeline.from_ckpt(ckpt_path, torch_dtype=torch.float16) + pipe = StableDiffusionPipeline.from_single_file(ckpt_path, torch_dtype=torch.float16) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.to("cuda") @@ -1040,7 +1040,7 @@ def test_download_from_hub(self): def test_download_local(self): filename = hf_hub_download("runwayml/stable-diffusion-v1-5", filename="v1-5-pruned-emaonly.ckpt") - pipe = StableDiffusionPipeline.from_ckpt(filename, torch_dtype=torch.float16) + pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.to("cuda") @@ -1051,7 +1051,7 @@ def test_download_local(self): def test_download_ckpt_diff_format_is_same(self): ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.ckpt" - pipe = StableDiffusionPipeline.from_ckpt(ckpt_path) + pipe = StableDiffusionPipeline.from_single_file(ckpt_path) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) pipe.unet.set_attn_processor(AttnProcessor()) pipe.to("cuda") From b8f089c5a383374246d657b8894c2ac830149e50 Mon Sep 17 00:00:00 2001 From: Aisuko Date: Fri, 7 Jul 2023 02:29:04 +1000 Subject: [PATCH 188/199] fix/doc-code: import torch and fix 
the broken document address (#3941) Signed-off-by: GitHub --- docs/source/en/using-diffusers/loading.mdx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/en/using-diffusers/loading.mdx b/docs/source/en/using-diffusers/loading.mdx index 8ebd3569e4b0..79c8b278468d 100644 --- a/docs/source/en/using-diffusers/loading.mdx +++ b/docs/source/en/using-diffusers/loading.mdx @@ -174,7 +174,7 @@ A checkpoint variant is usually a checkpoint where it's weights are:
-Otherwise, a variant is **identical** to the original checkpoint. They have exactly the same serialization format (like [Safetensors](./using-diffusers/using_safetensors)), model structure, and weights have identical tensor shapes. +Otherwise, a variant is **identical** to the original checkpoint. They have exactly the same serialization format (like [Safetensors](./using_safetensors)), model structure, and weights have identical tensor shapes. | **checkpoint type** | **weight name** | **argument for loading weights** | |---------------------|-------------------------------------|----------------------------------| @@ -190,6 +190,7 @@ There are two important arguments to know for loading variants: ```python from diffusers import DiffusionPipeline +import torch # load fp16 variant stable_diffusion = DiffusionPipeline.from_pretrained( From 38e563d0c77a22dda8ebf27bcf89549702995175 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 19:21:03 +0200 Subject: [PATCH 189/199] Fix SD XL Docs (#3971) * finish sd xl docs * make style * Apply suggestions from code review * uP * uP * Correct --- .github/workflows/build_documentation.yml | 18 +-- .github/workflows/build_pr_documentation.yml | 19 +-- .../stable_diffusion/stable_diffusion_xl.mdx | 124 +++++++++++++++++- src/diffusers/utils/import_utils.py | 2 +- 4 files changed, 133 insertions(+), 30 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 79d2cdec0672..8fdae99883f8 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -11,17 +11,13 @@ on: jobs: build: steps: - - name: Install dependencies - run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y - - - name: Build doc - uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main - with: - commit_sha: ${{ github.sha }} - package: diffusers - notebook_folder: diffusers_doc - languages: en ko zh + uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main + with: + commit_sha: ${{ github.sha }} + install_libgl1: true + package: diffusers + notebook_folder: diffusers_doc + languages: en ko zh secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 248644b7e9cd..18b606ca754c 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -9,15 +9,10 @@ concurrency: jobs: build: - steps: - - name: Install dependencies - run: | - apt-get update && apt-get install libsndfile1-dev libgl1 -y - - - name: Build doc - uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main - with: - commit_sha: ${{ github.event.pull_request.head.sha }} - pr_number: ${{ github.event.number }} - package: diffusers - languages: en ko zh + uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main + with: + commit_sha: ${{ github.event.pull_request.head.sha }} + pr_number: ${{ github.event.number }} + install_libgl1: true + package: diffusers + languages: en ko zh diff --git a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx index b87d51af233b..64abb9eef8c8 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx +++ b/docs/source/en/api/pipelines/stable_diffusion/stable_diffusion_xl.mdx @@ -12,22 +12,134 @@ specific 
language governing permissions and limitations under the License.
 # Stable diffusion XL
-Stable Diffusion 2 is a text-to-image _latent diffusion_ model built upon the work of [Stable Diffusion 1](https://stability.ai/blog/stable-diffusion-public-release).
-The project to train Stable Diffusion 2 was led by Robin Rombach and Katherine Crowson from [Stability AI](https://stability.ai/) and [LAION](https://laion.ai/).
+Stable Diffusion XL was proposed in [SDXL: Improving Latent Diffusion Models for High-Resolution Image Synthesis](https://arxiv.org/abs/2307.01952) by Dustin Podell, Zion English, Kyle Lacey, Andreas Blattmann, Tim Dockhorn, Jonas Müller, Joe Penna, Robin Rombach.
-*The Stable Diffusion 2.0 release includes robust text-to-image models trained using a brand new text encoder (OpenCLIP), developed by LAION with support from Stability AI, which greatly improves the quality of the generated images compared to earlier V1 releases. The text-to-image models in this release can generate images with default resolutions of both 512x512 pixels and 768x768 pixels.
-These models are trained on an aesthetic subset of the [LAION-5B dataset](https://laion.ai/blog/laion-5b/) created by the DeepFloyd team at Stability AI, which is then further filtered to remove adult content using [LAION’s NSFW filter](https://openreview.net/forum?id=M3Y74vmsMcY).*
+The abstract of the paper is the following:
-For more details about how Stable Diffusion 2 works and how it differs from Stable Diffusion 1, please refer to the official [launch announcement post](https://stability.ai/blog/stable-diffusion-v2-release).
+*We present SDXL, a latent diffusion model for text-to-image synthesis. Compared to previous versions of Stable Diffusion, SDXL leverages a three times larger UNet backbone: The increase of model parameters is mainly due to more attention blocks and a larger cross-attention context as SDXL uses a second text encoder. We design multiple novel conditioning schemes and train SDXL on multiple aspect ratios. We also introduce a refinement model which is used to improve the visual fidelity of samples generated by SDXL using a post-hoc image-to-image technique. We demonstrate that SDXL shows drastically improved performance compared to the previous versions of Stable Diffusion and achieves results competitive with those of black-box state-of-the-art image generators.*
 ## Tips
+- Stable Diffusion XL works especially well with image sizes between 768x768 and 1024x1024.
+- The Stable Diffusion XL output image can be improved by making use of a refiner, as shown below.
+
 ### Available checkpoints:
 - *Text-to-Image (1024x1024 resolution)*: [stabilityai/stable-diffusion-xl-base-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-base-0.9) with [`StableDiffusionXLPipeline`]
 - *Image-to-Image / Refiner (1024x1024 resolution)*: [stabilityai/stable-diffusion-xl-refiner-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9) with [`StableDiffusionXLImg2ImgPipeline`]
-TODO
+## Usage Example
+
+Before using SDXL make sure to have `transformers`, `accelerate`, `safetensors` and `invisible-watermark` installed.
+You can install the libraries as follows:
+
+```
+pip install transformers
+pip install accelerate
+pip install safetensors
+pip install invisible-watermark>=2.0
+```
+
+### *Text-to-Image*
+
+You can use SDXL as follows for *text-to-image*:
+
+```py
+from diffusers import StableDiffusionXLPipeline
+import torch
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+pipe.to("cuda")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+image = pipe(prompt=prompt).images[0]
+```
+
+### Refining the image output
+
+The image can be refined by making use of [stabilityai/stable-diffusion-xl-refiner-0.9](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-0.9).
+In this case, you only have to output the `latents` from the base model.
+
+```py
+from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline
+import torch
+
+pipe = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+)
+pipe.to("cuda")
+
+refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
+)
+refiner.to("cuda")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+image = pipe(prompt=prompt, output_type="latent").images[0]
+image = refiner(prompt=prompt, image=image[None, :]).images[0]
+```
+
+### Loading single-file checkpoints / original file format
+
+By making use of [`~diffusers.loaders.FromSingleFileMixin.from_single_file`] you can also load the
+original file format into `diffusers`:
+
+```py
+from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline
+import torch
+
+# example paths to locally downloaded single-file checkpoints
+pipe = StableDiffusionXLPipeline.from_single_file(
+    "./sd_xl_base_0.9.safetensors", torch_dtype=torch.float16
+)
+pipe.to("cuda")
+
+refiner = StableDiffusionXLImg2ImgPipeline.from_single_file(
+    "./sd_xl_refiner_0.9.safetensors", torch_dtype=torch.float16
+)
+refiner.to("cuda")
+```
+
+### Memory optimization via model offloading
+
+If you are seeing out-of-memory errors, we recommend making use of [`StableDiffusionXLPipeline.enable_model_cpu_offload`].
+
+```diff
+- pipe.to("cuda")
++ pipe.enable_model_cpu_offload()
+```
+
+and
+
+```diff
+- refiner.to("cuda")
++ refiner.enable_model_cpu_offload()
+```
+
+### Speed-up inference with `torch.compile`
+
+You can speed up inference by making use of `torch.compile`. This should give you **ca.** 20% speed-up.
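The minimal change is shown in the diff below; combined with the *text-to-image* example above, a complete sketch might look as follows (expect the first call to be slow while the UNet is compiled):

```py
import torch
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-0.9", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
)
pipe.to("cuda")
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)

prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt=prompt).images[0]
```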
+ +```diff ++ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) ++ refiner.unet = torch.compile(refiner.unet, mode="reduce-overhead", fullgraph=True) +``` + +### Running with `torch` < 2.0 + +**Note** that if you want to run Stable Diffusion XL with `torch` < 2.0, please make sure to enable xformers +attention: + +``` +pip install xformers +``` + +```diff ++pipe.enable_xformers_memory_efficient_attention() ++refiner.enable_xformers_memory_efficient_attention() +``` ## StableDiffusionXLPipeline diff --git a/src/diffusers/utils/import_utils.py b/src/diffusers/utils/import_utils.py index 287992207e5a..3a7539cfb0fb 100644 --- a/src/diffusers/utils/import_utils.py +++ b/src/diffusers/utils/import_utils.py @@ -504,7 +504,7 @@ def is_invisible_watermark_available(): # docstyle-ignore INVISIBLE_WATERMARK_IMPORT_ERROR = """ -{0} requires the invisible-watermark library but it was not found in your environment. You can install it with pip: `pip install git+https://github.com/patrickvonplaten/invisible-watermark.git@remove_onnxruntime_depedency` +{0} requires the invisible-watermark library but it was not found in your environment. You can install it with pip: `pip install invisible-watermark>=2.0` """ From 51593da25aba44ed27d8680dcf0cfca4459f1e85 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 6 Jul 2023 19:28:33 +0200 Subject: [PATCH 190/199] fix main docs --- .github/workflows/build_documentation.yml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml index 8fdae99883f8..bd45b08d24f7 100644 --- a/.github/workflows/build_documentation.yml +++ b/.github/workflows/build_documentation.yml @@ -10,14 +10,13 @@ on: jobs: build: - steps: - uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main - with: - commit_sha: ${{ github.sha }} - install_libgl1: true - package: diffusers - notebook_folder: diffusers_doc - languages: en ko zh + uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main + with: + commit_sha: ${{ github.sha }} + install_libgl1: true + package: diffusers + notebook_folder: diffusers_doc + languages: en ko zh secrets: token: ${{ secrets.HUGGINGFACE_PUSH }} From 1fbcc78d6e55613b902015ff65a1d850594fa859 Mon Sep 17 00:00:00 2001 From: Yorai Levi Date: Fri, 7 Jul 2023 10:33:51 +0300 Subject: [PATCH 191/199] typo in safetensors (safetenstors) (#3976) * Update pipeline_utils.py typo in safetensors (safetenstors) * Update loaders.py typo in safetensors (safetenstors) * Update modeling_utils.py typo in safetensors (safetenstors) --- src/diffusers/loaders.py | 8 ++++---- src/diffusers/models/modeling_utils.py | 2 +- src/diffusers/pipelines/pipeline_utils.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/diffusers/loaders.py b/src/diffusers/loaders.py index a0be20c54361..525bb446b77e 100644 --- a/src/diffusers/loaders.py +++ b/src/diffusers/loaders.py @@ -177,7 +177,7 @@ def load_attn_procs(self, pretrained_model_name_or_path_or_dict: Union[str, Dict if use_safetensors and not is_safetensors_available(): raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" + "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetensors" ) allow_pickle = False @@ -589,7 +589,7 @@ def load_textual_inversion( if use_safetensors and not is_safetensors_available(): raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" + "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors" ) allow_pickle = False @@ -806,7 +806,7 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di if use_safetensors and not is_safetensors_available(): raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" + "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors" ) allow_pickle = False @@ -1054,7 +1054,7 @@ def _load_text_encoder_attn_procs( if use_safetensors and not is_safetensors_available(): raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" + "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors" ) allow_pickle = False diff --git a/src/diffusers/models/modeling_utils.py b/src/diffusers/models/modeling_utils.py index cc8df3fe6d69..1fa96514c0a9 100644 --- a/src/diffusers/models/modeling_utils.py +++ b/src/diffusers/models/modeling_utils.py @@ -456,7 +456,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P if use_safetensors and not is_safetensors_available(): raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" + "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors" ) allow_pickle = False diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 734af819c852..87c0f711a30c 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -204,7 +204,7 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi transformers_index_format = r"\d{5}-of-\d{5}" if variant is not None: - # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetenstors` + # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetensors` variant_file_re = re.compile( rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$" ) @@ -213,7 +213,7 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$" ) - # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetenstors` + # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors` non_variant_file_re = re.compile( rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$" ) @@ -1168,7 +1168,7 @@ def download(cls, pretrained_model_name, **kwargs) -> Union[str, os.PathLike]: if use_safetensors and not is_safetensors_available(): raise ValueError( - "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors" + "`use_safetensors`=True but safetensors is not installed. 
Please install safetensors with `pip install safetensors" ) allow_pickle = False From 8d8b4311b9e19c3621dbd69e6690ee7be5702f65 Mon Sep 17 00:00:00 2001 From: Omar Sanseviero Date: Fri, 7 Jul 2023 10:39:38 +0200 Subject: [PATCH 192/199] Fix code snippet for Audio Diffusion (#3987) --- docs/source/en/api/pipelines/audio_diffusion.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/en/api/pipelines/audio_diffusion.mdx b/docs/source/en/api/pipelines/audio_diffusion.mdx index 9c7725367e8f..b6d64c938060 100644 --- a/docs/source/en/api/pipelines/audio_diffusion.mdx +++ b/docs/source/en/api/pipelines/audio_diffusion.mdx @@ -43,7 +43,7 @@ pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(devic output = pipe() display(output.images[0]) -display(Audio(output.audios[0], rate=mel.get_sample_rate())) +display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate())) ``` ### Latent Audio Diffusion From 03d829d59e261b391455348cd145369077960745 Mon Sep 17 00:00:00 2001 From: Saurav Maheshkar Date: Fri, 7 Jul 2023 15:08:16 +0530 Subject: [PATCH 193/199] feat: add `Dropout` to Flax UNet (#3894) * feat: add Dropout to Flax UNet * feat: add @compact decorator * fix: drop nn.compact --- src/diffusers/models/attention_flax.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/diffusers/models/attention_flax.py b/src/diffusers/models/attention_flax.py index 4f78b324a8e2..0b160d238431 100644 --- a/src/diffusers/models/attention_flax.py +++ b/src/diffusers/models/attention_flax.py @@ -152,6 +152,7 @@ def setup(self): self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v") self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0") + self.dropout_layer = nn.Dropout(rate=self.dropout) def reshape_heads_to_batch_dim(self, tensor): batch_size, seq_len, dim = tensor.shape @@ -214,7 +215,7 @@ def __call__(self, hidden_states, context=None, deterministic=True): hidden_states = self.reshape_batch_dim_to_heads(hidden_states) hidden_states = self.proj_attn(hidden_states) - return hidden_states + return self.dropout_layer(hidden_states, deterministic=deterministic) class FlaxBasicTransformerBlock(nn.Module): @@ -260,6 +261,7 @@ def setup(self): self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype) self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype) self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype) + self.dropout_layer = nn.Dropout(rate=self.dropout) def __call__(self, hidden_states, context, deterministic=True): # self attention @@ -280,7 +282,7 @@ def __call__(self, hidden_states, context, deterministic=True): hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic) hidden_states = hidden_states + residual - return hidden_states + return self.dropout_layer(hidden_states, deterministic=deterministic) class FlaxTransformer2DModel(nn.Module): @@ -356,6 +358,8 @@ def setup(self): dtype=self.dtype, ) + self.dropout_layer = nn.Dropout(rate=self.dropout) + def __call__(self, hidden_states, context, deterministic=True): batch, height, width, channels = hidden_states.shape residual = hidden_states @@ -378,7 +382,7 @@ def __call__(self, hidden_states, context, deterministic=True): hidden_states = self.proj_out(hidden_states) hidden_states = hidden_states + residual - return hidden_states + return self.dropout_layer(hidden_states, deterministic=deterministic) class FlaxFeedForward(nn.Module): @@ -409,7 +413,7 @@ def setup(self): self.net_2 = nn.Dense(self.dim, 
dtype=self.dtype) def __call__(self, hidden_states, deterministic=True): - hidden_states = self.net_0(hidden_states) + hidden_states = self.net_0(hidden_states, deterministic=deterministic) hidden_states = self.net_2(hidden_states) return hidden_states @@ -434,8 +438,9 @@ class FlaxGEGLU(nn.Module): def setup(self): inner_dim = self.dim * 4 self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype) + self.dropout_layer = nn.Dropout(rate=self.dropout) def __call__(self, hidden_states, deterministic=True): hidden_states = self.proj(hidden_states) hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2) - return hidden_linear * nn.gelu(hidden_gelu) + return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic) From ea7d75f0e2ce51628a095f1039f737510a49547d Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Mon, 10 Jul 2023 12:58:05 +0530 Subject: [PATCH 194/199] Update training script to main, fix timesteps --- .../train_consistency_distillation.py | 79 ++++--- src/diffusers/models/unet_2d_blocks.py | 219 ------------------ .../scheduling_consistency_models.py | 3 +- 3 files changed, 52 insertions(+), 249 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index 99f7d06fa097..0937f1332639 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -5,7 +5,7 @@ import os from pathlib import Path from typing import Optional - +import shutil import accelerate import datasets import torch @@ -23,7 +23,7 @@ from diffusers import DDPMPipeline, UNet2DModel, CMStochasticIterativeScheduler, ConsistencyModelPipeline from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel -from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available +from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available, is_xformers_available #Copied from examples/unconditional_image_generation/train_unconditional.py for now @@ -281,14 +281,12 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def main(args): logging_dir = os.path.join(args.output_dir, args.logging_dir) - - accelerator_project_config = ProjectConfiguration(total_limit=args.checkpoints_total_limit) + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.logger, - logging_dir=logging_dir, project_config=accelerator_project_config, ) @@ -377,14 +375,15 @@ def load_model_hook(models, input_dir): attention_head_dim=8, down_block_types= [ "ResnetDownsampleBlock2D", - "AttnDownsampleBlock2D", + "AttnDownBlock2D", ], up_block_types= [ - "AttnUpsampleBlock2D", + "AttnUpBlock2D", "ResnetUpsampleBlock2D", ], resnet_time_scale_shift="scale_shift", - + upsample_type="resnet", + downsample_type="resnet" ) target_model = UNet2DModel( sample_size= args.resolution, @@ -396,19 +395,21 @@ def load_model_hook(models, input_dir): attention_head_dim=8, down_block_types= [ "ResnetDownsampleBlock2D", - "AttnDownsampleBlock2D", + "AttnDownBlock2D", ], up_block_types= [ - "AttnUpsampleBlock2D", + "AttnUpBlock2D", "ResnetUpsampleBlock2D", ], 
resnet_time_scale_shift="scale_shift", - + upsample_type="resnet", + downsample_type="resnet" ) else: config = UNet2DModel.load_config(args.model_config_name_or_path) model = UNet2DModel.from_config(config) target_model = UNet2DModel.from_config(config) + # load the model to distill into a consistency model teacher_model = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32").unet @@ -417,8 +418,6 @@ def load_model_hook(models, input_dir): teacher_model = teacher_model.float() noise_scheduler = CMStochasticIterativeScheduler() num_scales = 40 - noise_scheduler.set_timesteps(num_scales) - timesteps = noise_scheduler.timesteps # Create EMA for the model, this is the target model in the paper @@ -489,12 +488,12 @@ def transform_images(examples): ) # Prepare everything with our `accelerator`. - model, optimizer, train_dataloader, lr_scheduler, teacher_model, target_model, target_model_ema = accelerator.prepare( - model, optimizer, train_dataloader, lr_scheduler, teacher_model, target_model, target_model_ema + model, optimizer, noise_scheduler, train_dataloader, lr_scheduler, teacher_model, target_model, target_model_ema = accelerator.prepare( + model, optimizer, noise_scheduler, train_dataloader, lr_scheduler, teacher_model, target_model, target_model_ema ) + noise_scheduler.set_timesteps(num_scales, device=accelerator.device) target_model_ema.to(accelerator.device) - # We need to initialize the trackers we use, and also store our configuration. # The trackers initializes automatically on the main process. if accelerator.is_main_process: @@ -550,7 +549,8 @@ def transform_images(examples): disable=not accelerator.is_local_main_process, ) - + timesteps = noise_scheduler.timesteps + sigmas = noise_scheduler.sigmas # Train! for epoch in range(first_epoch, args.num_epochs): @@ -564,17 +564,18 @@ def transform_images(examples): index = torch.randint( 0, noise_scheduler.config.num_train_timesteps-1, (1,), device=clean_images.device ).long() + # timestep is the scaled timestep, sigma is the unscaled timestep timestep = timesteps[index] - timestep_prev = timestep + 1 + sigma = sigmas[index] + timestep_prev = timesteps[index+1] + sigma_prev = sigmas[index+1] + # add noise expects the scaled timestep only and internally converts to sigma noised_image = noise_scheduler.add_noise(clean_images, noise, timestep) - scaled_timesteps = noise_scheduler.scale_timestep(timestep) - scaled_timesteps_prev = noise_scheduler.scale_timestep(timestep_prev) target_model_ema.copy_to(target_model.parameters()) with accelerator.accumulate(model): # Predict the noise residual - - model_output = model(noise_scheduler.scale_model_input(noised_image, timestep), scaled_timesteps, class_labels=labels).sample + model_output = model(noise_scheduler.scale_model_input(noised_image, timestep), timestep, class_labels=labels).sample distiller = noise_scheduler.step( model_output, timestep, noised_image, use_noise=False ).prev_sample @@ -584,22 +585,22 @@ def transform_images(examples): # TODO - make this cleaner samples = noised_image x = samples - model_output = teacher_model(noise_scheduler.scale_model_input(x, timestep), scaled_timesteps, class_labels=labels).sample + model_output = teacher_model(noise_scheduler.scale_model_input(x, timestep), timestep, class_labels=labels).sample teacher_denoiser = noise_scheduler.step( model_output, timestep, x, use_noise=False ).prev_sample - d = (x - teacher_denoiser) / append_dims(scaled_timesteps, x.ndim) - samples = x + d * append_dims(scaled_timesteps_prev - scaled_timesteps, x.ndim) - 
model_output = teacher_model(noise_scheduler.scale_model_input(samples, timestep_prev), scaled_timesteps_prev, class_labels=labels).sample + d = (x - teacher_denoiser) / append_dims(sigma, x.ndim) + samples = x + d * append_dims(sigma_prev - sigma, x.ndim) + model_output = teacher_model(noise_scheduler.scale_model_input(samples, timestep_prev), timestep_prev, class_labels=labels).sample teacher_denoiser = noise_scheduler.step( model_output, timestep_prev, samples, use_noise=False ).prev_sample - next_d = (samples - teacher_denoiser) / append_dims(scaled_timesteps_prev, x.ndim) - denoised_image = x + (d + next_d) * append_dims((scaled_timesteps_prev - scaled_timesteps) /2, x.ndim) + next_d = (samples - teacher_denoiser) / append_dims(sigma_prev, x.ndim) + denoised_image = x + (d + next_d) * append_dims((sigma_prev - sigma) /2, x.ndim) # get output from target model - model_output = target_model(denoised_image, scaled_timesteps_prev, class_labels=labels).sample + model_output = target_model(noise_scheduler.scale_model_input(denoised_image, timestep_prev), timestep_prev, class_labels=labels).sample distiller_target = noise_scheduler.step( model_output, timestep_prev, denoised_image, use_noise=False ).prev_sample @@ -622,6 +623,26 @@ def transform_images(examples): global_step += 1 if global_step % args.checkpointing_steps == 0: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] + + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) + if accelerator.is_main_process: save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") accelerator.save_state(save_path) diff --git a/src/diffusers/models/unet_2d_blocks.py b/src/diffusers/models/unet_2d_blocks.py index 25ac6d7b1186..cb3452f4459c 100644 --- a/src/diffusers/models/unet_2d_blocks.py +++ b/src/diffusers/models/unet_2d_blocks.py @@ -107,20 +107,6 @@ def get_down_block( resnet_time_scale_shift=resnet_time_scale_shift, downsample_type=downsample_type, ) - elif down_block_type == "AttnDownsampleBlock2D": - return AttnDownsampleBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - add_downsample=add_downsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - downsample_padding=downsample_padding, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - ) elif down_block_type == "CrossAttnDownBlock2D": if cross_attention_dim is None: raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock2D") @@ -361,20 +347,6 @@ def get_up_block( resnet_time_scale_shift=resnet_time_scale_shift, upsample_type=upsample_type, ) - 
elif up_block_type == "AttnUpsampleBlock2D": - return AttnUpsampleBlock2D( - num_layers=num_layers, - in_channels=in_channels, - out_channels=out_channels, - prev_output_channel=prev_output_channel, - temb_channels=temb_channels, - add_upsample=add_upsample, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - resnet_groups=resnet_groups, - attn_num_head_channels=attn_num_head_channels, - resnet_time_scale_shift=resnet_time_scale_shift, - ) elif up_block_type == "SkipUpBlock2D": return SkipUpBlock2D( num_layers=num_layers, @@ -887,100 +859,6 @@ def forward(self, hidden_states, temb=None, upsample_size=None): return hidden_states, output_states -class AttnDownsampleBlock2D(nn.Module): - def __init__( - self, - in_channels: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - resnet_pre_norm: bool = True, - attn_num_head_channels=1, - output_scale_factor=1.0, - downsample_padding=1, - add_downsample=True, - ): - super().__init__() - resnets = [] - attentions = [] - - for i in range(num_layers): - in_channels = in_channels if i == 0 else out_channels - resnets.append( - ResnetBlock2D( - in_channels=in_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ) - attentions.append( - Attention( - out_channels, - heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1, - dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, - rescale_output_factor=output_scale_factor, - eps=resnet_eps, - norm_num_groups=resnet_groups, - residual_connection=True, - bias=True, - upcast_softmax=True, - _from_deprecated_attn_block=True, - ) - ) - - self.attentions = nn.ModuleList(attentions) - self.resnets = nn.ModuleList(resnets) - - if add_downsample: - self.downsamplers = nn.ModuleList( - [ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - down=True, - ) - ] - ) - else: - self.downsamplers = None - - def forward(self, hidden_states, temb=None, upsample_size=None): - output_states = () - - for resnet, attn in zip(self.resnets, self.attentions): - hidden_states = resnet(hidden_states, temb) - hidden_states = attn(hidden_states) - output_states += (hidden_states,) - - if self.downsamplers is not None: - for downsampler in self.downsamplers: - hidden_states = downsampler(hidden_states, temb) - - output_states += (hidden_states,) - - return hidden_states, output_states - - class CrossAttnDownBlock2D(nn.Module): def __init__( self, @@ -2114,103 +1992,6 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_si return hidden_states -class AttnUpsampleBlock2D(nn.Module): - def __init__( - self, - in_channels: int, - prev_output_channel: int, - out_channels: int, - temb_channels: int, - dropout: float = 0.0, - num_layers: int = 1, - resnet_eps: float = 1e-6, - resnet_time_scale_shift: str = "default", - resnet_act_fn: str = "swish", - resnet_groups: int = 32, - 
resnet_pre_norm: bool = True, - attn_num_head_channels=1, - output_scale_factor=1.0, - add_upsample=True, - ): - super().__init__() - resnets = [] - attentions = [] - - for i in range(num_layers): - res_skip_channels = in_channels if (i == num_layers - 1) else out_channels - resnet_in_channels = prev_output_channel if i == 0 else out_channels - - resnets.append( - ResnetBlock2D( - in_channels=resnet_in_channels + res_skip_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - ) - ) - attentions.append( - Attention( - out_channels, - heads=out_channels // attn_num_head_channels if attn_num_head_channels is not None else 1, - dim_head=attn_num_head_channels if attn_num_head_channels is not None else out_channels, - rescale_output_factor=output_scale_factor, - eps=resnet_eps, - norm_num_groups=resnet_groups, - residual_connection=True, - bias=True, - upcast_softmax=True, - _from_deprecated_attn_block=True, - ) - ) - - self.attentions = nn.ModuleList(attentions) - self.resnets = nn.ModuleList(resnets) - - if add_upsample: - self.upsamplers = nn.ModuleList( - [ - ResnetBlock2D( - in_channels=out_channels, - out_channels=out_channels, - temb_channels=temb_channels, - eps=resnet_eps, - groups=resnet_groups, - dropout=dropout, - time_embedding_norm=resnet_time_scale_shift, - non_linearity=resnet_act_fn, - output_scale_factor=output_scale_factor, - pre_norm=resnet_pre_norm, - up=True, - ) - ] - ) - - else: - self.upsamplers = None - - def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None): - for resnet, attn in zip(self.resnets, self.attentions): - # pop res hidden states - res_hidden_states = res_hidden_states_tuple[-1] - res_hidden_states_tuple = res_hidden_states_tuple[:-1] - hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) - - hidden_states = resnet(hidden_states, temb) - hidden_states = attn(hidden_states) - - if self.upsamplers is not None: - for upsampler in self.upsamplers: - hidden_states = upsampler(hidden_states, temb) - - return hidden_states - - class CrossAttnUpBlock2D(nn.Module): def __init__( self, diff --git a/src/diffusers/schedulers/scheduling_consistency_models.py b/src/diffusers/schedulers/scheduling_consistency_models.py index fb296054d65b..f580c28453cd 100644 --- a/src/diffusers/schedulers/scheduling_consistency_models.py +++ b/src/diffusers/schedulers/scheduling_consistency_models.py @@ -268,6 +268,7 @@ def step( sample: torch.FloatTensor, generator: Optional[torch.Generator] = None, return_dict: bool = True, + use_noise: bool = True ) -> Union[CMStochasticIterativeSchedulerOutput, Tuple]: """ Predict the sample at the previous timestep by reversing the SDE. Core function to propagate the diffusion @@ -331,7 +332,7 @@ def step( # 2. Sample z ~ N(0, s_noise^2 * I) # Noise is not used for onestep sampling. 
- if len(self.timesteps) > 1: + if len(self.timesteps) > 1 and use_noise: noise = randn_tensor( model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator ) From a32b8691b04d8f417601a2db5b032041408b162f Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Mon, 10 Jul 2023 13:08:11 +0530 Subject: [PATCH 195/199] Fix bug in timestep ordering --- .../train_consistency_distillation.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index 0937f1332639..31fe32bc70b8 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -549,8 +549,9 @@ def transform_images(examples): disable=not accelerator.is_local_main_process, ) - timesteps = noise_scheduler.timesteps - sigmas = noise_scheduler.sigmas + timesteps = noise_scheduler.timesteps + sigmas = noise_scheduler.sigmas # in reverse order, sigma0 is sigma_max + # Train! for epoch in range(first_epoch, args.num_epochs): @@ -565,10 +566,10 @@ def transform_images(examples): 0, noise_scheduler.config.num_train_timesteps-1, (1,), device=clean_images.device ).long() # timestep is the scaled timestep, sigma is the unscaled timestep - timestep = timesteps[index] - sigma = sigmas[index] - timestep_prev = timesteps[index+1] - sigma_prev = sigmas[index+1] + timestep = timesteps[index+1] + sigma = sigmas[index+1] + timestep_prev = timesteps[index] + sigma_prev = sigmas[index] # add noise expects the scaled timestep only and internally converts to sigma noised_image = noise_scheduler.add_noise(clean_images, noise, timestep) target_model_ema.copy_to(target_model.parameters()) From 8742e4e57edc92215950ffaec07af57da48ee960 Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Tue, 11 Jul 2023 12:57:52 +0530 Subject: [PATCH 196/199] Add review suggestions --- examples/consistency_models/requirements.txt | 2 + examples/consistency_models/script.sh | 3 - .../train_consistency_distillation.py | 65 ++++--------------- 3 files changed, 13 insertions(+), 57 deletions(-) delete mode 100644 examples/consistency_models/script.sh diff --git a/examples/consistency_models/requirements.txt b/examples/consistency_models/requirements.txt index f366720afd11..bc7b6a7f238b 100644 --- a/examples/consistency_models/requirements.txt +++ b/examples/consistency_models/requirements.txt @@ -1,3 +1,5 @@ accelerate>=0.16.0 torchvision datasets +wandb +tensrboard diff --git a/examples/consistency_models/script.sh b/examples/consistency_models/script.sh deleted file mode 100644 index dfe6bcc5b7a0..000000000000 --- a/examples/consistency_models/script.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -accelerate launch train_consistency_distillation.py --dataset_name="cifar10" --resolution=32 --center_crop --random_flip --output_dir="cifar10-32" --train_batch_size=16 --num_epochs=100 --gradient_accumulation_steps=1 --learning_rate=1e-4 --lr_warmup_steps=500 --mixed_precision=no --push_to_hub \ No newline at end of file diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index 31fe32bc70b8..29cdf725e008 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -18,7 
+18,7 @@ from packaging import version from torchvision import transforms from tqdm.auto import tqdm - +import wandb import diffusers from diffusers import DDPMPipeline, UNet2DModel, CMStochasticIterativeScheduler, ConsistencyModelPipeline from diffusers.optimization import get_scheduler @@ -33,35 +33,6 @@ logger = get_logger(__name__, log_level="INFO") -def append_dims(x, target_dims): - """Appends dimensions to the end of a tensor until it has target_dims dimensions.""" - dims_to_append = target_dims - x.ndim - if dims_to_append < 0: - raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less") - return x[(...,) + (None,) * dims_to_append] - - - - - - -def _extract_into_tensor(arr, timesteps, broadcast_shape): - """ - Extract values from a 1-D numpy array for a batch of indices. - - :param arr: the 1-D numpy array. - :param timesteps: a tensor of indices into the array to extract. - :param broadcast_shape: a larger shape of K dimensions with the batch - dimension equal to the length of timesteps. - :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. - """ - if not isinstance(arr, torch.Tensor): - arr = torch.from_numpy(arr) - res = arr[timesteps].float().to(timesteps.device) - while len(res.shape) < len(broadcast_shape): - res = res[..., None] - return res.expand(broadcast_shape) - def parse_args(): parser = argparse.ArgumentParser(description="Simple example of a training script.") @@ -290,15 +261,6 @@ def main(args): project_config=accelerator_project_config, ) - if args.logger == "tensorboard": - if not is_tensorboard_available(): - raise ImportError("Make sure to install tensorboard if you want to use it for logging during training.") - - elif args.logger == "wandb": - if not is_wandb_available(): - raise ImportError("Make sure to install wandb if you want to use it for logging during training.") - import wandb - # `accelerate` 0.16.0 will have better support for customized saving if version.parse(accelerate.__version__) >= version.parse("0.16.0"): # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format @@ -413,9 +375,6 @@ def load_model_hook(models, input_dir): # load the model to distill into a consistency model teacher_model = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32").unet - model = model.float() - target_model = target_model.float() # TODO : support half precision training - teacher_model = teacher_model.float() noise_scheduler = CMStochasticIterativeScheduler() num_scales = 40 @@ -586,24 +545,22 @@ def transform_images(examples): # TODO - make this cleaner samples = noised_image x = samples - model_output = teacher_model(noise_scheduler.scale_model_input(x, timestep), timestep, class_labels=labels).sample + teacher_model_output = teacher_model(noise_scheduler.scale_model_input(x, timestep), timestep, class_labels=labels).sample teacher_denoiser = noise_scheduler.step( - model_output, timestep, x, use_noise=False + teacher_model_output, timestep, x, use_noise=False ).prev_sample - d = (x - teacher_denoiser) / append_dims(sigma, x.ndim) - samples = x + d * append_dims(sigma_prev - sigma, x.ndim) - model_output = teacher_model(noise_scheduler.scale_model_input(samples, timestep_prev), timestep_prev, class_labels=labels).sample + d = (x - teacher_denoiser) / sigma[(...,) + (None,) * 3] + samples = x + d * (sigma_prev - sigma)[(...,) + (None,) * 3] + teacher_model_output = teacher_model(noise_scheduler.scale_model_input(samples, timestep_prev), timestep_prev, 
class_labels=labels).sample teacher_denoiser = noise_scheduler.step( - model_output, timestep_prev, samples, use_noise=False + teacher_model_output, timestep_prev, samples, use_noise=False ).prev_sample - - next_d = (samples - teacher_denoiser) / append_dims(sigma_prev, x.ndim) - denoised_image = x + (d + next_d) * append_dims((sigma_prev - sigma) /2, x.ndim) - + next_d = (samples - teacher_denoiser) / sigma_prev[(...,) + (None,) * 3] + denoised_image = x + (d + next_d) * ((sigma_prev - sigma) /2)[(...,) + (None,) * 3] # get output from target model - model_output = target_model(noise_scheduler.scale_model_input(denoised_image, timestep_prev), timestep_prev, class_labels=labels).sample + target_model_output = target_model(noise_scheduler.scale_model_input(denoised_image, timestep_prev), timestep_prev, class_labels=labels).sample distiller_target = noise_scheduler.step( - model_output, timestep_prev, denoised_image, use_noise=False + target_model_output, timestep_prev, denoised_image, use_noise=False ).prev_sample loss = F.mse_loss(distiller, distiller_target) From 943c88b05b629abd5dd4b604464040dee6e689be Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Wed, 12 Jul 2023 12:30:16 +0530 Subject: [PATCH 197/199] Integrate accelerator better, change model upload --- .../train_consistency_distillation.py | 131 +++++++++++++----- 1 file changed, 97 insertions(+), 34 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index 29cdf725e008..9cd1a4ef56a3 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -14,7 +14,7 @@ from accelerate.logging import get_logger from accelerate.utils import ProjectConfiguration from datasets import load_dataset -from huggingface_hub import HfFolder, Repository, create_repo, whoami +from huggingface_hub import HfFolder, Repository, create_repo, whoami, upload_folder from packaging import version from torchvision import transforms from tqdm.auto import tqdm @@ -34,6 +34,39 @@ logger = get_logger(__name__, log_level="INFO") +def save_model_card( + repo_id: str, + images=None, + base_model=str, + repo_folder=None, + pipeline: ConsistencyModelPipeline = None, +): + img_str = "" + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" + + yaml = f""" +--- +license: creativeml-openrail-m +base_model: {base_model} +tags: +- consistency models +- diffusers +inference: true +--- + """ + model_card = f""" +# Consistency Model - {repo_id} + +This is a consistency model distilled from {base_model}. +You can find some example images in the following. 
\n +{img_str} +""" + with open(os.path.join(repo_folder, "README.md"), "w") as f: + f.write(yaml + model_card) + + def parse_args(): parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( @@ -58,6 +91,13 @@ def parse_args(): default=None, help="The config of the UNet model to train, leave as None to use standard Consistency configuration.", ) + parser.add_argument( + "--pretrained_teacher_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models to be used as teacher model", + ) parser.add_argument( "--train_data_dir", type=str, @@ -314,14 +354,9 @@ def load_model_hook(models, input_dir): repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id - create_repo(repo_name, exist_ok=True, token=args.hub_token) + repo_id = create_repo(repo_name, exist_ok=True, token=args.hub_token) repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) - with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: - if "step_*" not in gitignore: - gitignore.write("step_*\n") - if "epoch_*" not in gitignore: - gitignore.write("epoch_*\n") elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) @@ -374,10 +409,27 @@ def load_model_hook(models, input_dir): # load the model to distill into a consistency model - teacher_model = DDPMPipeline.from_pretrained("google/ddpm-cifar10-32").unet + teacher_model = DDPMPipeline.from_pretrained(args.pretrained_teacher_model_name_or_path).unet noise_scheduler = CMStochasticIterativeScheduler() num_scales = 40 + # Check that all trainable models are in full precision + low_precision_error_string = ( + "Please make sure to always have all model weights in full float32 precision when starting training - even if" + " doing mixed precision training. copy of the weights should still be float32." + ) + + if accelerator.unwrap_model(model).dtype != torch.float32: + raise ValueError( + f"Unet loaded as datatype {accelerator.unwrap_model(model).dtype}. {low_precision_error_string}" + ) + + if args.train_text_encoder and accelerator.unwrap_model(teacher_model).dtype != torch.float32: + raise ValueError( + f"Text encoder loaded as datatype {accelerator.unwrap_model(teacher_model).dtype}." + f" {low_precision_error_string}" + ) + # Create EMA for the model, this is the target model in the paper target_model_ema = EMAModel( @@ -452,12 +504,10 @@ def transform_images(examples): ) noise_scheduler.set_timesteps(num_scales, device=accelerator.device) - target_model_ema.to(accelerator.device) # We need to initialize the trackers we use, and also store our configuration. # The trackers initializes automatically on the main process. 
if accelerator.is_main_process: - run = os.path.split(__file__)[-1].split(".")[0] - accelerator.init_trackers(run) + accelerator.init_trackers("consistency-distillation", vars(args)) total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) @@ -519,10 +569,10 @@ def transform_images(examples): clean_images = batch["input"] labels = batch["labels"] # Sample noise that we'll add to the images - noise = torch.randn(clean_images.shape).to(clean_images.device) + noise = torch.randn(clean_images.shape).to(accelerator.device) # Sample a random timestep for each image, TODO - allow different timesteps in a batch index = torch.randint( - 0, noise_scheduler.config.num_train_timesteps-1, (1,), device=clean_images.device + 0, noise_scheduler.config.num_train_timesteps-1, (1,), device=accelerator.device ).long() # timestep is the scaled timestep, sigma is the unscaled timestep timestep = timesteps[index+1] @@ -580,31 +630,31 @@ def transform_images(examples): progress_bar.update(1) global_step += 1 - if global_step % args.checkpointing_steps == 0: - # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` - if args.checkpoints_total_limit is not None: - checkpoints = os.listdir(args.output_dir) - checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] - checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) + if accelerator.is_main_process: + if global_step % args.checkpointing_steps == 0: + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` + if args.checkpoints_total_limit is not None: + checkpoints = os.listdir(args.output_dir) + checkpoints = [d for d in checkpoints if d.startswith("checkpoint")] + checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1])) - # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints - if len(checkpoints) >= args.checkpoints_total_limit: - num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 - removing_checkpoints = checkpoints[0:num_to_remove] + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints + if len(checkpoints) >= args.checkpoints_total_limit: + num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1 + removing_checkpoints = checkpoints[0:num_to_remove] - logger.info( - f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" - ) - logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") + logger.info( + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" + ) + logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}") - for removing_checkpoint in removing_checkpoints: - removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) - shutil.rmtree(removing_checkpoint) + for removing_checkpoint in removing_checkpoints: + removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint) + shutil.rmtree(removing_checkpoint) - if accelerator.is_main_process: - save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") - accelerator.save_state(save_path) - logger.info(f"Saved state to {save_path}") + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}") + accelerator.save_state(save_path) + logger.info(f"Saved state to 
{save_path}") logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step} logs["ema_decay"] = target_model_ema.cur_decay_value @@ -671,6 +721,19 @@ def transform_images(examples): target_model_ema.restore(unet.parameters()) if args.push_to_hub: + save_model_card( + repo_id, + images=images, + base_model=args.pretrained_teacher_model_name_or_path, + repo_folder=args.output_dir, + pipeline=pipeline, + ) + upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ignore_patterns=["step_*", "epoch_*"], + ) repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False) accelerator.end_training() From 6b58d8195f60c4fb9eabaac10cdd0c41d23a7d2c Mon Sep 17 00:00:00 2001 From: Ayush Mangal <43698245+ayushtues@users.noreply.github.com> Date: Thu, 13 Jul 2023 12:44:07 +0530 Subject: [PATCH 198/199] Fix checkpointing and add test --- .../train_consistency_distillation.py | 100 +++++++++++------- examples/test_examples.py | 23 ++++ 2 files changed, 86 insertions(+), 37 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index 9cd1a4ef56a3..8fb306cd514b 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -199,6 +199,8 @@ def parse_args(): parser.add_argument("--ema_power", type=float, default=3 / 4, help="The power value for the EMA decay.") parser.add_argument("--ema_max_decay", type=float, default=0.9999, help="The maximum decay magnitude for EMA.") parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--testing", action="store_true", help="If running a test") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") parser.add_argument( "--hub_model_id", @@ -354,14 +356,23 @@ def load_model_hook(models, input_dir): repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token) else: repo_name = args.hub_model_id - repo_id = create_repo(repo_name, exist_ok=True, token=args.hub_token) + repo_id = create_repo(repo_name, exist_ok=True, token=args.hub_token).repo_id repo = Repository(args.output_dir, clone_from=repo_name, token=args.hub_token) elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) - # Initialize the model, using a smaller model than the one defined in the original paper by default - if args.model_config_name_or_path is None: + # For testing use a dummy model + if args.testing: + config = UNet2DModel.load_config('diffusers/consistency-models-test', subfolder="test_unet") + elif args.model_config_name_or_path is not None: + config = UNet2DModel.load_config(args.model_config_name_or_path) + # Use the config if provided, model and target model have the same structure + if config is not None: + model = UNet2DModel.from_config(config) + target_model = UNet2DModel.from_config(config) + # Otherwise, use a default config + else: model = UNet2DModel( sample_size= args.resolution, in_channels=3, @@ -401,15 +412,12 @@ def load_model_hook(models, input_dir): resnet_time_scale_shift="scale_shift", upsample_type="resnet", downsample_type="resnet" - ) + ) + if args.testing: + teacher_model = UNet2DModel.from_config(config) else: - config = UNet2DModel.load_config(args.model_config_name_or_path) - model = UNet2DModel.from_config(config) - target_model = 
UNet2DModel.from_config(config) - - - # load the model to distill into a consistency model - teacher_model = DDPMPipeline.from_pretrained(args.pretrained_teacher_model_name_or_path).unet + # load the model to distill into a consistency model + teacher_model = DDPMPipeline.from_pretrained(args.pretrained_teacher_model_name_or_path).unet noise_scheduler = CMStochasticIterativeScheduler() num_scales = 40 @@ -421,12 +429,12 @@ def load_model_hook(models, input_dir): if accelerator.unwrap_model(model).dtype != torch.float32: raise ValueError( - f"Unet loaded as datatype {accelerator.unwrap_model(model).dtype}. {low_precision_error_string}" + f"Consistency Model loaded as datatype {accelerator.unwrap_model(model).dtype}. {low_precision_error_string}" ) - if args.train_text_encoder and accelerator.unwrap_model(teacher_model).dtype != torch.float32: + if accelerator.unwrap_model(teacher_model).dtype != torch.float32: raise ValueError( - f"Text encoder loaded as datatype {accelerator.unwrap_model(teacher_model).dtype}." + f"Teacher_model loaded as datatype {accelerator.unwrap_model(teacher_model).dtype}." f" {low_precision_error_string}" ) @@ -442,7 +450,7 @@ def load_model_hook(models, input_dir): model_config=model.config, ) - # Initialize the optimizer + # Initialize the optimizer # TODO: Change this to match the paper, RAdam optimizer = torch.optim.AdamW( model.parameters(), lr=args.learning_rate, @@ -480,12 +488,16 @@ def load_model_hook(models, input_dir): ) def transform_images(examples): - images = [augmentations(image.convert("RGB")) for image in examples["img"]] + img_key = "image" if "image" in examples else "img" + images = [augmentations(image.convert("RGB")) for image in examples[img_key]] labels = [torch.tensor(label) for label in examples["label"]] return {"input": images, "labels": labels} + + logger.info(f"Dataset size: {len(dataset)}") dataset.set_transform(transform_images) + train_dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers ) @@ -499,10 +511,11 @@ def transform_images(examples): ) # Prepare everything with our `accelerator`. - model, optimizer, noise_scheduler, train_dataloader, lr_scheduler, teacher_model, target_model, target_model_ema = accelerator.prepare( - model, optimizer, noise_scheduler, train_dataloader, lr_scheduler, teacher_model, target_model, target_model_ema + model, optimizer, noise_scheduler, train_dataloader, lr_scheduler, teacher_model, target_model = accelerator.prepare( + model, optimizer, noise_scheduler, train_dataloader, lr_scheduler, teacher_model, target_model ) noise_scheduler.set_timesteps(num_scales, device=accelerator.device) + target_model_ema.to(accelerator.device) # TODO accelerate.prepare doesn't work on this for some reason # We need to initialize the trackers we use, and also store our configuration. # The trackers initializes automatically on the main process. 
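# A minimal sketch, under the same assumptions as the hunk above, of keeping the EMA wrapper
# outside accelerator.prepare(): only the trainable module and optimizer go through prepare(),
# while the EMA shadow weights are moved to the training device by hand. The Linear model and
# AdamW settings here are toy placeholders, not the script's UNets.
import torch
from accelerate import Accelerator
from diffusers.training_utils import EMAModel

accelerator = Accelerator()
model = torch.nn.Linear(4, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
ema = EMAModel(model.parameters(), decay=0.9999)

model, optimizer = accelerator.prepare(model, optimizer)
ema.to(accelerator.device)  # EMAModel is not an nn.Module, so prepare() cannot move it

for _ in range(3):
    optimizer.zero_grad()
    loss = model(torch.randn(2, 4, device=accelerator.device)).pow(2).mean()
    accelerator.backward(loss)
    optimizer.step()
    ema.step(model.parameters())  # update the shadow weights after each optimizer step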
@@ -668,7 +681,6 @@ def transform_images(examples): if accelerator.is_main_process: if epoch % args.save_images_epochs == 0 or epoch == args.num_epochs - 1: unet = accelerator.unwrap_model(model) - target_model_ema.store(unet.parameters()) target_model_ema.copy_to(unet.parameters()) @@ -683,7 +695,7 @@ def transform_images(examples): generator=generator, batch_size=args.eval_batch_size, num_inference_steps=1, - output_type="numpy", + output_type="np", ).images target_model_ema.restore(unet.parameters()) @@ -707,7 +719,6 @@ def transform_images(examples): if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1: # save the model unet = accelerator.unwrap_model(model) - target_model_ema.store(unet.parameters()) target_model_ema.copy_to(unet.parameters()) @@ -717,24 +728,39 @@ def transform_images(examples): ) pipeline.save_pretrained(args.output_dir) - target_model_ema.restore(unet.parameters()) + + if accelerator.is_main_process and args.push_to_hub: + unet = accelerator.unwrap_model(model) + target_model_ema.copy_to(unet.parameters()) + + pipeline = ConsistencyModelPipeline( + unet=unet, + scheduler=noise_scheduler, + ) - if args.push_to_hub: - save_model_card( - repo_id, - images=images, - base_model=args.pretrained_teacher_model_name_or_path, - repo_folder=args.output_dir, - pipeline=pipeline, - ) - upload_folder( - repo_id=repo_id, - folder_path=args.output_dir, - commit_message="End of training", - ignore_patterns=["step_*", "epoch_*"], - ) - repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False) + generator = torch.Generator(device=pipeline.device).manual_seed(0) + # run pipeline in inference (sample random noise and denoise) + images = pipeline( + generator=generator, + batch_size=args.eval_batch_size, + num_inference_steps=1, + output_type="pil", + ).images + + save_model_card( + repo_id, + images=images, + base_model=args.pretrained_teacher_model_name_or_path, + repo_folder=args.output_dir, + pipeline=pipeline, + ) + upload_folder( + repo_id=repo_id, + folder_path=args.output_dir, + commit_message="End of training", + ignore_patterns=["step_*", "epoch_*"], + ) accelerator.end_training() diff --git a/examples/test_examples.py b/examples/test_examples.py index d11841350064..f1096f9e1289 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -96,6 +96,29 @@ def test_train_unconditional(self): self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.bin"))) self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json"))) + def test_train_consistency(self): + with tempfile.TemporaryDirectory() as tmpdir: + test_args = f""" + examples/consistency_models/train_consistency_distillation.py + --dataset_name hf-internal-testing/dummy_image_class_data + --resolution 32 + --output_dir {tmpdir} + --train_batch_size 2 + --num_epochs 1 + --gradient_accumulation_steps 1 + --learning_rate 1e-3 + --lr_warmup_steps 5 + --testing + --pretrained_teacher_model_name_or_path google/ddpm-cifar10-32 + """.split() + + run_command(self._launch_args + test_args, return_stdout=True) + # save_pretrained smoke test + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.bin"))) + self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json"))) + + + def test_textual_inversion(self): with tempfile.TemporaryDirectory() as tmpdir: test_args = f""" From 180b3e80a4f60341d4ff139156e7330290d3b9f3 Mon Sep 17 00:00:00 2001 From: Ayush Mangal 
<43698245+ayushtues@users.noreply.github.com> Date: Wed, 19 Jul 2023 16:16:41 +0530 Subject: [PATCH 199/199] Remove hardcoded configs, add DiffusionPipeline --- .../train_consistency_distillation.py | 73 ++++--------------- 1 file changed, 15 insertions(+), 58 deletions(-) diff --git a/examples/consistency_models/train_consistency_distillation.py b/examples/consistency_models/train_consistency_distillation.py index 8fb306cd514b..551b5d0c1b1e 100644 --- a/examples/consistency_models/train_consistency_distillation.py +++ b/examples/consistency_models/train_consistency_distillation.py @@ -20,7 +20,7 @@ from tqdm.auto import tqdm import wandb import diffusers -from diffusers import DDPMPipeline, UNet2DModel, CMStochasticIterativeScheduler, ConsistencyModelPipeline +from diffusers import DiffusionPipeline, UNet2DModel, CMStochasticIterativeScheduler, ConsistencyModelPipeline from diffusers.optimization import get_scheduler from diffusers.training_utils import EMAModel from diffusers.utils import check_min_version, is_accelerate_version, is_tensorboard_available, is_wandb_available, is_xformers_available @@ -362,63 +362,19 @@ def load_model_hook(models, input_dir): elif args.output_dir is not None: os.makedirs(args.output_dir, exist_ok=True) - # For testing use a dummy model - if args.testing: - config = UNet2DModel.load_config('diffusers/consistency-models-test', subfolder="test_unet") - elif args.model_config_name_or_path is not None: - config = UNet2DModel.load_config(args.model_config_name_or_path) # Use the config if provided, model and target model have the same structure - if config is not None: - model = UNet2DModel.from_config(config) - target_model = UNet2DModel.from_config(config) - # Otherwise, use a default config - else: - model = UNet2DModel( - sample_size= args.resolution, - in_channels=3, - out_channels=3, - layers_per_block=2, - num_class_embeds=1000, - block_out_channels= [32, 64], - attention_head_dim=8, - down_block_types= [ - "ResnetDownsampleBlock2D", - "AttnDownBlock2D", - ], - up_block_types= [ - "AttnUpBlock2D", - "ResnetUpsampleBlock2D", - ], - resnet_time_scale_shift="scale_shift", - upsample_type="resnet", - downsample_type="resnet" - ) - target_model = UNet2DModel( - sample_size= args.resolution, - in_channels=3, - out_channels=3, - layers_per_block=2, - num_class_embeds=1000, - block_out_channels= [32, 64], - attention_head_dim=8, - down_block_types= [ - "ResnetDownsampleBlock2D", - "AttnDownBlock2D", - ], - up_block_types= [ - "AttnUpBlock2D", - "ResnetUpsampleBlock2D", - ], - resnet_time_scale_shift="scale_shift", - upsample_type="resnet", - downsample_type="resnet" - ) - if args.testing: - teacher_model = UNet2DModel.from_config(config) + if args.model_config_name_or_path is not None: + config = UNet2DModel.load_config(args.model_config_name_or_path) + # Else use a default config else: - # load the model to distill into a consistency model - teacher_model = DDPMPipeline.from_pretrained(args.pretrained_teacher_model_name_or_path).unet + config = UNet2DModel.load_config("ayushtues/consistency_tiny_unet") + model = UNet2DModel.from_config(config) + target_model = UNet2DModel.from_config(config) noise_scheduler = CMStochasticIterativeScheduler() + # load the model to distill into a consistency model + teacher_pipeline = DiffusionPipeline.from_pretrained(args.pretrained_teacher_model_name_or_path) + teacher_model = teacher_pipeline.unet + teacher_scheduler = teacher_pipeline.scheduler num_scales = 40 # Check that all trainable models are in full precision @@ -493,11 
+449,8 @@ def transform_images(examples): labels = [torch.tensor(label) for label in examples["label"]] return {"input": images, "labels": labels} - - logger.info(f"Dataset size: {len(dataset)}") dataset.set_transform(transform_images) - train_dataloader = torch.utils.data.DataLoader( dataset, batch_size=args.train_batch_size, shuffle=True, num_workers=args.dataloader_num_workers ) @@ -572,6 +525,7 @@ def transform_images(examples): ) timesteps = noise_scheduler.timesteps + teacher_scheduler.set_timesteps(timesteps=timesteps, device=accelerator.device) sigmas = noise_scheduler.sigmas # in reverse order, sigma0 is sigma_max @@ -614,6 +568,9 @@ def transform_images(examples): ).prev_sample d = (x - teacher_denoiser) / sigma[(...,) + (None,) * 3] samples = x + d * (sigma_prev - sigma)[(...,) + (None,) * 3] + # We probably want to use Sigma for an arbitrary teacher model here, since that corresponds to the unscaled timestep + # We just want a denoised image from an input x, t using the teacher model, since that is used in the score function + # So we should figure out how to get the denoised image from the teacher model teacher_model_output = teacher_model(noise_scheduler.scale_model_input(samples, timestep_prev), timestep_prev, class_labels=labels).sample teacher_denoiser = noise_scheduler.step( teacher_model_output, timestep_prev, samples, use_noise=False
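# A toy illustration (random tensors, not the script's variables) of the Euler step in the hunk
# above: given the teacher's denoised estimate, the derivative is d = (x - denoised) / sigma and
# x is moved from noise level sigma to sigma_prev. The (...,) + (None,) * 3 indexing reshapes a
# per-sample sigma of shape (B,) to (B, 1, 1, 1) so it broadcasts over image tensors.
import torch

batch, channels, height, width = 2, 3, 32, 32
x = torch.randn(batch, channels, height, width)          # noisy samples at noise level sigma
denoised = torch.randn(batch, channels, height, width)   # stand-in for the teacher's denoised output
sigma = torch.full((batch,), 2.0)                        # current noise level
sigma_prev = torch.full((batch,), 1.5)                   # next, smaller noise level

d = (x - denoised) / sigma[(...,) + (None,) * 3]                  # per-sample dx/dsigma
x_prev = x + d * (sigma_prev - sigma)[(...,) + (None,) * 3]       # one Euler step toward sigma_prev
print(x_prev.shape)  # torch.Size([2, 3, 32, 32])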