From d668cbdad25980200bbed271872e93444d74d609 Mon Sep 17 00:00:00 2001 From: kimmishi Date: Mon, 11 Dec 2023 14:27:37 +0800 Subject: [PATCH 1/6] add scliced run --- torchdistpackage/__init__.py | 2 +- torchdistpackage/utils.py | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/torchdistpackage/__init__.py b/torchdistpackage/__init__.py index 0f10bdf..803b83c 100644 --- a/torchdistpackage/__init__.py +++ b/torchdistpackage/__init__.py @@ -11,7 +11,7 @@ from .dist.sharded_ema import ShardedEMA -from .utils import fix_rand +from .utils import fix_rand, sliced_run from .tools.module_profiler import report_prof, register_profile_hooks, get_model_profile diff --git a/torchdistpackage/utils.py b/torchdistpackage/utils.py index b8b6641..32b65e3 100644 --- a/torchdistpackage/utils.py +++ b/torchdistpackage/utils.py @@ -62,4 +62,25 @@ def partition_params(model, num_partitions, return_dict=False): if elcnt > numel_per_partition: partition_id+=1 elcnt=0 - return partitions \ No newline at end of file + return partitions + +def sliced_run(fn, input: torch.Tensor, micro_bs): + """slice the input into several micro batches and run forward, with no_grad + + Args: + fn (function): forward function + input (torch.Tensor): tensor + micro_bs (integer): micro batchsize + + Returns: + tensor + """ + chunk_outs = [] + with torch.no_grad(): + chunks = input.split(micro_bs, 0) + for chunk in chunks: + chunk_out = fn(chunk) + chunk_outs.append(chunk_out) + + out = torch.cat(chunk_outs, dim=0) + return out \ No newline at end of file From a8d56bcee7d11bbfaaaa96c7a8588383d9def5b0 Mon Sep 17 00:00:00 2001 From: kimmishi Date: Wed, 10 Jan 2024 15:04:59 +0800 Subject: [PATCH 2/6] update debug nan --- examples/diffusion/train_unet.py | 130 +++++++++++++++++++++++++++ torchdistpackage/__init__.py | 3 +- torchdistpackage/tools/bminf_int8.py | 4 +- torchdistpackage/tools/bnb_fc.py | 3 +- torchdistpackage/tools/debug_nan.py | 20 +++-- 5 files changed, 147 insertions(+), 13 deletions(-) create mode 100644 examples/diffusion/train_unet.py diff --git a/examples/diffusion/train_unet.py b/examples/diffusion/train_unet.py new file mode 100644 index 0000000..277b0b9 --- /dev/null +++ b/examples/diffusion/train_unet.py @@ -0,0 +1,130 @@ +import diffusers +from diffusers import ( + AutoencoderKL, + DDPMScheduler, + StableDiffusionXLPipeline, + UNet2DConditionModel, +) + +import torch +import torch.nn as nn +import torch.nn.functional as F + +import time +import functools + +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP +from torch.distributed.fsdp import ShardingStrategy + +from torchdistpackage import setup_distributed_slurm +import sdto + +def enable_tf32(): + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + + +def train(model, vae, optimizer_class, batchsize, use_zero=False, use_amp=True, h=512, w=512, is_xl=False, + input_grad=False): + dt=torch.float32 + + if not is_xl: + timesteps = torch.arange(batchsize, dtype=torch.int64).cuda()+100 + prompt_embeds = torch.rand([batchsize,77,768], dtype=dt).cuda() + time_ids = torch.rand([batchsize,6], dtype=dt).cuda() + text_embeds = torch.rand([batchsize,1280], dtype=dt).cuda() + encoder_hidden_states = torch.rand([batchsize,77,768], dtype=dt).cuda() + + else: + timesteps = torch.arange(batchsize, dtype=torch.int64).cuda()+100 + prompt_embeds = torch.rand([batchsize,77,2048], dtype=dt).cuda() + time_ids = torch.rand([batchsize,6], dtype=dt).cuda() + text_embeds = torch.rand([batchsize,1280], dtype=dt).cuda() + + + # model.to(dt) + model.cuda() + + model_input = torch.rand([batchsize, 3, h, w], dtype=torch.float32).cuda() + + # noisy_model_input = torch.rand([batchsize, 4, h//8, w//8], dtype=dt, device='cuda').requires_grad_(input_grad) + unet_added_conditions = { + "time_ids": time_ids, + "text_embeds": text_embeds + } + + + # model.enable_gradient_checkpointing() + # model.enable_xformers_memory_efficient_attention() + model, vae, opt, _ = sdto.compile(model, vae, compile_vae=True) + + perf_times = [] + if use_zero: + model = FSDP(model, sharding_strategy=torch.distributed.fsdp.ShardingStrategy.SHARD_GRAD_OP) + opt =optimizer_class(model.parameters()) + else: + opt =optimizer_class(model.parameters()) + + from torch.profiler import profile, record_function, ProfilerActivity + prof = torch.profiler.profile( + schedule=torch.profiler.schedule(wait=0, warmup=5, active=1, repeat=1), + on_trace_ready=torch.profiler.tensorboard_trace_handler('./bnb_int8'), + activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], + record_shapes=True, + with_stack=True, + profile_memory=True) + # prof.start() + for ind in range(10): + torch.cuda.synchronize() + beg = time.time() + + # noisy_model_input = vae.encode(model_input).latent_dist.sample().mul_(0.18215) + noisy_model_input = sdto.sliced_vae(vae, model_input, use_autocast=True) + + with torch.autocast(dtype=torch.float16, device_type='cuda', enabled=use_amp): + if is_xl: + model_pred = model( + noisy_model_input, timesteps, prompt_embeds, added_cond_kwargs=unet_added_conditions + ).sample + # print("after fwd", torch.cuda.memory_allocated()/1e9, torch.cuda.max_memory_allocated()/1e9) + # import pdb;pdb.set_trace() + loss = F.mse_loss(model_pred.float(), torch.rand_like(model_pred).float(), reduction="mean") + else: + model_pred = model(noisy_model_input, timesteps, encoder_hidden_states).sample + # print("after fwd", torch.cuda.memory_allocated()/1e9, torch.cuda.max_memory_allocated()/1e9) + # import pdb;pdb.set_trace() + loss = F.mse_loss(model_pred.float(), torch.rand_like(model_pred).float(), reduction="mean") + + loss.backward() + # print("after bwd", torch.cuda.max_memory_allocated()/1e9) + + opt.step() + opt.zero_grad() + torch.cuda.synchronize() + # prof.step() + if ind>5: + perf_times.append(time.time()-beg) + beg=time.time() + print("max mem", torch.cuda.max_memory_allocated()/1e9) + print(perf_times) + +enable_tf32() +rank, world_size, port, addr=setup_distributed_slurm() + +pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" + +# pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" + +unet = UNet2DConditionModel.from_pretrained( + pretrained_model_name_or_path, subfolder="unet" +).cuda() +unet.train() +vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae").cuda() + +# print(unet) +# optimizer_class = FusedAdam +optimizer_class = functools.partial(torch.optim.Adam,fused = True) + +train(unet, vae, optimizer_class, 96, + use_amp=False, use_zero=False, h=576, w=1024, is_xl ='xl' in pretrained_model_name_or_path, + input_grad=True) \ No newline at end of file diff --git a/torchdistpackage/__init__.py b/torchdistpackage/__init__.py index 803b83c..2d28608 100644 --- a/torchdistpackage/__init__.py +++ b/torchdistpackage/__init__.py @@ -17,4 +17,5 @@ from .tools.module_replace import replace_all_module from .tools.bnb_fc import replace_linear_by_bnb -from .tools.bminf_int8 import replace_linear_by_bminf \ No newline at end of file +from .tools.bminf_int8 import replace_linear_by_bminf +from .tools.debug_nan import register_abnormal_val_hooks,check_model_params \ No newline at end of file diff --git a/torchdistpackage/tools/bminf_int8.py b/torchdistpackage/tools/bminf_int8.py index 8c9b4fa..476289d 100644 --- a/torchdistpackage/tools/bminf_int8.py +++ b/torchdistpackage/tools/bminf_int8.py @@ -1,13 +1,13 @@ import torch -import bminf - from .module_replace import replace_all_module def if_replace_linear(name, module): return isinstance(module, (torch.nn.Linear)) def get_new_module(name, module): + import bminf + new_linear = bminf.QuantizedLinear(module) return new_linear diff --git a/torchdistpackage/tools/bnb_fc.py b/torchdistpackage/tools/bnb_fc.py index d68900b..dddf662 100644 --- a/torchdistpackage/tools/bnb_fc.py +++ b/torchdistpackage/tools/bnb_fc.py @@ -1,6 +1,5 @@ import torch import functools -from bitsandbytes.nn import Linear8bitLt from .module_replace import replace_all_module @@ -8,6 +7,8 @@ def if_replace_linear(name, module): return isinstance(module, (torch.nn.Linear)) def get_new_module(name, module, bnb_kwargs={}): + from bitsandbytes.nn import Linear8bitLt + old_shape = module.weight.shape indim = old_shape[1] outdim = old_shape[0] diff --git a/torchdistpackage/tools/debug_nan.py b/torchdistpackage/tools/debug_nan.py index 78e0288..e478738 100644 --- a/torchdistpackage/tools/debug_nan.py +++ b/torchdistpackage/tools/debug_nan.py @@ -25,7 +25,7 @@ def check_model_params(model): for name,param in model.named_parameters(): if not check_tensor_inf_nan(param): print("model param:", name, " contains nan/inf!") - import pdb;pdb.set_trace() + # import pdb;pdb.set_trace() return False # register_forward_hook @@ -35,7 +35,7 @@ def check_value(module, args, output): if not check_tensors(args): print(module_name, "input contains nan/inf!") if not check_tensors(output): - import pdb;pdb.set_trace() + # import pdb;pdb.set_trace() print(module_name, "output contains nan/inf!") @@ -48,14 +48,16 @@ def check_value(module, grad_input, grad_output): print(module_name, "grad_output contains nan/inf!") if not check_tensors(grad_input): print(module_name, "grad_input contains nan/inf!") - import pdb;pdb.set_trace() + # import pdb;pdb.set_trace() return check_value # detect param update nan -# usage example: - # fwd_hooks = {} - # bwd_hooks = {} - # for name, module in model.named_modules(): - # fwd_hooks[name] = module.register_forward_hook(fwd_hook_wrapper(name)) - # bwd_hooks[name] = module.register_forward_hook(bwd_hook_wrapper(name)) \ No newline at end of file +def register_abnormal_val_hooks(model): + fwd_hooks = {} + bwd_hooks = {} + for name, module in model.named_modules(): + fwd_hooks[name] = module.register_forward_hook(fwd_hook_wrapper(name)) + bwd_hooks[name] = module.register_forward_hook(bwd_hook_wrapper(name)) + return fwd_hooks, bwd_hooks + From 61960122b807a6faf728219ea8293a00541a761c Mon Sep 17 00:00:00 2001 From: kimmishi Date: Wed, 10 Jan 2024 19:50:16 +0800 Subject: [PATCH 3/6] update debug nan to monitor value increase --- torchdistpackage/__init__.py | 2 +- torchdistpackage/tools/debug_nan.py | 41 ++++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/torchdistpackage/__init__.py b/torchdistpackage/__init__.py index 2d28608..dde2933 100644 --- a/torchdistpackage/__init__.py +++ b/torchdistpackage/__init__.py @@ -18,4 +18,4 @@ from .tools.module_replace import replace_all_module from .tools.bnb_fc import replace_linear_by_bnb from .tools.bminf_int8 import replace_linear_by_bminf -from .tools.debug_nan import register_abnormal_val_hooks,check_model_params \ No newline at end of file +from .tools.debug_nan import register_debug_nan_hooks,check_model_params \ No newline at end of file diff --git a/torchdistpackage/tools/debug_nan.py b/torchdistpackage/tools/debug_nan.py index e478738..1c17621 100644 --- a/torchdistpackage/tools/debug_nan.py +++ b/torchdistpackage/tools/debug_nan.py @@ -22,11 +22,38 @@ def check_tensors(inputs): def check_model_params(model): + ret=True for name,param in model.named_parameters(): if not check_tensor_inf_nan(param): print("model param:", name, " contains nan/inf!") # import pdb;pdb.set_trace() - return False + ret = False + return ret + +def print_tensor_info(module_name, output, msg): + if isinstance(output, torch.Tensor): + print(module_name, msg, output.max()) + elif isinstance(output, tuple) or isinstance(output, list): + for i, t in enumerate(output): + print_tensor_info(module_name, t, msg) + # print(module_name, f"output_{i}_max", t.max()) + +@torch.no_grad +def get_max(output): + max=0 + if isinstance(output, torch.Tensor): + return output.max().item() + elif isinstance(output, tuple) or isinstance(output, list): + for i, t in enumerate(output): + if get_max(t)>max: + max = get_max(t) + return max + +def find_big_change(module_name, args, output): + out_max = get_max(output) + in_max = get_max(args) + if in_max!=0 and out_max / in_max >= 2 and out_max>1000: + print(module_name, out_max, in_max) # register_forward_hook @torch.no_grad() @@ -35,17 +62,23 @@ def check_value(module, args, output): if not check_tensors(args): print(module_name, "input contains nan/inf!") if not check_tensors(output): - # import pdb;pdb.set_trace() + import pdb;pdb.set_trace() print(module_name, "output contains nan/inf!") + # if 'conv_shortcut' in module_name: + # print_tensor_info(module_name, args, "input_max") + # print_tensor_info(module_name, output, "output_max") + find_big_change(module_name, args, output) return check_value + # model.register_full_backward_hook def bwd_hook_wrapper(module_name=''): def check_value(module, grad_input, grad_output): if not check_tensors(grad_output): print(module_name, "grad_output contains nan/inf!") + import pdb;pdb.set_trace() if not check_tensors(grad_input): print(module_name, "grad_input contains nan/inf!") # import pdb;pdb.set_trace() @@ -53,11 +86,11 @@ def check_value(module, grad_input, grad_output): # detect param update nan -def register_abnormal_val_hooks(model): +def register_debug_nan_hooks(model): fwd_hooks = {} bwd_hooks = {} for name, module in model.named_modules(): fwd_hooks[name] = module.register_forward_hook(fwd_hook_wrapper(name)) - bwd_hooks[name] = module.register_forward_hook(bwd_hook_wrapper(name)) + # bwd_hooks[name] = module.register_full_backward_hook(bwd_hook_wrapper(name)) return fwd_hooks, bwd_hooks From 4b52853101c34b483beec2165b6d977acb457ba8 Mon Sep 17 00:00:00 2001 From: kimmishi Date: Mon, 15 Apr 2024 09:47:02 +0800 Subject: [PATCH 4/6] update test files --- examples/model_parallel/test_transformer.py | 14 +- explore/memory-profile/attention_processor.py | 1611 +++++++++++++++++ explore/memory-profile/transblock.py | 731 ++++++++ torchdistpackage/dist/launch_from_slurm.py | 2 + .../parallel/tensor_parallel/attn.py | 10 +- .../parallel/tensor_parallel/transformer.py | 10 +- 6 files changed, 2365 insertions(+), 13 deletions(-) create mode 100644 explore/memory-profile/attention_processor.py create mode 100644 explore/memory-profile/transblock.py diff --git a/examples/model_parallel/test_transformer.py b/examples/model_parallel/test_transformer.py index 9bb3d90..b70019a 100644 --- a/examples/model_parallel/test_transformer.py +++ b/examples/model_parallel/test_transformer.py @@ -10,10 +10,10 @@ fix_rand() -def test_model(dim, depth, nh=2, dtype=torch.float): +def test_model(dim, depth, nh=2, dtype=torch.float, seqlen=1024, b=1): model = Transformer(dim, depth=depth, num_heads=nh, tensor_parallel=False, sequence_parallel=False).cuda().to(dtype) - # tp_model = Transformer(dim, depth=depth, num_heads=nh, tensor_parallel=True, sequence_parallel=False).cuda().to(dtype) - tp_model = Transformer(dim, depth=depth, num_heads=nh, tensor_parallel=True, sequence_parallel=True).cuda().to(dtype) + tp_model = Transformer(dim, depth=depth, num_heads=nh, tensor_parallel=True, sequence_parallel=False).cuda().to(dtype) + # tp_model = Transformer(dim, depth=depth, num_heads=nh, tensor_parallel=True, sequence_parallel=True).cuda().to(dtype) opt = torch.optim.AdamW(model.parameters()) @@ -24,13 +24,14 @@ def test_model(dim, depth, nh=2, dtype=torch.float): # sp_model.blocks[ind].init_from_full(model.blocks[ind]) for _ in range(10): - inp = torch.rand((32, 1024, dim)).cuda().to(dtype) + inp = torch.rand((b, seqlen, dim)).cuda().to(dtype) opt.zero_grad() out = model(inp) tp_out = tp_model(inp) - assert torch.allclose(out, tp_out, rtol=1e-1, atol=1e-02) + # import pdb;pdb.set_trace() + assert torch.allclose(out, tp_out, rtol=1e-3, atol=1e-02) import pdb;pdb.set_trace() # TODO: fix this mis alignment @@ -43,4 +44,5 @@ def test_model(dim, depth, nh=2, dtype=torch.float): opt.step() tp_opt.step() -test_model(1024, 8) \ No newline at end of file +# test_model(1024, 8) +test_model(4, 2, seqlen=5) \ No newline at end of file diff --git a/explore/memory-profile/attention_processor.py b/explore/memory-profile/attention_processor.py new file mode 100644 index 0000000..267c46e --- /dev/null +++ b/explore/memory-profile/attention_processor.py @@ -0,0 +1,1611 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from importlib import import_module +from typing import Callable, Optional, Union + +import torch +import einops +import torch.nn.functional as F +from torch import nn + + +from diffusers.utils import USE_PEFT_BACKEND, deprecate, logging +from diffusers.utils.import_utils import is_xformers_available +from diffusers.utils.torch_utils import maybe_allow_in_graph +from diffusers.models.lora import LoRACompatibleLinear, LoRALinearLayer + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +if is_xformers_available(): + import xformers + import xformers.ops +else: + xformers = None + + +@maybe_allow_in_graph +class Attention(nn.Module): + r""" + A cross attention layer. + + Parameters: + query_dim (`int`): + The number of channels in the query. + cross_attention_dim (`int`, *optional*): + The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`. + heads (`int`, *optional*, defaults to 8): + The number of heads to use for multi-head attention. + dim_head (`int`, *optional*, defaults to 64): + The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + bias (`bool`, *optional*, defaults to False): + Set to `True` for the query, key, and value linear layers to contain a bias parameter. + upcast_attention (`bool`, *optional*, defaults to False): + Set to `True` to upcast the attention computation to `float32`. + upcast_softmax (`bool`, *optional*, defaults to False): + Set to `True` to upcast the softmax computation to `float32`. + cross_attention_norm (`str`, *optional*, defaults to `None`): + The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`. + cross_attention_norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use for the group norm in the cross attention. + added_kv_proj_dim (`int`, *optional*, defaults to `None`): + The number of channels to use for the added key and value projections. If `None`, no projection is used. + norm_num_groups (`int`, *optional*, defaults to `None`): + The number of groups to use for the group norm in the attention. + spatial_norm_dim (`int`, *optional*, defaults to `None`): + The number of channels to use for the spatial normalization. + out_bias (`bool`, *optional*, defaults to `True`): + Set to `True` to use a bias in the output linear layer. + scale_qk (`bool`, *optional*, defaults to `True`): + Set to `True` to scale the query and key by `1 / sqrt(dim_head)`. + only_cross_attention (`bool`, *optional*, defaults to `False`): + Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if + `added_kv_proj_dim` is not `None`. + eps (`float`, *optional*, defaults to 1e-5): + An additional value added to the denominator in group normalization that is used for numerical stability. + rescale_output_factor (`float`, *optional*, defaults to 1.0): + A factor to rescale the output by dividing it with this value. + residual_connection (`bool`, *optional*, defaults to `False`): + Set to `True` to add the residual connection to the output. + _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`): + Set to `True` if the attention block is loaded from a deprecated state dict. + processor (`AttnProcessor`, *optional*, defaults to `None`): + The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and + `AttnProcessor` otherwise. + """ + + def __init__( + self, + query_dim: int, + cross_attention_dim: Optional[int] = None, + heads: int = 8, + dim_head: int = 64, + dropout: float = 0.0, + bias: bool = False, + upcast_attention: bool = False, + upcast_softmax: bool = False, + cross_attention_norm: Optional[str] = None, + cross_attention_norm_num_groups: int = 32, + added_kv_proj_dim: Optional[int] = None, + norm_num_groups: Optional[int] = None, + spatial_norm_dim: Optional[int] = None, + out_bias: bool = True, + scale_qk: bool = True, + only_cross_attention: bool = False, + eps: float = 1e-5, + rescale_output_factor: float = 1.0, + residual_connection: bool = False, + _from_deprecated_attn_block: bool = False, + processor: Optional["AttnProcessor"] = None, + ): + super().__init__() + self.inner_dim = dim_head * heads + self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim + self.upcast_attention = upcast_attention + self.upcast_softmax = upcast_softmax + self.rescale_output_factor = rescale_output_factor + self.residual_connection = residual_connection + self.dropout = dropout + + # we make use of this private variable to know whether this class is loaded + # with an deprecated state dict so that we can convert it on the fly + self._from_deprecated_attn_block = _from_deprecated_attn_block + + self.scale_qk = scale_qk + self.scale = dim_head**-0.5 if self.scale_qk else 1.0 + + self.heads = heads + # for slice_size > 0 the attention score computation + # is split across the batch axis to save memory + # You can set slice_size with `set_attention_slice` + self.sliceable_head_dim = heads + + self.added_kv_proj_dim = added_kv_proj_dim + self.only_cross_attention = only_cross_attention + + if self.added_kv_proj_dim is None and self.only_cross_attention: + raise ValueError( + "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`." + ) + + if norm_num_groups is not None: + self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True) + else: + self.group_norm = None + + if spatial_norm_dim is not None: + self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim) + else: + self.spatial_norm = None + + if cross_attention_norm is None: + self.norm_cross = None + elif cross_attention_norm == "layer_norm": + self.norm_cross = nn.LayerNorm(self.cross_attention_dim) + elif cross_attention_norm == "group_norm": + if self.added_kv_proj_dim is not None: + # The given `encoder_hidden_states` are initially of shape + # (batch_size, seq_len, added_kv_proj_dim) before being projected + # to (batch_size, seq_len, cross_attention_dim). The norm is applied + # before the projection, so we need to use `added_kv_proj_dim` as + # the number of channels for the group norm. + norm_cross_num_channels = added_kv_proj_dim + else: + norm_cross_num_channels = self.cross_attention_dim + + self.norm_cross = nn.GroupNorm( + num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True + ) + else: + raise ValueError( + f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'" + ) + + if USE_PEFT_BACKEND: + linear_cls = nn.Linear + else: + linear_cls = LoRACompatibleLinear + + self.to_q = linear_cls(query_dim, self.inner_dim, bias=bias) + + if not self.only_cross_attention: + # only relevant for the `AddedKVProcessor` classes + self.to_k = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias) + self.to_v = linear_cls(self.cross_attention_dim, self.inner_dim, bias=bias) + else: + self.to_k = None + self.to_v = None + + if self.added_kv_proj_dim is not None: + self.add_k_proj = linear_cls(added_kv_proj_dim, self.inner_dim) + self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim) + + self.to_out = nn.ModuleList([]) + self.to_out.append(linear_cls(self.inner_dim, query_dim, bias=out_bias)) + self.to_out.append(nn.Dropout(dropout)) + + # set attention processor + # We use the AttnProcessor2_0 by default when torch 2.x is used which uses + # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention + # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1 + if processor is None: + processor = ( + AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor() + ) + self.set_processor(processor) + + + def set_processor(self, processor: "AttnProcessor", _remove_lora: bool = False) -> None: + r""" + Set the attention processor to use. + + Args: + processor (`AttnProcessor`): + The attention processor to use. + _remove_lora (`bool`, *optional*, defaults to `False`): + Set to `True` to remove LoRA layers from the model. + """ + if not USE_PEFT_BACKEND and hasattr(self, "processor") and _remove_lora and self.to_q.lora_layer is not None: + deprecate( + "set_processor to offload LoRA", + "0.26.0", + "In detail, removing LoRA layers via calling `set_default_attn_processor` is deprecated. Please make sure to call `pipe.unload_lora_weights()` instead.", + ) + # TODO(Patrick, Sayak) - this can be deprecated once PEFT LoRA integration is complete + # We need to remove all LoRA layers + # Don't forget to remove ALL `_remove_lora` from the codebase + for module in self.modules(): + if hasattr(module, "set_lora_layer"): + module.set_lora_layer(None) + + # if current processor is in `self._modules` and if passed `processor` is not, we need to + # pop `processor` from `self._modules` + if ( + hasattr(self, "processor") + and isinstance(self.processor, torch.nn.Module) + and not isinstance(processor, torch.nn.Module) + ): + logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}") + self._modules.pop("processor") + + self.processor = processor + + def forward( + self, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + **cross_attention_kwargs, + ) -> torch.Tensor: + r""" + The forward method of the `Attention` class. + + Args: + hidden_states (`torch.Tensor`): + The hidden states of the query. + encoder_hidden_states (`torch.Tensor`, *optional*): + The hidden states of the encoder. + attention_mask (`torch.Tensor`, *optional*): + The attention mask to use. If `None`, no mask is applied. + **cross_attention_kwargs: + Additional keyword arguments to pass along to the cross attention. + + Returns: + `torch.Tensor`: The output of the attention layer. + """ + # The `Attention` class can call different attention processors / attention functions + # here we simply pass along all tensors to the selected processor class + # For standard processors that are defined here, `**cross_attention_kwargs` is empty + return self.processor( + self, + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + +class AttnProcessor: + r""" + Default processor for performing attention-related computations. + """ + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.Tensor: + residual = hidden_states + + args = () if USE_PEFT_BACKEND else (scale,) + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states, *args) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states, *args) + value = attn.to_v(encoder_hidden_states, *args) + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class CustomDiffusionAttnProcessor(nn.Module): + r""" + Processor for implementing attention for the Custom Diffusion method. + + Args: + train_kv (`bool`, defaults to `True`): + Whether to newly train the key and value matrices corresponding to the text features. + train_q_out (`bool`, defaults to `True`): + Whether to newly train query matrices corresponding to the latent image features. + hidden_size (`int`, *optional*, defaults to `None`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + out_bias (`bool`, defaults to `True`): + Whether to include the bias parameter in `train_q_out`. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + """ + + def __init__( + self, + train_kv: bool = True, + train_q_out: bool = True, + hidden_size: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + out_bias: bool = True, + dropout: float = 0.0, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + + # `_custom_diffusion` id for easy serialization and loading. + if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states).to(attn.to_q.weight.dtype) + else: + query = attn.to_q(hidden_states.to(attn.to_q.weight.dtype)) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype)) + value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype)) + key = key.to(attn.to_q.weight.dtype) + value = value.to(attn.to_q.weight.dtype) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + query = attn.head_to_batch_dim(query) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +class AttnAddedKVProcessor: + r""" + Processor for performing attention-related computations with extra learnable key and value matrices for the text + encoder. + """ + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.Tensor: + residual = hidden_states + + args = () if USE_PEFT_BACKEND else (scale,) + + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states, *args) + query = attn.head_to_batch_dim(query) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states, *args) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states, *args) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states, *args) + value = attn.to_v(hidden_states, *args) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=1) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=1) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + attention_probs = attn.get_attention_scores(query, key, attention_mask) + hidden_states = torch.bmm(attention_probs, value) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class AttnAddedKVProcessor2_0: + r""" + Processor for performing scaled dot-product attention (enabled by default if you're using PyTorch 2.0), with extra + learnable key and value matrices for the text encoder. + """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError( + "AttnAddedKVProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0." + ) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.Tensor: + residual = hidden_states + + args = () if USE_PEFT_BACKEND else (scale,) + + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size, out_dim=4) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states, *args) + query = attn.head_to_batch_dim(query, out_dim=4) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj, out_dim=4) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj, out_dim=4) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states, *args) + value = attn.to_v(hidden_states, *args) + key = attn.head_to_batch_dim(key, out_dim=4) + value = attn.head_to_batch_dim(value, out_dim=4) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=2) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=2) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, residual.shape[1]) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class XFormersAttnAddedKVProcessor: + r""" + Processor for implementing memory efficient attention using xFormers. + + Args: + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. + """ + + def __init__(self, attention_op: Optional[Callable] = None): + self.attention_op = attention_op + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + residual = hidden_states + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + query = attn.head_to_batch_dim(query) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=1) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=1) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + hidden_states = xformers.ops.memory_efficient_attention( + query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale + ) + hidden_states = hidden_states.to(query.dtype) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class XFormersAttnProcessor: + r""" + Processor for implementing memory efficient attention using xFormers. + + Args: + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. + """ + + def __init__(self, attention_op: Optional[Callable] = None): + self.attention_op = attention_op + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.FloatTensor: + residual = hidden_states + + args = () if USE_PEFT_BACKEND else (scale,) + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, key_tokens, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size) + if attention_mask is not None: + # expand our mask's singleton query_tokens dimension: + # [batch*heads, 1, key_tokens] -> + # [batch*heads, query_tokens, key_tokens] + # so that it can be added as a bias onto the attention scores that xformers computes: + # [batch*heads, query_tokens, key_tokens] + # we do this explicitly because xformers doesn't broadcast the singleton dimension for us. + _, query_tokens, _ = hidden_states.shape + attention_mask = attention_mask.expand(-1, query_tokens, -1) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states, *args) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states, *args) + value = attn.to_v(encoder_hidden_states, *args) + + query = attn.head_to_batch_dim(query).contiguous() + key = attn.head_to_batch_dim(key).contiguous() + value = attn.head_to_batch_dim(value).contiguous() + + hidden_states = xformers.ops.memory_efficient_attention( + query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale + ) + hidden_states = hidden_states.to(query.dtype) + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class AttnProcessor2_0: + r""" + Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). + """ + + def __init__(self): + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + ) -> torch.FloatTensor: + residual = hidden_states + + args = () if USE_PEFT_BACKEND else (scale,) + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + if attention_mask is not None: + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + # scaled_dot_product_attention expects attention_mask shape to be + # (batch, heads, source_length, target_length) + attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + args = () if USE_PEFT_BACKEND else (scale,) + query = attn.to_q(hidden_states, *args) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states, *args) + value = attn.to_v(encoder_hidden_states, *args) + + inner_dim = key.shape[-1] + head_dim = inner_dim // attn.heads + + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + + # # b: batch_size; h: attention_head; s: source_length; t: target_lenth + # attention_mask = einops.repeat(attention_mask, 'b h s t -> b h (s f) t', f=19).contiguous() + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + # linear proj + hidden_states = attn.to_out[0](hidden_states, *args) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class CustomDiffusionXFormersAttnProcessor(nn.Module): + r""" + Processor for implementing memory efficient attention using xFormers for the Custom Diffusion method. + + Args: + train_kv (`bool`, defaults to `True`): + Whether to newly train the key and value matrices corresponding to the text features. + train_q_out (`bool`, defaults to `True`): + Whether to newly train query matrices corresponding to the latent image features. + hidden_size (`int`, *optional*, defaults to `None`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + out_bias (`bool`, defaults to `True`): + Whether to include the bias parameter in `train_q_out`. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to use + as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best operator. + """ + + def __init__( + self, + train_kv: bool = True, + train_q_out: bool = False, + hidden_size: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + out_bias: bool = True, + dropout: float = 0.0, + attention_op: Optional[Callable] = None, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.attention_op = attention_op + + # `_custom_diffusion` id for easy serialization and loading. + if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states).to(attn.to_q.weight.dtype) + else: + query = attn.to_q(hidden_states.to(attn.to_q.weight.dtype)) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype)) + value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype)) + key = key.to(attn.to_q.weight.dtype) + value = value.to(attn.to_q.weight.dtype) + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + query = attn.head_to_batch_dim(query).contiguous() + key = attn.head_to_batch_dim(key).contiguous() + value = attn.head_to_batch_dim(value).contiguous() + + hidden_states = xformers.ops.memory_efficient_attention( + query, key, value, attn_bias=attention_mask, op=self.attention_op, scale=attn.scale + ) + hidden_states = hidden_states.to(query.dtype) + hidden_states = attn.batch_to_head_dim(hidden_states) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +class CustomDiffusionAttnProcessor2_0(nn.Module): + r""" + Processor for implementing attention for the Custom Diffusion method using PyTorch 2.0’s memory-efficient scaled + dot-product attention. + + Args: + train_kv (`bool`, defaults to `True`): + Whether to newly train the key and value matrices corresponding to the text features. + train_q_out (`bool`, defaults to `True`): + Whether to newly train query matrices corresponding to the latent image features. + hidden_size (`int`, *optional*, defaults to `None`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + out_bias (`bool`, defaults to `True`): + Whether to include the bias parameter in `train_q_out`. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + """ + + def __init__( + self, + train_kv: bool = True, + train_q_out: bool = True, + hidden_size: Optional[int] = None, + cross_attention_dim: Optional[int] = None, + out_bias: bool = True, + dropout: float = 0.0, + ): + super().__init__() + self.train_kv = train_kv + self.train_q_out = train_q_out + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + + # `_custom_diffusion` id for easy serialization and loading. + if self.train_kv: + self.to_k_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + self.to_v_custom_diffusion = nn.Linear(cross_attention_dim or hidden_size, hidden_size, bias=False) + if self.train_q_out: + self.to_q_custom_diffusion = nn.Linear(hidden_size, hidden_size, bias=False) + self.to_out_custom_diffusion = nn.ModuleList([]) + self.to_out_custom_diffusion.append(nn.Linear(hidden_size, hidden_size, bias=out_bias)) + self.to_out_custom_diffusion.append(nn.Dropout(dropout)) + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + batch_size, sequence_length, _ = hidden_states.shape + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + if self.train_q_out: + query = self.to_q_custom_diffusion(hidden_states) + else: + query = attn.to_q(hidden_states) + + if encoder_hidden_states is None: + crossattn = False + encoder_hidden_states = hidden_states + else: + crossattn = True + if attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + if self.train_kv: + key = self.to_k_custom_diffusion(encoder_hidden_states.to(self.to_k_custom_diffusion.weight.dtype)) + value = self.to_v_custom_diffusion(encoder_hidden_states.to(self.to_v_custom_diffusion.weight.dtype)) + key = key.to(attn.to_q.weight.dtype) + value = value.to(attn.to_q.weight.dtype) + + else: + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + + if crossattn: + detach = torch.ones_like(key) + detach[:, :1, :] = detach[:, :1, :] * 0.0 + key = detach * key + (1 - detach) * key.detach() + value = detach * value + (1 - detach) * value.detach() + + inner_dim = hidden_states.shape[-1] + + head_dim = inner_dim // attn.heads + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) + + if self.train_q_out: + # linear proj + hidden_states = self.to_out_custom_diffusion[0](hidden_states) + # dropout + hidden_states = self.to_out_custom_diffusion[1](hidden_states) + else: + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + return hidden_states + + +class SlicedAttnProcessor: + r""" + Processor for implementing sliced attention. + + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. + """ + + def __init__(self, slice_size: int): + self.slice_size = slice_size + + def __call__( + self, + attn: Attention, + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + residual = hidden_states + + input_ndim = hidden_states.ndim + + if input_ndim == 4: + batch_size, channel, height, width = hidden_states.shape + hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2) + + batch_size, sequence_length, _ = ( + hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape + ) + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if attn.group_norm is not None: + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + dim = query.shape[-1] + query = attn.head_to_batch_dim(query) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + key = attn.to_k(encoder_hidden_states) + value = attn.to_v(encoder_hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + + batch_size_attention, query_tokens, _ = query.shape + hidden_states = torch.zeros( + (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype + ) + + for i in range(batch_size_attention // self.slice_size): + start_idx = i * self.slice_size + end_idx = (i + 1) * self.slice_size + + query_slice = query[start_idx:end_idx] + key_slice = key[start_idx:end_idx] + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None + + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) + + attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx]) + + hidden_states[start_idx:end_idx] = attn_slice + + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + if input_ndim == 4: + hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width) + + if attn.residual_connection: + hidden_states = hidden_states + residual + + hidden_states = hidden_states / attn.rescale_output_factor + + return hidden_states + + +class SlicedAttnAddedKVProcessor: + r""" + Processor for implementing sliced attention with extra learnable key and value matrices for the text encoder. + + Args: + slice_size (`int`, *optional*): + The number of steps to compute attention. Uses as many slices as `attention_head_dim // slice_size`, and + `attention_head_dim` must be a multiple of the `slice_size`. + """ + + def __init__(self, slice_size): + self.slice_size = slice_size + + def __call__( + self, + attn: "Attention", + hidden_states: torch.FloatTensor, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + temb: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + residual = hidden_states + + if attn.spatial_norm is not None: + hidden_states = attn.spatial_norm(hidden_states, temb) + + hidden_states = hidden_states.view(hidden_states.shape[0], hidden_states.shape[1], -1).transpose(1, 2) + + batch_size, sequence_length, _ = hidden_states.shape + + attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) + + if encoder_hidden_states is None: + encoder_hidden_states = hidden_states + elif attn.norm_cross: + encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states) + + hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) + + query = attn.to_q(hidden_states) + dim = query.shape[-1] + query = attn.head_to_batch_dim(query) + + encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states) + encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states) + + encoder_hidden_states_key_proj = attn.head_to_batch_dim(encoder_hidden_states_key_proj) + encoder_hidden_states_value_proj = attn.head_to_batch_dim(encoder_hidden_states_value_proj) + + if not attn.only_cross_attention: + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) + key = attn.head_to_batch_dim(key) + value = attn.head_to_batch_dim(value) + key = torch.cat([encoder_hidden_states_key_proj, key], dim=1) + value = torch.cat([encoder_hidden_states_value_proj, value], dim=1) + else: + key = encoder_hidden_states_key_proj + value = encoder_hidden_states_value_proj + + batch_size_attention, query_tokens, _ = query.shape + hidden_states = torch.zeros( + (batch_size_attention, query_tokens, dim // attn.heads), device=query.device, dtype=query.dtype + ) + + for i in range(batch_size_attention // self.slice_size): + start_idx = i * self.slice_size + end_idx = (i + 1) * self.slice_size + + query_slice = query[start_idx:end_idx] + key_slice = key[start_idx:end_idx] + attn_mask_slice = attention_mask[start_idx:end_idx] if attention_mask is not None else None + + attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice) + + attn_slice = torch.bmm(attn_slice, value[start_idx:end_idx]) + + hidden_states[start_idx:end_idx] = attn_slice + + hidden_states = attn.batch_to_head_dim(hidden_states) + + # linear proj + hidden_states = attn.to_out[0](hidden_states) + # dropout + hidden_states = attn.to_out[1](hidden_states) + + hidden_states = hidden_states.transpose(-1, -2).reshape(residual.shape) + hidden_states = hidden_states + residual + + return hidden_states + + +class SpatialNorm(nn.Module): + """ + Spatially conditioned normalization as defined in https://arxiv.org/abs/2209.09002. + + Args: + f_channels (`int`): + The number of channels for input to group normalization layer, and output of the spatial norm layer. + zq_channels (`int`): + The number of channels for the quantized vector as described in the paper. + """ + + def __init__( + self, + f_channels: int, + zq_channels: int, + ): + super().__init__() + self.norm_layer = nn.GroupNorm(num_channels=f_channels, num_groups=32, eps=1e-6, affine=True) + self.conv_y = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + self.conv_b = nn.Conv2d(zq_channels, f_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, f: torch.FloatTensor, zq: torch.FloatTensor) -> torch.FloatTensor: + f_size = f.shape[-2:] + zq = F.interpolate(zq, size=f_size, mode="nearest") + norm_f = self.norm_layer(f) + new_f = norm_f * self.conv_y(zq) + self.conv_b(zq) + return new_f + + +## Deprecated +class LoRAAttnProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + kwargs (`dict`): + Additional keyword arguments to pass to the `LoRALinearLayer` layers. + """ + + def __init__( + self, + hidden_size: int, + cross_attention_dim: Optional[int] = None, + rank: int = 4, + network_alpha: Optional[int] = None, + **kwargs, + ): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + + q_rank = kwargs.pop("q_rank", None) + q_hidden_size = kwargs.pop("q_hidden_size", None) + q_rank = q_rank if q_rank is not None else rank + q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size + + v_rank = kwargs.pop("v_rank", None) + v_hidden_size = kwargs.pop("v_hidden_size", None) + v_rank = v_rank if v_rank is not None else rank + v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size + + out_rank = kwargs.pop("out_rank", None) + out_hidden_size = kwargs.pop("out_hidden_size", None) + out_rank = out_rank if out_rank is not None else rank + out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size + + self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) + self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) + + def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + self_cls_name = self.__class__.__name__ + deprecate( + self_cls_name, + "0.26.0", + ( + f"Make sure use {self_cls_name[4:]} instead by setting" + "LoRA layers to `self.{to_q,to_k,to_v,to_out[0]}.lora_layer` respectively. This will be done automatically when using" + " `LoraLoaderMixin.load_lora_weights`" + ), + ) + attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device) + attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device) + attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device) + attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device) + + attn._modules.pop("processor") + attn.processor = AttnProcessor() + return attn.processor(attn, hidden_states, *args, **kwargs) + + +class LoRAAttnProcessor2_0(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism using PyTorch 2.0's memory-efficient scaled dot-product + attention. + + Args: + hidden_size (`int`): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + kwargs (`dict`): + Additional keyword arguments to pass to the `LoRALinearLayer` layers. + """ + + def __init__( + self, + hidden_size: int, + cross_attention_dim: Optional[int] = None, + rank: int = 4, + network_alpha: Optional[int] = None, + **kwargs, + ): + super().__init__() + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.") + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + + q_rank = kwargs.pop("q_rank", None) + q_hidden_size = kwargs.pop("q_hidden_size", None) + q_rank = q_rank if q_rank is not None else rank + q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size + + v_rank = kwargs.pop("v_rank", None) + v_hidden_size = kwargs.pop("v_hidden_size", None) + v_rank = v_rank if v_rank is not None else rank + v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size + + out_rank = kwargs.pop("out_rank", None) + out_hidden_size = kwargs.pop("out_hidden_size", None) + out_rank = out_rank if out_rank is not None else rank + out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size + + self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) + self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) + + def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + self_cls_name = self.__class__.__name__ + deprecate( + self_cls_name, + "0.26.0", + ( + f"Make sure use {self_cls_name[4:]} instead by setting" + "LoRA layers to `self.{to_q,to_k,to_v,to_out[0]}.lora_layer` respectively. This will be done automatically when using" + " `LoraLoaderMixin.load_lora_weights`" + ), + ) + attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device) + attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device) + attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device) + attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device) + + attn._modules.pop("processor") + attn.processor = AttnProcessor2_0() + return attn.processor(attn, hidden_states, *args, **kwargs) + + +class LoRAXFormersAttnProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism with memory efficient attention using xFormers. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + attention_op (`Callable`, *optional*, defaults to `None`): + The base + [operator](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.AttentionOpBase) to + use as the attention operator. It is recommended to set to `None`, and allow xFormers to choose the best + operator. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + kwargs (`dict`): + Additional keyword arguments to pass to the `LoRALinearLayer` layers. + """ + + def __init__( + self, + hidden_size: int, + cross_attention_dim: int, + rank: int = 4, + attention_op: Optional[Callable] = None, + network_alpha: Optional[int] = None, + **kwargs, + ): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + self.attention_op = attention_op + + q_rank = kwargs.pop("q_rank", None) + q_hidden_size = kwargs.pop("q_hidden_size", None) + q_rank = q_rank if q_rank is not None else rank + q_hidden_size = q_hidden_size if q_hidden_size is not None else hidden_size + + v_rank = kwargs.pop("v_rank", None) + v_hidden_size = kwargs.pop("v_hidden_size", None) + v_rank = v_rank if v_rank is not None else rank + v_hidden_size = v_hidden_size if v_hidden_size is not None else hidden_size + + out_rank = kwargs.pop("out_rank", None) + out_hidden_size = kwargs.pop("out_hidden_size", None) + out_rank = out_rank if out_rank is not None else rank + out_hidden_size = out_hidden_size if out_hidden_size is not None else hidden_size + + self.to_q_lora = LoRALinearLayer(q_hidden_size, q_hidden_size, q_rank, network_alpha) + self.to_k_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(cross_attention_dim or v_hidden_size, v_hidden_size, v_rank, network_alpha) + self.to_out_lora = LoRALinearLayer(out_hidden_size, out_hidden_size, out_rank, network_alpha) + + def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + self_cls_name = self.__class__.__name__ + deprecate( + self_cls_name, + "0.26.0", + ( + f"Make sure use {self_cls_name[4:]} instead by setting" + "LoRA layers to `self.{to_q,to_k,to_v,add_k_proj,add_v_proj,to_out[0]}.lora_layer` respectively. This will be done automatically when using" + " `LoraLoaderMixin.load_lora_weights`" + ), + ) + attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device) + attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device) + attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device) + attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device) + + attn._modules.pop("processor") + attn.processor = XFormersAttnProcessor() + return attn.processor(attn, hidden_states, *args, **kwargs) + + +class LoRAAttnAddedKVProcessor(nn.Module): + r""" + Processor for implementing the LoRA attention mechanism with extra learnable key and value matrices for the text + encoder. + + Args: + hidden_size (`int`, *optional*): + The hidden size of the attention layer. + cross_attention_dim (`int`, *optional*, defaults to `None`): + The number of channels in the `encoder_hidden_states`. + rank (`int`, defaults to 4): + The dimension of the LoRA update matrices. + network_alpha (`int`, *optional*): + Equivalent to `alpha` but it's usage is specific to Kohya (A1111) style LoRAs. + kwargs (`dict`): + Additional keyword arguments to pass to the `LoRALinearLayer` layers. + """ + + def __init__( + self, + hidden_size: int, + cross_attention_dim: Optional[int] = None, + rank: int = 4, + network_alpha: Optional[int] = None, + ): + super().__init__() + + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.rank = rank + + self.to_q_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.add_k_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.add_v_proj_lora = LoRALinearLayer(cross_attention_dim or hidden_size, hidden_size, rank, network_alpha) + self.to_k_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_v_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + self.to_out_lora = LoRALinearLayer(hidden_size, hidden_size, rank, network_alpha) + + def __call__(self, attn: Attention, hidden_states: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor: + self_cls_name = self.__class__.__name__ + deprecate( + self_cls_name, + "0.26.0", + ( + f"Make sure use {self_cls_name[4:]} instead by setting" + "LoRA layers to `self.{to_q,to_k,to_v,add_k_proj,add_v_proj,to_out[0]}.lora_layer` respectively. This will be done automatically when using" + " `LoraLoaderMixin.load_lora_weights`" + ), + ) + attn.to_q.lora_layer = self.to_q_lora.to(hidden_states.device) + attn.to_k.lora_layer = self.to_k_lora.to(hidden_states.device) + attn.to_v.lora_layer = self.to_v_lora.to(hidden_states.device) + attn.to_out[0].lora_layer = self.to_out_lora.to(hidden_states.device) + + attn._modules.pop("processor") + attn.processor = AttnAddedKVProcessor() + return attn.processor(attn, hidden_states, *args, **kwargs) + + +LORA_ATTENTION_PROCESSORS = ( + LoRAAttnProcessor, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + LoRAAttnAddedKVProcessor, +) + +ADDED_KV_ATTENTION_PROCESSORS = ( + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + XFormersAttnAddedKVProcessor, + LoRAAttnAddedKVProcessor, +) + +CROSS_ATTENTION_PROCESSORS = ( + AttnProcessor, + AttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + LoRAAttnProcessor, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, +) + +AttentionProcessor = Union[ + AttnProcessor, + AttnProcessor2_0, + XFormersAttnProcessor, + SlicedAttnProcessor, + AttnAddedKVProcessor, + SlicedAttnAddedKVProcessor, + AttnAddedKVProcessor2_0, + XFormersAttnAddedKVProcessor, + CustomDiffusionAttnProcessor, + CustomDiffusionXFormersAttnProcessor, + CustomDiffusionAttnProcessor2_0, + # deprecated + LoRAAttnProcessor, + LoRAAttnProcessor2_0, + LoRAXFormersAttnProcessor, + LoRAAttnAddedKVProcessor, +] diff --git a/explore/memory-profile/transblock.py b/explore/memory-profile/transblock.py new file mode 100644 index 0000000..e8a8c98 --- /dev/null +++ b/explore/memory-profile/transblock.py @@ -0,0 +1,731 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Dict, Optional + +import torch +from torch import nn + +from diffusers.utils import USE_PEFT_BACKEND +from diffusers.utils.torch_utils import maybe_allow_in_graph +from diffusers.models.activations import GEGLU, GELU, ApproximateGELU +from diffusers.models.embeddings import SinusoidalPositionalEmbedding +from diffusers.models.lora import LoRACompatibleLinear +from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormZero + +try: + from attention_processor import Attention +except: + from .attention_processor import Attention + +USE_PEFT_BACKEND=False + + +@maybe_allow_in_graph +class BasicTransformerBlock(nn.Module): + r""" + A basic Transformer block. + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm (: + obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (: + obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter. + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + upcast_attention (`bool`, *optional*): + Whether to upcast the attention computation to float32. This is useful for mixed precision training. + norm_elementwise_affine (`bool`, *optional*, defaults to `True`): + Whether to use learnable elementwise affine parameters for normalization. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`. + final_dropout (`bool` *optional*, defaults to False): + Whether to apply a final dropout after the last feed-forward layer. + attention_type (`str`, *optional*, defaults to `"default"`): + The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`. + positional_embeddings (`str`, *optional*, defaults to `None`): + The type of positional embeddings to apply to. + num_positional_embeddings (`int`, *optional*, defaults to `None`): + The maximum number of positional embeddings to apply. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single' + norm_eps: float = 1e-5, + final_dropout: bool = False, + attention_type: str = "default", + positional_embeddings: Optional[str] = None, + num_positional_embeddings: Optional[int] = None, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + + self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + self.use_ada_layer_norm_single = norm_type == "ada_norm_single" + self.use_layer_norm = norm_type == "layer_norm" + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." + ) + + if positional_embeddings and (num_positional_embeddings is None): + raise ValueError( + "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined." + ) + + if positional_embeddings == "sinusoidal": + self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings) + else: + self.pos_embed = None + + # Define 3 blocks. Each block has its own normalization layer. + # 1. Self-Attn + if self.use_ada_layer_norm: + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + elif self.use_ada_layer_norm_zero: + self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + ) + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) # is self-attn if encoder_hidden_states is none + else: + self.norm2 = None + self.attn2 = None + + # 3. Feed-forward + if not self.use_ada_layer_norm_single: + self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + + self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) + + # 4. Fuser + if attention_type == "gated" or attention_type == "gated-text-image": + self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim) + + # 5. Scale-shift for PixArt-Alpha. + if self.use_ada_layer_norm_single: + self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5) + + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = 0 + + def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int): + # Sets chunk feed-forward + self._chunk_size = chunk_size + self._chunk_dim = dim + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + ) -> torch.FloatTensor: + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. Self-Attention + batch_size = hidden_states.shape[0] + + if self.use_ada_layer_norm: + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.use_ada_layer_norm_zero: + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + elif self.use_layer_norm: + norm_hidden_states = self.norm1(hidden_states) + elif self.use_ada_layer_norm_single: + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1) + ).chunk(6, dim=1) + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + norm_hidden_states = norm_hidden_states.squeeze(1) + else: + raise ValueError("Incorrect norm used") + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + # 1. Retrieve lora scale. + lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 + + # 2. Prepare GLIGEN inputs + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + gligen_kwargs = cross_attention_kwargs.pop("gligen", None) + + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + if self.use_ada_layer_norm_zero: + attn_output = gate_msa.unsqueeze(1) * attn_output + elif self.use_ada_layer_norm_single: + attn_output = gate_msa * attn_output + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 2.5 GLIGEN Control + if gligen_kwargs is not None: + hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) + + # 3. Cross-Attention + if self.attn2 is not None: + if self.use_ada_layer_norm: + norm_hidden_states = self.norm2(hidden_states, timestep) + elif self.use_ada_layer_norm_zero or self.use_layer_norm: + norm_hidden_states = self.norm2(hidden_states) + elif self.use_ada_layer_norm_single: + # For PixArt norm2 isn't applied here: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 + norm_hidden_states = hidden_states + else: + raise ValueError("Incorrect norm") + + if self.pos_embed is not None and self.use_ada_layer_norm_single is False: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + hidden_states = attn_output + hidden_states + + # 4. Feed-forward + if not self.use_ada_layer_norm_single: + norm_hidden_states = self.norm3(hidden_states) + + if self.use_ada_layer_norm_zero: + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + if self.use_ada_layer_norm_single: + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0: + raise ValueError( + f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." + ) + + num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size + ff_output = torch.cat( + [ + self.ff(hid_slice, scale=lora_scale) + for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim) + ], + dim=self._chunk_dim, + ) + else: + ff_output = self.ff(norm_hidden_states, scale=lora_scale) + + if self.use_ada_layer_norm_zero: + ff_output = gate_mlp.unsqueeze(1) * ff_output + elif self.use_ada_layer_norm_single: + ff_output = gate_mlp * ff_output + + hidden_states = ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + return hidden_states + + + +class GEGLUFeedForward(nn.Module): + r""" + A feed-forward layer. + + Parameters: + dim (`int`): The number of channels in the input. + dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`. + mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. + """ + + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + activation_fn: str = "geglu", + final_dropout: bool = False, + ): + super().__init__() + inner_dim = int(dim * mult) + dim_out = dim_out if dim_out is not None else dim + linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear + + if activation_fn == "gelu": + act_fn = GELU(dim, inner_dim) + if activation_fn == "gelu-approximate": + act_fn = GELU(dim, inner_dim, approximate="tanh") + elif activation_fn == "geglu": + act_fn = GEGLU(dim, inner_dim) + elif activation_fn == "geglu-approximate": + act_fn = ApproximateGELU(dim, inner_dim) + + self.net = nn.ModuleList([]) + # project in + self.net.append(act_fn) + # project dropout + self.net.append(nn.Dropout(dropout)) + # project out + self.net.append(linear_cls(inner_dim, dim_out)) + # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout + if final_dropout: + self.net.append(nn.Dropout(dropout)) + + def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor: + compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear) + for module in self.net: + if isinstance(module, compatible_cls): + hidden_states = module(hidden_states, scale) + else: + hidden_states = module(hidden_states) + return hidden_states + +class PlainFeedForward(nn.Module): + r""" + A feed-forward layer. + + Parameters: + dim (`int`): The number of channels in the input. + dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`. + mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. + """ + + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + activation_fn: str = "geglu", + final_dropout: bool = False, + ): + super().__init__() + inner_dim = int(dim * mult) + dim_out = dim_out if dim_out is not None else dim + linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear + + + self.net = nn.ModuleList([]) + # project in + self.net.append(linear_cls(dim, inner_dim)) + # project dropout + self.net.append(nn.GELU()) + # project out + self.net.append(linear_cls(inner_dim, dim_out)) + self.net.append(nn.Dropout(dropout)) + + def forward(self, hidden_states: torch.Tensor, scale: float = 1.0) -> torch.Tensor: + compatible_cls = (GEGLU,) if USE_PEFT_BACKEND else (GEGLU, LoRACompatibleLinear) + for module in self.net: + if isinstance(module, compatible_cls): + hidden_states = module(hidden_states, scale) + else: + hidden_states = module(hidden_states) + return hidden_states + + +@maybe_allow_in_graph +class BasicTransformerBlock_(nn.Module): + r""" + A basic Transformer block. + + Parameters: + dim (`int`): The number of channels in the input and output. + num_attention_heads (`int`): The number of heads to use for multi-head attention. + attention_head_dim (`int`): The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + num_embeds_ada_norm (: + obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`. + attention_bias (: + obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter. + only_cross_attention (`bool`, *optional*): + Whether to use only cross-attention layers. In this case two cross attention layers are used. + double_self_attention (`bool`, *optional*): + Whether to use two self-attention layers. In this case no cross attention layers are used. + upcast_attention (`bool`, *optional*): + Whether to upcast the attention computation to float32. This is useful for mixed precision training. + norm_elementwise_affine (`bool`, *optional*, defaults to `True`): + Whether to use learnable elementwise affine parameters for normalization. + norm_type (`str`, *optional*, defaults to `"layer_norm"`): + The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`. + final_dropout (`bool` *optional*, defaults to False): + Whether to apply a final dropout after the last feed-forward layer. + attention_type (`str`, *optional*, defaults to `"default"`): + The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`. + positional_embeddings (`str`, *optional*, defaults to `None`): + The type of positional embeddings to apply to. + num_positional_embeddings (`int`, *optional*, defaults to `None`): + The maximum number of positional embeddings to apply. + """ + + def __init__( + self, + dim: int, + num_attention_heads: int, + attention_head_dim: int, + dropout=0.0, + cross_attention_dim: Optional[int] = None, + activation_fn: str = "geglu", + num_embeds_ada_norm: Optional[int] = None, + attention_bias: bool = False, + only_cross_attention: bool = False, + double_self_attention: bool = False, + upcast_attention: bool = False, + norm_elementwise_affine: bool = True, + norm_type: str = "layer_norm", # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single' + norm_eps: float = 1e-5, + final_dropout: bool = False, + attention_type: str = "default", + positional_embeddings: Optional[str] = None, + num_positional_embeddings: Optional[int] = None, + ): + super().__init__() + self.only_cross_attention = only_cross_attention + + self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" + self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" + self.use_ada_layer_norm_single = norm_type == "ada_norm_single" + self.use_layer_norm = norm_type == "layer_norm" + + if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: + raise ValueError( + f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" + f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." + ) + + if positional_embeddings and (num_positional_embeddings is None): + raise ValueError( + "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined." + ) + + if positional_embeddings == "sinusoidal": + self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings) + else: + self.pos_embed = None + + # Define 3 blocks. Each block has its own normalization layer. + # 1. Self-Attn + if self.use_ada_layer_norm: + self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) + elif self.use_ada_layer_norm_zero: + self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm) + else: + self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + cross_attention_dim=cross_attention_dim if only_cross_attention else None, + upcast_attention=upcast_attention, + ) + + # 2. Cross-Attn + if cross_attention_dim is not None or double_self_attention: + # We currently only use AdaLayerNormZero for self attention where there will only be one attention block. + # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during + # the second cross attention block. + self.norm2 = ( + AdaLayerNorm(dim, num_embeds_ada_norm) + if self.use_ada_layer_norm + else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + ) + self.attn2 = Attention( + query_dim=dim, + cross_attention_dim=cross_attention_dim if not double_self_attention else None, + heads=num_attention_heads, + dim_head=attention_head_dim, + dropout=dropout, + bias=attention_bias, + upcast_attention=upcast_attention, + ) # is self-attn if encoder_hidden_states is none + else: + self.norm2 = None + self.attn2 = None + + # 3. Feed-forward + # if not self.use_ada_layer_norm_single: + # self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) + + self.ff = GEGLUFeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) + + # 4. Fuser + if attention_type == "gated" or attention_type == "gated-text-image": + self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim) + + # 5. Scale-shift for PixArt-Alpha. + if self.use_ada_layer_norm_single: + self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5) + + # let chunk size default to None + self._chunk_size = None + self._chunk_dim = 0 + + def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int): + # Sets chunk feed-forward + self._chunk_size = chunk_size + self._chunk_dim = dim + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + timestep: Optional[torch.LongTensor] = None, + cross_attention_kwargs: Dict[str, Any] = None, + class_labels: Optional[torch.LongTensor] = None, + ) -> torch.FloatTensor: + # Notice that normalization is always applied before the real computation in the following blocks. + # 0. Self-Attention + batch_size = hidden_states.shape[0] + + if self.use_ada_layer_norm: + norm_hidden_states = self.norm1(hidden_states, timestep) + elif self.use_ada_layer_norm_zero: + norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( + hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype + ) + elif self.use_layer_norm: + norm_hidden_states = self.norm1(hidden_states) + elif self.use_ada_layer_norm_single: + shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = ( + self.scale_shift_table[None] + ).chunk(6, dim=1) + norm_hidden_states = self.norm1(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa + norm_hidden_states = norm_hidden_states.squeeze(1) + else: + raise ValueError("Incorrect norm used") + + if self.pos_embed is not None: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + # 1. Retrieve lora scale. + lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 + + # 2. Prepare GLIGEN inputs + cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {} + gligen_kwargs = cross_attention_kwargs.pop("gligen", None) + + # import pdb;pdb.set_trace() + attn_output = self.attn1( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + if self.use_ada_layer_norm_zero: + attn_output = gate_msa.unsqueeze(1) * attn_output + elif self.use_ada_layer_norm_single: + attn_output = gate_msa * attn_output + + hidden_states = attn_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + # 2.5 GLIGEN Control + if gligen_kwargs is not None: + hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"]) + + # 3. Cross-Attention + if self.attn2 is not None: + if self.use_ada_layer_norm: + norm_hidden_states = self.norm2(hidden_states, timestep) + elif self.use_ada_layer_norm_zero or self.use_layer_norm: + norm_hidden_states = self.norm2(hidden_states) + elif self.use_ada_layer_norm_single: + # For PixArt norm2 isn't applied here: + # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103 + norm_hidden_states = hidden_states + else: + raise ValueError("Incorrect norm") + + if self.pos_embed is not None and self.use_ada_layer_norm_single is False: + norm_hidden_states = self.pos_embed(norm_hidden_states) + + attn_output = self.attn2( + norm_hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + **cross_attention_kwargs, + ) + hidden_states = attn_output + hidden_states + + # 4. Feed-forward + if not self.use_ada_layer_norm_single: + norm_hidden_states = self.norm3(hidden_states) + + if self.use_ada_layer_norm_zero: + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + if self.use_ada_layer_norm_single: + # pass + norm_hidden_states = self.norm3(hidden_states) + # import pdb;pdb.set_trace() + + norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp + + if self._chunk_size is not None: + # "feed_forward_chunk_size" can be used to save memory + if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0: + raise ValueError( + f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." + ) + + num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size + ff_output = torch.cat( + [ + self.ff(hid_slice, scale=lora_scale) + for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim) + ], + dim=self._chunk_dim, + ) + else: + ff_output = self.ff(norm_hidden_states, scale=lora_scale) + + if self.use_ada_layer_norm_zero: + ff_output = gate_mlp.unsqueeze(1) * ff_output + elif self.use_ada_layer_norm_single: + ff_output = gate_mlp * ff_output + + hidden_states = ff_output + hidden_states + if hidden_states.ndim == 4: + hidden_states = hidden_states.squeeze(1) + + return hidden_states + +saved_tensors = [] + +def pack_hook(x): + # global saved_tensors + print("Packing", x.numel(), x.grad_fn, x.data_ptr()) + # saved_tensors.append(x) + return x + +def unpack_hook(x): + print("Unpacking", x.numel()) + return x + +def adaln(x): + gate = torch.rand([1,1, x.shape[-1]], dtype=x.dtype, device=x.device) + gate2 = torch.rand([1,1, x.shape[-1]], dtype=x.dtype, device=x.device) + y = x* (1 + gate) + gate2 + + return y + +def profile_memory(dim=4096, dtype=torch.bfloat16, b=1, s=8100): + # model = PlainFeedForward(dim, dropout=0).to(dtype).cuda() + # model = GEGLUFeedForward(dim).to(dtype).cuda() + # model = Attention( + # query_dim=dim, + # heads=32, + # dim_head=dim//32, + # dropout=0, + # bias=False, + # upcast_attention=False, + # ).to(dtype).cuda() + # model = BasicTransformerBlock_(dim, 32, dim//32, cross_attention_dim=dim + # , norm_type='ada_norm_single').to(dtype).cuda() + model = BasicTransformerBlock_(dim, 32, dim//32 + , norm_type='ada_norm_single').to(dtype).cuda() + input = torch.rand([b,s,dim], dtype=dtype, device='cuda') + encoder_states = torch.rand([b,360,dim], dtype=dtype, device='cuda') + print("numel:", input.numel(), "dataptr", input.data_ptr()) + beg=torch.cuda.memory_allocated() + + with torch.autograd.graph.saved_tensors_hooks(pack_hook, unpack_hook): + out = model(input, encoder_hidden_states=encoder_states) + # out = adaln(input) + print("output ptr", out.data_ptr()) + print("mem comsumption:", (torch.cuda.memory_allocated()-beg)/1e9, "num bsd=", (torch.cuda.memory_allocated()-beg)//input.numel(), flush=True) + +profile_memory() \ No newline at end of file diff --git a/torchdistpackage/dist/launch_from_slurm.py b/torchdistpackage/dist/launch_from_slurm.py index 3489b61..3b86052 100644 --- a/torchdistpackage/dist/launch_from_slurm.py +++ b/torchdistpackage/dist/launch_from_slurm.py @@ -24,6 +24,8 @@ def setup_distributed_slurm(backend="nccl", port=None): import torch import torch.distributed as dist + if dist.is_initialized(): + return None num_gpus = torch.cuda.device_count() if "SLURM_JOB_ID" in os.environ: diff --git a/torchdistpackage/parallel/tensor_parallel/attn.py b/torchdistpackage/parallel/tensor_parallel/attn.py index 529cd18..d73d2fb 100644 --- a/torchdistpackage/parallel/tensor_parallel/attn.py +++ b/torchdistpackage/parallel/tensor_parallel/attn.py @@ -36,17 +36,19 @@ def _naive_attn(self, x): q= _split_heads(q, self.num_heads, self.head_dim) k= _split_heads(k, self.num_heads, self.head_dim) v= _split_heads(v, self.num_heads, self.head_dim) + # (b, nh, s, hdim) attn = ((q * self.scale) @ k.transpose(-2, -1)) + # (b, nh, s, s) attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, DIM) x = self.proj(x) - x = self.proj_drop(x) return x def forward(self, x): x = self._naive_attn(x) + x = self.proj_drop(x) return x @@ -80,14 +82,13 @@ def _naive_attn(self, x): q= _split_heads(q, self.head_num_per_partition, self.head_dim) k= _split_heads(k, self.head_num_per_partition, self.head_dim) v= _split_heads(v, self.head_num_per_partition, self.head_dim) - + # (b, nh_per, s, hdim) attn = ((q * self.scale) @ k.transpose(-2, -1)) attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, DIM//self.tp_size) x = self.proj(x) - x = self.proj_drop(x) return x def forward(self, x): @@ -96,4 +97,7 @@ def forward(self, x): x = gather_from_sequence_parallel_region(x) x = self._naive_attn(x) + + x = self.proj_drop(x) + return x \ No newline at end of file diff --git a/torchdistpackage/parallel/tensor_parallel/transformer.py b/torchdistpackage/parallel/tensor_parallel/transformer.py index 5a6cd57..b60f7b8 100644 --- a/torchdistpackage/parallel/tensor_parallel/transformer.py +++ b/torchdistpackage/parallel/tensor_parallel/transformer.py @@ -46,8 +46,8 @@ def __init__(self, dim, mlp_ratio=4, num_heads=8, sequence_parallel=False): self.sequence_parallel = sequence_parallel def forward(self, hidden_states): - if self.sequence_parallel: - hidden_states = maybe_split_into_sequence_parallel(hidden_states) + # if self.sequence_parallel: + # hidden_states = maybe_split_into_sequence_parallel(hidden_states) residual = hidden_states hidden_states = self.ln_1(hidden_states) @@ -67,8 +67,8 @@ def forward(self, hidden_states): # residual connection hidden_states = residual + feed_forward_hidden_states - if self.sequence_parallel: - set_sequence_parallel_attr(hidden_states) + # if self.sequence_parallel: + # set_sequence_parallel_attr(hidden_states) return hidden_states @torch.no_grad() @@ -93,6 +93,8 @@ def __init__(self, dim, mlp_ratio=4, num_heads=8, depth=12, tensor_parallel=True self.sequence_parallel = sequence_parallel def forward(self, x): + if self.sequence_parallel: + x = maybe_split_into_sequence_parallel(x) for blk in self.blocks: x = blk(x) if self.sequence_parallel: From 74d1808bd89cfeb886116590b52b93483ea665b5 Mon Sep 17 00:00:00 2001 From: kimmishi Date: Mon, 15 Apr 2024 09:58:16 +0800 Subject: [PATCH 5/6] remove diffusion example --- examples/diffusion/train_unet.py | 130 ------------------------------- 1 file changed, 130 deletions(-) delete mode 100644 examples/diffusion/train_unet.py diff --git a/examples/diffusion/train_unet.py b/examples/diffusion/train_unet.py deleted file mode 100644 index 277b0b9..0000000 --- a/examples/diffusion/train_unet.py +++ /dev/null @@ -1,130 +0,0 @@ -import diffusers -from diffusers import ( - AutoencoderKL, - DDPMScheduler, - StableDiffusionXLPipeline, - UNet2DConditionModel, -) - -import torch -import torch.nn as nn -import torch.nn.functional as F - -import time -import functools - -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP -from torch.distributed.fsdp import ShardingStrategy - -from torchdistpackage import setup_distributed_slurm -import sdto - -def enable_tf32(): - torch.backends.cuda.matmul.allow_tf32 = True - torch.backends.cudnn.allow_tf32 = True - - -def train(model, vae, optimizer_class, batchsize, use_zero=False, use_amp=True, h=512, w=512, is_xl=False, - input_grad=False): - dt=torch.float32 - - if not is_xl: - timesteps = torch.arange(batchsize, dtype=torch.int64).cuda()+100 - prompt_embeds = torch.rand([batchsize,77,768], dtype=dt).cuda() - time_ids = torch.rand([batchsize,6], dtype=dt).cuda() - text_embeds = torch.rand([batchsize,1280], dtype=dt).cuda() - encoder_hidden_states = torch.rand([batchsize,77,768], dtype=dt).cuda() - - else: - timesteps = torch.arange(batchsize, dtype=torch.int64).cuda()+100 - prompt_embeds = torch.rand([batchsize,77,2048], dtype=dt).cuda() - time_ids = torch.rand([batchsize,6], dtype=dt).cuda() - text_embeds = torch.rand([batchsize,1280], dtype=dt).cuda() - - - # model.to(dt) - model.cuda() - - model_input = torch.rand([batchsize, 3, h, w], dtype=torch.float32).cuda() - - # noisy_model_input = torch.rand([batchsize, 4, h//8, w//8], dtype=dt, device='cuda').requires_grad_(input_grad) - unet_added_conditions = { - "time_ids": time_ids, - "text_embeds": text_embeds - } - - - # model.enable_gradient_checkpointing() - # model.enable_xformers_memory_efficient_attention() - model, vae, opt, _ = sdto.compile(model, vae, compile_vae=True) - - perf_times = [] - if use_zero: - model = FSDP(model, sharding_strategy=torch.distributed.fsdp.ShardingStrategy.SHARD_GRAD_OP) - opt =optimizer_class(model.parameters()) - else: - opt =optimizer_class(model.parameters()) - - from torch.profiler import profile, record_function, ProfilerActivity - prof = torch.profiler.profile( - schedule=torch.profiler.schedule(wait=0, warmup=5, active=1, repeat=1), - on_trace_ready=torch.profiler.tensorboard_trace_handler('./bnb_int8'), - activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], - record_shapes=True, - with_stack=True, - profile_memory=True) - # prof.start() - for ind in range(10): - torch.cuda.synchronize() - beg = time.time() - - # noisy_model_input = vae.encode(model_input).latent_dist.sample().mul_(0.18215) - noisy_model_input = sdto.sliced_vae(vae, model_input, use_autocast=True) - - with torch.autocast(dtype=torch.float16, device_type='cuda', enabled=use_amp): - if is_xl: - model_pred = model( - noisy_model_input, timesteps, prompt_embeds, added_cond_kwargs=unet_added_conditions - ).sample - # print("after fwd", torch.cuda.memory_allocated()/1e9, torch.cuda.max_memory_allocated()/1e9) - # import pdb;pdb.set_trace() - loss = F.mse_loss(model_pred.float(), torch.rand_like(model_pred).float(), reduction="mean") - else: - model_pred = model(noisy_model_input, timesteps, encoder_hidden_states).sample - # print("after fwd", torch.cuda.memory_allocated()/1e9, torch.cuda.max_memory_allocated()/1e9) - # import pdb;pdb.set_trace() - loss = F.mse_loss(model_pred.float(), torch.rand_like(model_pred).float(), reduction="mean") - - loss.backward() - # print("after bwd", torch.cuda.max_memory_allocated()/1e9) - - opt.step() - opt.zero_grad() - torch.cuda.synchronize() - # prof.step() - if ind>5: - perf_times.append(time.time()-beg) - beg=time.time() - print("max mem", torch.cuda.max_memory_allocated()/1e9) - print(perf_times) - -enable_tf32() -rank, world_size, port, addr=setup_distributed_slurm() - -pretrained_model_name_or_path="stabilityai/stable-diffusion-xl-base-1.0" - -# pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" - -unet = UNet2DConditionModel.from_pretrained( - pretrained_model_name_or_path, subfolder="unet" -).cuda() -unet.train() -vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae").cuda() - -# print(unet) -# optimizer_class = FusedAdam -optimizer_class = functools.partial(torch.optim.Adam,fused = True) - -train(unet, vae, optimizer_class, 96, - use_amp=False, use_zero=False, h=576, w=1024, is_xl ='xl' in pretrained_model_name_or_path, - input_grad=True) \ No newline at end of file From 7333dbaec7b18611a6b25a60876592d903061e54 Mon Sep 17 00:00:00 2001 From: KimmiShi Date: Tue, 23 Apr 2024 11:47:07 +0800 Subject: [PATCH 6/6] Update debug_nan.py, add grad norm hook --- torchdistpackage/tools/debug_nan.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/torchdistpackage/tools/debug_nan.py b/torchdistpackage/tools/debug_nan.py index 1c17621..ce42845 100644 --- a/torchdistpackage/tools/debug_nan.py +++ b/torchdistpackage/tools/debug_nan.py @@ -94,3 +94,29 @@ def register_debug_nan_hooks(model): # bwd_hooks[name] = module.register_full_backward_hook(bwd_hook_wrapper(name)) return fwd_hooks, bwd_hooks +# debug grad norm + +def get_grad_norm(grads, norm_type=2.0): + norms = [torch.norm(g.detach(), norm_type) for g in grads if g is not None] + if len(norms)==0: + return 0 + total_norm = torch.norm(torch.stack(norms), norm_type) + return total_norm + +def bwd_hook_wrapper(module_name='', max_grad_norm=0.1): + def check_value(module, grad_input, grad_output): + out_grad_norm = get_grad_norm(grad_output) + in_grad_norm = get_grad_norm(grad_input) + if out_grad_norm > max_grad_norm: + if dist.get_rank()==0: + print(module_name, "grad_output norm:", out_grad_norm, flush=True) + if in_grad_norm > max_grad_norm: + if dist.get_rank()==0: + print(module_name, "grad_input norm:", in_grad_norm, flush=True) + return check_value + +def register_debug_hooks(model): + bwd_hooks = {} + for name, module in model.named_modules(): + bwd_hooks[name] = module.register_full_backward_hook(bwd_hook_wrapper(name)) + return bwd_hooks