From 716fa4f14688b7dc0c0040d145ab21d7dee2b488 Mon Sep 17 00:00:00 2001 From: apolinario Date: Sat, 4 Jun 2022 13:57:50 +0200 Subject: [PATCH 01/13] Bump to 1.3.1 New perceptor Better defaulls --- latent.ipynb | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/latent.ipynb b/latent.ipynb index 8e06f7a..5bb7339 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -16,7 +16,7 @@ "id": "NUmmV5ZvrPbP" }, "source": [ - "# Latent Majesty Diffusion v1.3\n", + "# Latent Majesty Diffusion v1.3.1\n", "#### Formerly known as Princess Generator\n", "##### Access our [Majestic Guide](https://multimodal.art/majesty-diffusion) (_under construction_), our [GitHub](https://github.com/multimodalart/majesty-diffusion), join our community on [Discord](https://discord.gg/yNBtQBEDfZ) or reach out via [@multimodalart on Twitter](https://twitter.com/multimodalart))\n", "\\\n", @@ -28,7 +28,8 @@ "#### CLIP Guided Latent Diffusion by [dango233](https://github.com/Dango233/) and [apolinario (@multimodalart)](https://twitter.com/multimodalart). \n", "The LAION-400M-trained model and the modified inference code are from [CompVis Latent Diffusion](https://github.com/CompVis/latent-diffusion). The guided-diffusion method is modified by Dango233 based on [Katherine Crowson](https://twitter.com/RiversHaveWings)'s guided diffusion notebook. multimodalart savable settings, MMC and assembled the Colab. Check the complete list on our GitHub. 
Some functions and methods are from various code masters (nsheppard, DanielRussRuss and others)\n", "\n", - "Changelog: 1.3 - better upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion))" + "Changelog: 1.3 - better upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion))\n", + "Changelog: 1.3.1 - better defaults, added ViT-L/14 LAION-400M trained" ] }, { @@ -1018,6 +1019,7 @@ "ViT_B32 = False #@param {type:\"boolean\"}\n", "ViT_B16 = True #@param {type:\"boolean\"}\n", "ViT_L14 = False #@param {type:\"boolean\"}\n", + "ViT_L14_laion = False #@param {type:\"boolean\"}\n", "ViT_L14_336px = False #@param {type:\"boolean\"}\n", "#RN101 = False #@param {type:\"boolean\"}\n", "#RN50 = False #@param {type:\"boolean\"}\n", @@ -1053,6 +1055,8 @@ " clip_load_list.append(\"[clip - mlfoundations - RN50x64--openai]\")\n", "if RN50x16: \n", " clip_load_list.append(\"[clip - mlfoundations - RN50x16--openai]\")\n", + "if ViT_L14_laion: \n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-L-14--laion400m_e32]\")\n", "if ViT_L14_336px:\n", " clip_load_list.append(\"[clip - mlfoundations - ViT-L-14-336--openai]\")\n", "if ViT_B16_plus:\n", @@ -1178,7 +1182,7 @@ "cutn_batches = 1\n", "range_index = [0]*300 + [0]*1000 \n", "active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", - "tv_scales = [1000]*1+[600]*3\n", + "tv_scales = [800]*1+[200]*3\n", "latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", "\n", "#If you uncomment next line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale basic setting will be used\n", @@ -1191,12 +1195,12 @@ "opt_mag_mul = 10 #Magnify grad before clamping\n", "#PLMS Currently not working, working on a fix\n", "#opt.plms = False #Won;=t work with clip guidance\n", - "opt_ddim_eta, opt_eta_end = [1.4,1] # linear variation of eta\n", - "opt_temperature = .975 \n", + "opt_ddim_eta, opt_eta_end = [1.5,1] # linear variation of eta\n", + "opt_temperature = .95\n", "\n", "#Grad advanced settings\n", "grad_center = False\n", - "grad_scale= 0.5 #5 Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", + "grad_scale= 0.1 #5 Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", "anti_jpg = 0 #not working\n", "\n", "#Init image advanced settings\n", @@ -1204,7 +1208,7 @@ "init_magnitude = 0.15\n", "\n", "#More settings\n", - "RGB_min, RGB_max = [-0.95,0.95]\n", + "RGB_min, RGB_max = [-1,1]\n", "padargs = {\"mode\":\"constant\", \"value\": -1} #How to pad the image with cut_overview\n", "flip_aug=False\n", "cc = 60\n", @@ -1281,8 +1285,8 @@ "#@markdown We're still figuring out default settings. 
Experiment and share your settings with us\n", "width = 256#@param{type: 'integer'}\n", "height = 256#@param{type: 'integer'}\n", - "latent_diffusion_guidance_scale = 2 #@param {type:\"number\"}\n", - "clip_guidance_scale = 5000 #@param{type: 'integer'}\n", + "latent_diffusion_guidance_scale = 2.5 #@param {type:\"number\"}\n", + "clip_guidance_scale = 7500 #@param{type: 'integer'}\n", "how_many_batches = 1 #@param{type: 'integer'}\n", "aesthetic_loss_scale = 200 #@param{type: 'integer'}\n", "augment_cuts=True #@param{type:'boolean'}\n", @@ -1302,7 +1306,7 @@ "init_scale = 1000 #@param{type: 'integer'}\n", "init_brightness = 0.0 #@param{type: 'number'}\n", "#@markdown How much extra noise to add to the init image, independently from skipping timesteps (use it also if you are upscaling)\n", - "init_noise = 0.6 #@param{type: 'number'}\n", + "init_noise = 0.57 #@param{type: 'number'}\n", "\n", "#@markdown\n", "\n", From 3a8981ff91fec260c17b747636fdefd00b788ec2 Mon Sep 17 00:00:00 2001 From: apolinario Date: Sat, 4 Jun 2022 16:49:08 +0200 Subject: [PATCH 02/13] 1.4 bump --- latent.ipynb | 122 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 104 insertions(+), 18 deletions(-) diff --git a/latent.ipynb b/latent.ipynb index 5bb7339..b5bbdb7 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -371,6 +371,7 @@ "outputs": [], "source": [ "#@title Load necessary functions\n", + "from ldm.modules.diffusionmodules.util import noise_like\n", "def set_custom_schedules(schedule):\n", " custom_schedules = []\n", " for schedule_item in schedule:\n", @@ -440,6 +441,7 @@ " pad_input = pad_input.repeat(input.shape[0],1,1,1)\n", " cutout = resize(pad_input, out_shape=output_shape_all)\n", " if aug: cutout=self.augs(cutout)\n", + " if cut_blur_n > 0: cutout[0:cut_blur_n,:,:,:] = TF.gaussian_blur(cutout[0:cut_blur_n,:,:,:],cut_blur_kernel)\n", " cutouts_list.append(cutout)\n", " \n", " if self.InnerCrop >0:\n", @@ -520,11 +522,24 @@ " return x\n", "\n", "def cond_fn(x, 
t):\n", + " global cur_step\n", + " cur_step += 1\n", " t=1000-t\n", " t=t[0]\n", " with torch.enable_grad():\n", " global clamp_start_, clamp_max\n", " x = x.detach()\n", + " if dynamic_decode and t<=350 :\n", + " x_recon = x\n", + " s = torch.quantile(\n", + " rearrange(x_recon, 'b ... -> b (...)').abs(),\n", + " threshold_percentile,\n", + " dim = -1\n", + " )\n", + "\n", + " s.clamp_(min = 1.)\n", + " s = s.view(-1, *((1,) * (x_recon.ndim - 1)))\n", + " x = x_recon.clamp(-s, s) / s\n", " x = x.requires_grad_()\n", " x_in = model.decode_first_stage(x)\n", " display_handler(x_in,t,1,False)\n", @@ -580,6 +595,7 @@ " tv_loss(F.interpolate(x, scale_factor = 1/8)).sum()* tv_scales[3] \n", " range_scale= range_index[t]\n", " range_losses = range_loss(x_in,RGB_min,RGB_max).sum() * range_scale\n", + " var_scale = var_index[t]\n", " loss = tv_losses + range_losses + losses\n", " #del losses\n", " if symmetric_loss_scale != 0: loss += symmetric_loss(x_in) * symmetric_loss_scale\n", @@ -615,6 +631,14 @@ " #print(magnitude, end = \"\\r\")\n", " grad = grad* magnitude.clamp(max= clamp_max) /magnitude#0.2\n", " grad = grad.detach()\n", + " grad = grad_fn(grad,t)\n", + " x = x.detach()\n", + " x = x.requires_grad_()\n", + " var = x.var()\n", + " var_losses = (var.pow(2).clamp(min = 1)- 1) * var_scale\n", + " var_losses.backward()\n", + " grad -= x.grad\n", + " print(grad.abs().mean(), x.grad.abs().mean(), end = \"\\r\")\n", " return grad\n", "\n", "def null_fn(x_in):\n", @@ -637,14 +661,27 @@ " im.save(p.stdin, 'PNG')\n", "\n", "\n", - " \n", + "def grad_fn(x,t):\n", + " if t <= 500 and grad_blur: x = TF.gaussian_blur(x, 2*round(int(max(grad_blur-t/150, 1)))-1, 1.5)\n", + " return x\n", "def cond_clamp(image,t): \n", - " #if t >=0:\n", - " #mag=image.square().mean().sqrt()\n", - " #mag = (mag*cc).clamp(1.6,100)\n", - " image = image.clamp(-cc, cc)\n", - " image = torch.nan_to_num(image, nan=0.0, posinf=cc, neginf=-cc)\n", - " return(image)\n", + " t = 1000-t[0]\n", + " if t<= 
max(punish_steps, compress_steps):\n", + " s = torch.quantile(\n", + " rearrange(image, 'b ... -> b (...)').abs(),\n", + " threshold_percentile,\n", + " dim = -1\n", + " )\n", + " s = s.view(-1, *((1,) * (image.ndim - 1)))\n", + " ths = s.clamp(min = threshold)\n", + " im_max = image.clamp(min = ths) - image.clamp(min = ths, max = ths)\n", + " im_min = image.clamp(max = -ths, min = -ths) - image.clamp(max = -ths)\n", + " if t<=punish_steps:\n", + " image = image.clamp(min = -ths, max = ths)+(im_max-im_min) * punish_factor #((im_max-im_min)*punish_factor).tanh()/punish_factor \n", + " if t<= compress_steps:\n", + " image = image / (ths/threshold)**compress_factor\n", + " image += noise_like(image.shape,device,False) * ((ths/threshold)**compress_factor - 1)\n", + " return(image) \n", "\n", "def make_schedule(t_start, t_end, step_size=1):\n", " schedule = []\n", @@ -790,7 +827,8 @@ "from ldm.modules.diffusionmodules.util import noise_like\n", "def do_run():\n", " # with torch.cuda.amp.autocast():\n", - " global progress,target_embeds, weights, zero_embed, init, scale_factor\n", + " global progress,target_embeds, weights, zero_embed, init, scale_factor, cur_step\n", + " cur_step = 0\n", " scale_factor = 1\n", " make_cutouts = {}\n", " for i in clip_list:\n", @@ -879,6 +917,8 @@ "\n", " all_samples=list()\n", " last_step_upscale = False\n", + " eta1 = opt.ddim_eta\n", + " eta2 = opt.eta_end\n", " with torch.enable_grad():\n", " with torch.cuda.amp.autocast():\n", " with model.ema_scope():\n", @@ -909,15 +949,17 @@ " verbose=False,\n", " unconditional_guidance_scale=opt.scale,\n", " unconditional_conditioning=uc,\n", - " eta=opt.ddim_eta,\n", - " eta_end = opt.eta_end,\n", + " eta=eta1,\n", + " eta_end = eta2,\n", " img_callback=None if use_cond_fn else display_handler,\n", - " cond_fn=cond_fn, #if use_cond_fn else None,\n", + " cond_fn=cond_fn if use_cond_fn else None,\n", " temperature = opt.temperature,\n", " x_adjust_fn=cond_clamp,\n", " x_T = x_T,\n", " x0=x_T,\n", 
- " mask=mask\n", + " mask=mask,\n", + " score_corrector = score_corrector,\n", + " corrector_kwargs = {}\n", " )\n", " x_T = samples_ddim.clamp(-6,6)\n", " else:\n", @@ -926,9 +968,18 @@ " method, scale_factor = custom_schedule.split(\":\")\n", " scale_factor = float(scale_factor)\n", " #clamp_index = np.array(clamp_index) * scale_factor\n", - " if method == \"latent\":\n", - " x_T = resize(samples_ddim, scale_factors=scale_factor, antialiasing=True)*scale_div\n", + " if method == \"RGB\":\n", + " temp_file_name = \"temp_\"+f\"{str(round(time.time()))}.png\"\n", + " temp_file = os.path.join(sample_path, temp_file_name)\n", + " im.save(temp_file, format = \"PNG\")\n", + " init = Image.open(fetch(temp_file)).convert('RGB')\n", + " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", + " opt.H, opt.W = opt.H*scale_factor, opt.W*scale_factor\n", + " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W], antialiasing=True)\n", + " init = init.mul(2).sub(1).half()\n", + " x_T = (model.first_stage_model.encode(init).sample()*init_magnitude)\n", " x_T += noise_like(x_T.shape,device,False)*init_noise\n", + " x_T = x_T.clamp(-6,6)\n", " if method == \"gfpgan\":\n", " last_step_upscale = True\n", " temp_file_name = \"temp_\"+f\"{str(round(time.time()))}.png\"\n", @@ -953,7 +1004,16 @@ " x_T = (model.first_stage_model.encode(init).sample()*init_magnitude)\n", " x_T += noise_like(x_T.shape,device,False)*init_noise\n", " x_T = x_T.clamp(-6,6)\n", - "\n", + " if method == \"purge\":\n", + " for i in scale_factor.split(\",\"):\n", + " if i in clip_list:\n", + " arch, pub, m_id = i[1:-1].split(' - ')\n", + " print(\"Purge \",i)\n", + " del clip_list[i]\n", + " del clip_model[m_id]\n", + " del clip_size[m_id]\n", + " del clip_tokenize[m_id]\n", + " del clip_normalize[m_id]\n", " #last_step_uspcale_list.append(last_step_upscale)\n", " scale_factor = 1\n", " current_time = str(round(time.time()))\n", @@ -1019,7 +1079,6 @@ "ViT_B32 = False #@param {type:\"boolean\"}\n", 
"ViT_B16 = True #@param {type:\"boolean\"}\n", "ViT_L14 = False #@param {type:\"boolean\"}\n", - "ViT_L14_laion = False #@param {type:\"boolean\"}\n", "ViT_L14_336px = False #@param {type:\"boolean\"}\n", "#RN101 = False #@param {type:\"boolean\"}\n", "#RN50 = False #@param {type:\"boolean\"}\n", @@ -1030,6 +1089,7 @@ "#@markdown #### OpenCLIP models\n", "ViT_B16_plus = False #@param {type: \"boolean\"}\n", "ViT_B32_laion2b = True #@param {type: \"boolean\"}\n", + "ViT_L14_laion = False #@param {type:\"boolean\"}\n", "\n", "#@markdown #### Multilangual CLIP models \n", "clip_farsi = False #@param {type: \"boolean\"}\n", @@ -1211,16 +1271,30 @@ "RGB_min, RGB_max = [-1,1]\n", "padargs = {\"mode\":\"constant\", \"value\": -1} #How to pad the image with cut_overview\n", "flip_aug=False\n", - "cc = 60\n", "cutout_debug = False\n", "opt.outdir = outputs_path\n", + "prior_weight = 0\n", + "score_modifier = True\n", + "threshold_percentile = .90\n", + "threshold = 1.2\n", + "\n", + "var_index = [0]*1000\n", "\n", "#Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", "experimental_aesthetic_embeddings = False\n", "#How much you want this to influence your result\n", "experimental_aesthetic_embeddings_weight = 0.5\n", "#9 are good aesthetic embeddings, 0 are bad ones\n", - "experimental_aesthetic_embeddings_score = 9" + "experimental_aesthetic_embeddings_score = 9\n", + "\n", + "# For fun dont change execpt if you really know what your are doing\n", + "cut_blur_n = 0\n", + "cut_blur_kernel = 3\n", + "grad_blur = 0\n", + "compress_steps = 0\n", + "compress_factor = 0.025\n", + "punish_steps = 0\n", + "punish_factor = 0.9" ] }, { @@ -1381,6 +1455,18 @@ "opt.scale = latent_diffusion_guidance_scale\n", "aug = augment_cuts\n", "\n", + "score_corrector = DotMap()\n", + "\n", + "def modify_score(e_t, e_t_uncond):\n", + " # print(e_t.abs().mean(),e_t.var())\n", + " global dynamic_decode\n", + " dynamic_decode = False\n", + " if e_t.var() >= 1 and 
score_modifier:\n", + " dynamic_decode = True\n", + " e_t = (e_t - e_t_uncond)/e_t.var() + e_t_uncond\n", + " return(e_t)\n", + "score_corrector.modify_score = modify_score\n", + "\n", "torch.cuda.empty_cache()\n", "gc.collect()\n", "generate_video = False\n", From af2df81a0a28de222f3e37814279237ad831d39b Mon Sep 17 00:00:00 2001 From: apolinario Date: Sat, 4 Jun 2022 17:03:13 +0200 Subject: [PATCH 03/13] new branch --- latent.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/latent.ipynb b/latent.ipynb index b5bbdb7..246935b 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -121,7 +121,7 @@ " downgrade_pytorch_result = subprocess.run(['pip', 'install', 'torch==1.10.2', 'torchvision==0.11.3', '-q'], stdout=subprocess.PIPE).stdout.decode('utf-8')\n", " import sys\n", " sys.path.append(\".\")\n", - " !git clone https://github.com/multimodalart/latent-diffusion\n", + " !git clone https://github.com/multimodalart/latent-diffusion --branch 1.4\n", " !git clone https://github.com/CompVis/taming-transformers\n", " !git clone https://github.com/TencentARC/GFPGAN\n", " !git clone https://github.com/multimodalart/majesty-diffusion\n", From 9edfb8f49a5c10111fc6be194883e4dcbe6897e3 Mon Sep 17 00:00:00 2001 From: apolinario Date: Sat, 4 Jun 2022 21:47:07 +0200 Subject: [PATCH 04/13] 1.4 updates --- latent.ipynb | 136 ++++++++++++++++++++++++++++----------------------- 1 file changed, 76 insertions(+), 60 deletions(-) diff --git a/latent.ipynb b/latent.ipynb index 246935b..32750a6 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -16,7 +16,7 @@ "id": "NUmmV5ZvrPbP" }, "source": [ - "# Latent Majesty Diffusion v1.3.1\n", + "# Latent Majesty Diffusion v1.4\n", "#### Formerly known as Princess Generator\n", "##### Access our [Majestic Guide](https://multimodal.art/majesty-diffusion) (_under construction_), our [GitHub](https://github.com/multimodalart/majesty-diffusion), join our community on [Discord](https://discord.gg/yNBtQBEDfZ) or reach out via 
[@multimodalart on Twitter](https://twitter.com/multimodalart))\n", "\\\n", @@ -29,7 +29,7 @@ "The LAION-400M-trained model and the modified inference code are from [CompVis Latent Diffusion](https://github.com/CompVis/latent-diffusion). The guided-diffusion method is modified by Dango233 based on [Katherine Crowson](https://twitter.com/RiversHaveWings)'s guided diffusion notebook. multimodalart savable settings, MMC and assembled the Colab. Check the complete list on our GitHub. Some functions and methods are from various code masters (nsheppard, DanielRussRuss and others)\n", "\n", "Changelog: 1.3 - better upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion))\n", - "Changelog: 1.3.1 - better defaults, added ViT-L/14 LAION-400M trained" + "Changelog: 1.4 - better defaults, added ViT-L/14 LAION-400M trained, fix CLOOB, adds modified dynamic thresholding, removes latent upscaler (was broken), adds RGB upscaler\n" ] }, { @@ -208,7 +208,7 @@ "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_16_linear.pth\"):\n", " print(\"Using ViT-B/16 sa aesthetic model already saved\")\n", "else:\n", - " !wget -O $model_path/sa_0_4_vit_b_32_linear.pth https://multimodal.art/models/sa_0_4_vit_b_16_linear.pth\n", + " !wget -O $model_path/sa_0_4_vit_b_16_linear.pth https://multimodal.art/models/sa_0_4_vit_b_16_linear.pth\n", "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"):\n", " print(\"Using ViT-B/32 aesthetic model from Google Drive\")\n", "else:\n", @@ -274,6 +274,7 @@ "from ldm.util import instantiate_from_config\n", "from ldm.models.diffusion.ddim import DDIMSampler\n", "from ldm.models.diffusion.plms import PLMSSampler\n", + "from ldm.modules.diffusionmodules.util import noise_like\n", "import tensorflow as tf\n", "from dotmap import DotMap\n", "import ipywidgets as widgets\n", @@ -371,7 +372,6 @@ "outputs": [], "source": [ "#@title Load necessary functions\n", - "from ldm.modules.diffusionmodules.util import 
noise_like\n", "def set_custom_schedules(schedule):\n", " custom_schedules = []\n", " for schedule_item in schedule:\n", @@ -529,17 +529,17 @@ " with torch.enable_grad():\n", " global clamp_start_, clamp_max\n", " x = x.detach()\n", - " if dynamic_decode and t<=350 :\n", - " x_recon = x\n", - " s = torch.quantile(\n", - " rearrange(x_recon, 'b ... -> b (...)').abs(),\n", - " threshold_percentile,\n", - " dim = -1\n", - " )\n", - "\n", - " s.clamp_(min = 1.)\n", - " s = s.view(-1, *((1,) * (x_recon.ndim - 1)))\n", - " x = x_recon.clamp(-s, s) / s\n", + " #if dynamic_decode and t<=350 :\n", + " # x_recon = x\n", + " # s = torch.quantile(\n", + " # rearrange(x_recon, 'b ... -> b (...)').abs(),\n", + " # threshold_percentile,\n", + " # dim = -1\n", + " # )\n", + "\n", + " # s.clamp_(min = 1.)\n", + " # s = s.view(-1, *((1,) * (x_recon.ndim - 1)))\n", + " # x = x_recon.clamp(-s, s) / s\n", " x = x.requires_grad_()\n", " x_in = model.decode_first_stage(x)\n", " display_handler(x_in,t,1,False)\n", @@ -627,7 +627,7 @@ " magnitude = grad.square().mean().sqrt()\n", " else:\n", " return(grad)\n", - " clamp_max = clamp_index[t]\n", + " clamp_max = clamp_index_variation[t]\n", " #print(magnitude, end = \"\\r\")\n", " grad = grad* magnitude.clamp(max= clamp_max) /magnitude#0.2\n", " grad = grad.detach()\n", @@ -759,7 +759,7 @@ " custom_schedule_setting = {custom_schedule_setting}\n", "\n", " #Cut settings\n", - " clamp_index = {list_mul_to_array(clamp_index)}\n", + " clamp_index = {clamp_index}\n", " cut_overview = {list_mul_to_array(cut_overview)}\n", " cut_innercut = {list_mul_to_array(cut_innercut)}\n", " cut_ic_pow = {cut_ic_pow}\n", @@ -789,7 +789,11 @@ " grad_center = {grad_center}\n", " #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", " grad_scale={grad_scale} \n", - "\n", + " score_modifier = {score_modifier}\n", + " threshold_percentile = {threshold_percentile}\n", + " threshold = {threshold}\n", + " 
var_index = {list_mul_to_array(var_index)}\n", + " \n", " #Init image advanced settings\n", " init_rotate={init_rotate}\n", " mask_rotate={mask_rotate}\n", @@ -801,13 +805,22 @@ " #How to pad the image with cut_overview\n", " padargs = {padargs} \n", " flip_aug={flip_aug}\n", - " cc = {cc}\n", + " \n", " #Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", " experimental_aesthetic_embeddings = {experimental_aesthetic_embeddings}\n", " #How much you want this to influence your result\n", " experimental_aesthetic_embeddings_weight = {experimental_aesthetic_embeddings_weight}\n", " #9 are good aesthetic embeddings, 0 are bad ones\n", " experimental_aesthetic_embeddings_score = {experimental_aesthetic_embeddings_score}\n", + "\n", + " # For fun dont change except if you really know what your are doing\n", + " cut_blur_n = {cut_blur_n}\n", + " cut_blur_kernel = {cut_blur_kernel}\n", + " grad_blur = {grad_blur}\n", + " compress_steps = {compress_steps}\n", + " compress_factor = {compress_factor}\n", + " punish_steps = {punish_steps}\n", + " punish_factor = {punish_factor}\n", " '''\n", " return(settings)\n", "\n", @@ -824,7 +837,7 @@ "aesthetic_model_32 = torch.nn.Linear(512,1).cuda()\n", "aesthetic_model_32.load_state_dict(torch.load(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"))\n", "\n", - "from ldm.modules.diffusionmodules.util import noise_like\n", + "\n", "def do_run():\n", " # with torch.cuda.amp.autocast():\n", " global progress,target_embeds, weights, zero_embed, init, scale_factor, cur_step\n", @@ -868,9 +881,9 @@ " embed = clip_model[i].encode_image(clip_normalize[i](batch))\n", " target_embeds[i].append(embed)\n", " weights[i].extend([weight])\n", - " if anti_jpg != 0:\n", - " target_embeds[\"ViT-B-32--openai\"].append(torch.tensor([np.load(f\"{model_path}/openimages_512x_png_embed224.npz\")['arr_0']-np.load(f\"{model_path}/imagenet_512x_jpg_embed224.npz\")['arr_0']], device = device))\n", - " 
weights[\"ViT-B-32--openai\"].append(anti_jpg)\n", + " #if anti_jpg != 0:\n", + " # target_embeds[\"ViT-B-32--openai\"].append(torch.tensor([np.load(f\"{model_path}/openimages_512x_png_embed224.npz\")['arr_0']-np.load(f\"{model_path}/imagenet_512x_jpg_embed224.npz\")['arr_0']], device = device))\n", + " # weights[\"ViT-B-32--openai\"].append(anti_jpg)\n", "\n", " for i in clip_list:\n", " target_embeds[i] = torch.cat(target_embeds[i])\n", @@ -1004,6 +1017,8 @@ " x_T = (model.first_stage_model.encode(init).sample()*init_magnitude)\n", " x_T += noise_like(x_T.shape,device,False)*init_noise\n", " x_T = x_T.clamp(-6,6)\n", + " \n", + " \n", " if method == \"purge\":\n", " for i in scale_factor.split(\",\"):\n", " if i in clip_list:\n", @@ -1099,8 +1114,8 @@ "cloob_ViT_B16 = False #@param {type: \"boolean\"}\n", "\n", "# @markdown Load even more CLIP and CLIP-like models (from [Multi-Modal-Comparators](https://github.com/dmarx/Multi-Modal-Comparators))\n", - "model1 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - 
facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", - "model2 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - 
simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "model1 = \"\" # @param [\"[clip - mlfoundations - RN50--openai]\",\"[clip - mlfoundations - RN101--openai]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - 
facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "model2 = \"\" # @param [\"[clip - mlfoundations - RN50--openai]\",\"[clip - mlfoundations - RN101--openai]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - 
simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", "model3 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - 
slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", "\n", "if ViT_B32: \n", @@ -1178,10 +1193,7 @@ " clip_model[item[\"id\"]] = MockOpenaiClip(clip_model_loaded)\n", " clip_size[item[\"id\"]] = clip_model[item[\"id\"]].visual.input_resolution\n", " clip_tokenize[item[\"id\"]] = clip_model[item[\"id\"]].preprocess_text()\n", - " if(item[\"architecture\"] == 'cloob'):\n", - " clip_normalize[item[\"id\"]] = clip_model[item[\"id\"]].normalize\n", - " else:\n", - " clip_normalize[item[\"id\"]] = normalize\n", + " clip_normalize[item[\"id\"]] = normalize\n", " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", "\n", "\n", @@ -1227,22 +1239,21 @@ "\n", "#Custom cut schedules and super-resolution. Check out the guide on how to use it a https://multimodal.art/majestydiffusion\n", "custom_schedule_setting = [\n", - " [200,1000,8],\n", - " [50,200,5],\n", - " #\"gfpgan:1.5\",\n", - " #[50,200,5],\n", + " [50,1000,8],\n", + " \"gfpgan:1.5\",\n", + " [5,200,5],\n", "]\n", " \n", "#Cut settings\n", - "clamp_index = [1]*1000 \n", + "clamp_index = [2,1.4] #linear variation of the index for clamping the gradient \n", "cut_overview = [8]*500 + [4]*500\n", "cut_innercut = [0]*500 + [4]*500\n", - "cut_ic_pow = .1\n", + "cut_ic_pow = .2\n", "cut_icgray_p = [.1]*300+[0]*1000\n", "cutn_batches = 1\n", - "range_index = [0]*300 + [0]*1000 \n", + "range_index = [0]*1000\n", "active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", - "tv_scales = [800]*1+[200]*3\n", + "tv_scales = [600]*1+[200]*3\n", "latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", "\n", "#If you uncomment next line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale basic setting will be used\n", @@ -1251,17 +1262,23 @@ "symmetric_loss_scale = 0 #Apply symmetric loss\n", "\n", "#Latent Diffusion Advanced Settings\n", - "scale_div = 0.5 # Use when latent upscale to correct satuation problem\n", - "opt_mag_mul = 10 #Magnify grad before clamping\n", + "scale_div = 1 # Use when latent upscale to correct satuation problem\n", + "opt_mag_mul = 15 #Magnify grad before clamping\n", "#PLMS Currently not working, working on a fix\n", "#opt.plms = False #Won;=t work with clip guidance\n", - "opt_ddim_eta, opt_eta_end = [1.5,1] # linear variation of eta\n", + "opt_ddim_eta, opt_eta_end = [1.6,1] # linear variation of eta\n", "opt_temperature = .95\n", "\n", "#Grad advanced settings\n", "grad_center = False\n", - "grad_scale= 0.1 #5 Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", - "anti_jpg = 0 #not working\n", + "grad_scale= 0.5 #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", + "\n", + "#Restraints the model from explodign despite larger clamp\n", + "score_modifier = True\n", + "threshold_percentile = .9\n", + "threshold = 1.2\n", + "var_index = [2]*1000\n", + "\n", "\n", "#Init image advanced settings\n", "init_rotate, mask_rotate=[False, False]\n", @@ -1273,12 +1290,6 @@ "flip_aug=False\n", "cutout_debug = False\n", "opt.outdir = outputs_path\n", - "prior_weight = 0\n", - "score_modifier = True\n", - "threshold_percentile = .90\n", - "threshold = 1.2\n", - "\n", - "var_index = [0]*1000\n", "\n", "#Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", "experimental_aesthetic_embeddings = False\n", @@ -1287,14 +1298,14 @@ "#9 are good aesthetic embeddings, 0 are bad ones\n", "experimental_aesthetic_embeddings_score = 9\n", "\n", - "# For fun dont change execpt if you really know what your are doing\n", + "# For fun dont change except if you 
really know what your are doing\n", "cut_blur_n = 0\n", "cut_blur_kernel = 3\n", "grad_blur = 0\n", - "compress_steps = 0\n", - "compress_factor = 0.025\n", - "punish_steps = 0\n", - "punish_factor = 0.9" + "compress_steps = 200\n", + "compress_factor = 0.1\n", + "punish_steps = 200\n", + "punish_factor = 0.8" ] }, { @@ -1359,8 +1370,8 @@ "#@markdown We're still figuring out default settings. Experiment and share your settings with us\n", "width = 256#@param{type: 'integer'}\n", "height = 256#@param{type: 'integer'}\n", - "latent_diffusion_guidance_scale = 2.5 #@param {type:\"number\"}\n", - "clip_guidance_scale = 7500 #@param{type: 'integer'}\n", + "latent_diffusion_guidance_scale = 4 #@param {type:\"number\"}\n", + "clip_guidance_scale = 5000#@param{type: 'integer'}\n", "how_many_batches = 1 #@param{type: 'integer'}\n", "aesthetic_loss_scale = 200 #@param{type: 'integer'}\n", "augment_cuts=True #@param{type:'boolean'}\n", @@ -1454,15 +1465,20 @@ "#opt.W, opt.H = [width,height]\n", "opt.scale = latent_diffusion_guidance_scale\n", "aug = augment_cuts\n", + "#Checks if it's not a normal schedule (legacy purposes to keep old configs compatible)\n", + "if(len(clamp_index) == 2): \n", + " clamp_index_variation = np.linspace(clamp_index[0],clamp_index[1],1000) \n", "\n", + "else:\n", + " clamp_index_variation = clamp_index\n", "score_corrector = DotMap()\n", "\n", "def modify_score(e_t, e_t_uncond):\n", " # print(e_t.abs().mean(),e_t.var())\n", - " global dynamic_decode\n", - " dynamic_decode = False\n", + " #global dynamic_decode\n", + " #dynamic_decode = False\n", " if e_t.var() >= 1 and score_modifier:\n", - " dynamic_decode = True\n", + " #dynamic_decode = True\n", " e_t = (e_t - e_t_uncond)/e_t.var() + e_t_uncond\n", " return(e_t)\n", "score_corrector.modify_score = modify_score\n", @@ -1533,7 +1549,7 @@ "xEVSOJ4f0B21" ], "machine_shape": "hm", - "name": "Latent Majesty Diffusion v1.3", + "name": "Latent Majesty Diffusion v1.4", "private_outputs": true, 
"provenance": [] }, @@ -1547,4 +1563,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file From 3295e09754e55de462cf1ba0aedf353b50c592cd Mon Sep 17 00:00:00 2001 From: apolinario Date: Sun, 5 Jun 2022 08:51:09 +0200 Subject: [PATCH 05/13] Attempt fix purge --- latent.ipynb | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/latent.ipynb b/latent.ipynb index 32750a6..47be834 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -837,8 +837,12 @@ "aesthetic_model_32 = torch.nn.Linear(512,1).cuda()\n", "aesthetic_model_32.load_state_dict(torch.load(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"))\n", "\n", - "\n", + "has_purged = False\n", "def do_run():\n", + " if(has_purged):\n", + " global clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", + " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", + " has_purged = False\n", " # with torch.cuda.amp.autocast():\n", " global progress,target_embeds, weights, zero_embed, init, scale_factor, cur_step\n", " cur_step = 0\n", @@ -1017,9 +1021,9 @@ " x_T = (model.first_stage_model.encode(init).sample()*init_magnitude)\n", " x_T += noise_like(x_T.shape,device,False)*init_noise\n", " x_T = x_T.clamp(-6,6)\n", - " \n", - " \n", " if method == \"purge\":\n", + " global has_purged\n", + " has_purged = True\n", " for i in scale_factor.split(\",\"):\n", " if i in clip_list:\n", " arch, pub, m_id = i[1:-1].split(' - ')\n", @@ -1093,7 +1097,7 @@ "#@markdown #### Open AI CLIP models\n", "ViT_B32 = False #@param {type:\"boolean\"}\n", "ViT_B16 = True #@param {type:\"boolean\"}\n", - "ViT_L14 = False #@param {type:\"boolean\"}\n", + "ViT_L14 = True #@param {type:\"boolean\"}\n", "ViT_L14_336px = False #@param {type:\"boolean\"}\n", "#RN101 = False #@param {type:\"boolean\"}\n", "#RN50 = False #@param {type:\"boolean\"}\n", @@ -1209,7 +1213,7 @@ " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", "\n", 
"clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", - "\n", + "clip_load_list_universal = clip_load_list\n", "torch.cuda.empty_cache()\n", "gc.collect()" ] @@ -1563,4 +1567,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From 7df9bf1837b85270c9372fd34e1780482673e4ff Mon Sep 17 00:00:00 2001 From: apolinario Date: Mon, 6 Jun 2022 00:57:41 +0200 Subject: [PATCH 06/13] Add modified dynamic thresholding --- README.md | 10 +- latent.ipynb | 53 +- previous_versions/latent_v1.2.ipynb | 1460 +++++++++++++++++++++++++++ 3 files changed, 1503 insertions(+), 20 deletions(-) create mode 100644 previous_versions/latent_v1.2.ipynb diff --git a/README.md b/README.md index ceb5647..72e92cf 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,12 @@ Current implementations: - [V-Majesty Diffusion](#v-majesty-diffusion-v12) -## Latent Majesty Diffusion v1.3 +## Latent Majesty Diffusion v1.4 ##### Formerly known as Latent Princess Generator [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/multimodalart/MajestyDiffusion/blob/main/latent.ipynb) A [Dango233](https://github.com/Dango233) and [apolinario (@multimodalart)](https://github.com/multimodalart) Colab notebook implementing [CompVis](https://github.com/CompVis)' Latent Diffusion, with the following changes: +v1.2 - Added [Dango233](https://github.com/Dango233) CLIP Guidance - Added [Dango233](https://github.com/Dango233) magical **new** step and upscaling scheduling - Added [Dango233](https://github.com/Dango233) cuts, augs and attributes scheduling @@ -31,6 +32,13 @@ A [Dango233](https://github.com/Dango233) and [apolinario (@multimodalart)](http - Added [LAION-AI](https://github.com/LAION-AI/aesthetic-predictor) aesthetic predictor embeddings - Added [Dango233](https://github.com/Dango233) inpainting mode - Added [apolinario (@multimodalart)](https://github.com/multimodalart) savable 
settings and setting library (including `colab-free-default`, `dango233-princesses`, `the-other-zippy` and `makaitrad` shared settings. Share yours with us too with a pull request! +v1.3 + - Better Upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion)) +v1.4 + - Added [Dango233](https://github.com/Dango233) Customised Dynamic Thresholding + - Added [open_clip](https://github.com/mlfoundations/open_clip) ViT-L/14 LAION-400M trained + - Fix CLOOB perceptor from MMC + - Removes latent upscaler (was broken), adds RGB upscaler ## V-Majesty Diffusion v1.2 ##### Formerly known as Princess Generator ver. Victoria diff --git a/latent.ipynb b/latent.ipynb index 47be834..8535d36 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -396,7 +396,8 @@ " def __init__(self, cut_size,\n", " Overview=4, \n", " WholeCrop = 0, WC_Allowance = 10, WC_Grey_P=0.2,\n", - " InnerCrop = 0, IC_Size_Pow=0.5, IC_Grey_P = 0.2\n", + " InnerCrop = 0, IC_Size_Pow=0.5, IC_Grey_P = 0.2,\n", + " cut_blur_n = 0\n", " ):\n", " super().__init__()\n", " self.cut_size = cut_size\n", @@ -407,6 +408,7 @@ " self.InnerCrop = InnerCrop\n", " self.IC_Size_Pow = IC_Size_Pow\n", " self.IC_Grey_P = IC_Grey_P\n", + " self.cut_blur_n = cut_blur_n\n", " self.augs = T.Compose([\n", " #T.RandomHorizontalFlip(p=0.5),\n", " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", @@ -441,7 +443,7 @@ " pad_input = pad_input.repeat(input.shape[0],1,1,1)\n", " cutout = resize(pad_input, out_shape=output_shape_all)\n", " if aug: cutout=self.augs(cutout)\n", - " if cut_blur_n > 0: cutout[0:cut_blur_n,:,:,:] = TF.gaussian_blur(cutout[0:cut_blur_n,:,:,:],cut_blur_kernel)\n", + " if self.cut_blur_n > 0: cutout[0:self.cut_blur_n,:,:,:] = TF.gaussian_blur(cutout[0:self.cut_blur_n,:,:,:],cut_blur_kernel)\n", " cutouts_list.append(cutout)\n", " \n", " if self.InnerCrop >0:\n", @@ -681,7 +683,7 @@ " if t<= compress_steps:\n", " image = image / (ths/threshold)**compress_factor\n", " image += 
noise_like(image.shape,device,False) * ((ths/threshold)**compress_factor - 1)\n", - " return(image) \n", + " return(image) \n", "\n", "def make_schedule(t_start, t_end, step_size=1):\n", " schedule = []\n", @@ -1249,15 +1251,16 @@ "]\n", " \n", "#Cut settings\n", - "clamp_index = [2,1.4] #linear variation of the index for clamping the gradient \n", + "clamp_index = [2,1] #linear variation of the index for clamping the gradient \n", "cut_overview = [8]*500 + [4]*500\n", "cut_innercut = [0]*500 + [4]*500\n", "cut_ic_pow = .2\n", "cut_icgray_p = [.1]*300+[0]*1000\n", "cutn_batches = 1\n", + "cut_blur_n = [0]*400 + [0]*600\n", "range_index = [0]*1000\n", "active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", - "tv_scales = [600]*1+[200]*3\n", + "tv_scales = [800]*1+[100]*3\n", "latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", "\n", "#If you uncomment next line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale basic setting will be used\n", @@ -1281,7 +1284,7 @@ "score_modifier = True\n", "threshold_percentile = .9\n", "threshold = 1.2\n", - "var_index = [2]*1000\n", + "var_index = [0]*1000\n", "\n", "\n", "#Init image advanced settings\n", @@ -1305,7 +1308,7 @@ "# For fun dont change except if you really know what your are doing\n", "cut_blur_n = 0\n", "cut_blur_kernel = 3\n", - "grad_blur = 0\n", + "grad_blur = False\n", "compress_steps = 200\n", "compress_factor = 0.1\n", "punish_steps = 200\n", @@ -1339,13 +1342,13 @@ "source": [ "#Amp up your prompt game with prompt engineering, check out this guide: https://matthewmcateer.me/blog/clip-prompt-engineering/\n", "#Prompt for CLIP Guidance\n", - "clip_prompts = [\"portrait of a princess in sanctuary, hyperrealistic painting trending on artstation\"]\n", + "clip_prompts =[\"portrait of a Majestic Princess, trending on artstation\"] \n", "\n", "#Prompt for Latent Diffusion\n", - "latent_prompts = [\"portrait of a princess in sanctuary, hyperrealistic painting trending on artstation\"]\n", + "latent_prompts = [\"portrait of a Majestic Princess, trending on artstation\"] \n", "\n", "#Negative prompts for Latent Diffusion\n", - "latent_negatives = [\"low quality image\"]\n", + "latent_negatives = [\"\"]\n", "\n", "image_prompts = []" ] @@ -1374,10 +1377,10 @@ "#@markdown We're still figuring out default settings. 
Experiment and share your settings with us\n", "width = 256#@param{type: 'integer'}\n", "height = 256#@param{type: 'integer'}\n", - "latent_diffusion_guidance_scale = 4 #@param {type:\"number\"}\n", + "latent_diffusion_guidance_scale = 15 #@param {type:\"number\"}\n", "clip_guidance_scale = 5000#@param{type: 'integer'}\n", "how_many_batches = 1 #@param{type: 'integer'}\n", - "aesthetic_loss_scale = 200 #@param{type: 'integer'}\n", + "aesthetic_loss_scale = 400 #@param{type: 'integer'}\n", "augment_cuts=True #@param{type:'boolean'}\n", "\n", "#@markdown\n", @@ -1477,14 +1480,26 @@ " clamp_index_variation = clamp_index\n", "score_corrector = DotMap()\n", "\n", + "\n", "def modify_score(e_t, e_t_uncond):\n", - " # print(e_t.abs().mean(),e_t.var())\n", - " #global dynamic_decode\n", - " #dynamic_decode = False\n", - " if e_t.var() >= 1 and score_modifier:\n", - " #dynamic_decode = True\n", - " e_t = (e_t - e_t_uncond)/e_t.var() + e_t_uncond\n", - " return(e_t)\n", + " if(!score_modifier):\n", + " return e_t\n", + " else:\n", + " e_t_d = (e_t - e_t_uncond)\n", + " s = torch.quantile(\n", + " rearrange(e_t_d, 'b ... 
-> b (...)').abs().float(),\n", + " threshold_percentile,\n", + " dim = -1\n", + " )\n", + "\n", + " s.clamp_(min = 1.)\n", + " s = s.view(-1, *((1,) * (e_t_d.ndim - 1)))\n", + " e_t_d = F.softsign(e_t_d) / s / 3\n", + " \n", + " #e_t_d = e_t_d.clamp(-s,s) / s\n", + " e_t = e_t_uncond + e_t_d\n", + " return(e_t)\n", + "\n", "score_corrector.modify_score = modify_score\n", "\n", "torch.cuda.empty_cache()\n", diff --git a/previous_versions/latent_v1.2.ipynb b/previous_versions/latent_v1.2.ipynb new file mode 100644 index 0000000..8e06f7a --- /dev/null +++ b/previous_versions/latent_v1.2.ipynb @@ -0,0 +1,1460 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NUmmV5ZvrPbP" + }, + "source": [ + "# Latent Majesty Diffusion v1.3\n", + "#### Formerly known as Princess Generator\n", + "##### Access our [Majestic Guide](https://multimodal.art/majesty-diffusion) (_under construction_), our [GitHub](https://github.com/multimodalart/majesty-diffusion), join our community on [Discord](https://discord.gg/yNBtQBEDfZ) or reach out via [@multimodalart on Twitter](https://twitter.com/multimodalart))\n", + "\\\n", + " \n", + "---\n", + "\\\n", + "\n", + "\n", + "#### CLIP Guided Latent Diffusion by [dango233](https://github.com/Dango233/) and [apolinario (@multimodalart)](https://twitter.com/multimodalart). \n", + "The LAION-400M-trained model and the modified inference code are from [CompVis Latent Diffusion](https://github.com/CompVis/latent-diffusion). The guided-diffusion method is modified by Dango233 based on [Katherine Crowson](https://twitter.com/RiversHaveWings)'s guided diffusion notebook. multimodalart savable settings, MMC and assembled the Colab. Check the complete list on our GitHub. 
Some functions and methods are from various code masters (nsheppard, DanielRussRuss and others)\n", + "\n", + "Changelog: 1.3 - better upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uWLsDt7wkZfU" + }, + "source": [ + "## Save model and outputs on Google Drive? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "aJF6wP2zkWE_" + }, + "outputs": [], + "source": [ + "#@markdown Enable saving outputs to Google Drive to save your creations at AI/models\n", + "save_outputs_to_google_drive = True #@param {type:\"boolean\"}\n", + "#@markdown Enable saving models to Google Drive to avoid downloading the model every Colab instance\n", + "save_models_to_google_drive = True #@param {type:\"boolean\"}\n", + "\n", + "if save_outputs_to_google_drive or save_models_to_google_drive:\n", + " from google.colab import drive\n", + " try:\n", + " drive.mount('/content/gdrive')\n", + " except:\n", + " save_outputs_to_google_drive = False\n", + " save_models_to_google_drive = False\n", + "\n", + "model_path = \"/content/gdrive/MyDrive/AI/models\" if save_models_to_google_drive else \"/content/\"\n", + "outputs_path = \"/content/gdrive/MyDrive/AI/latent_majesty_diffusion\" if save_outputs_to_google_drive else \"/content/outputs\"\n", + "!mkdir -p $model_path\n", + "!mkdir -p $outputs_path\n", + "print(f\"Model will be stored at {model_path}\")\n", + "print(f\"Outputs will be saved to {outputs_path}\")\n", + "\n", + "#If you want to run it locally change it to true\n", + "is_local = False\n", + "skip_installs = False\n", + "if(is_local):\n", + " model_path = \"/choose/your/local/model/path\"\n", + " outputs_path = \"/choose/your/local/outputs/path\"\n", + " skip_installs = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "5Fxt-5TaYBs2" + }, + "outputs": 
[], + "source": [ + "#@title Model settings\n", + "#@markdown The `original` model is the model trained by CompVis in the LAION-400M dataset\n", + "#@markdown
The `finetuned` model is a finetune of the `original` model by Jack000 that generates less watermarks, but is a bit worse in text synthesis. Colab Free does not have enough run for the finetuned (for now)\n", + "latent_diffusion_model = 'original' #@param [\"original\", \"finetuned\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xEVSOJ4f0B21" + }, + "source": [ + "# Setup stuff" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "NHgUAp48qwoG" + }, + "outputs": [], + "source": [ + "#@title Installation\n", + "if(not skip_installs):\n", + " import subprocess\n", + " nvidiasmi_output = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE).stdout.decode('utf-8')\n", + " cards_requiring_downgrade = [\"Tesla T4\", \"V100\"]\n", + " if any(cardstr in nvidiasmi_output for cardstr in cards_requiring_downgrade):\n", + " downgrade_pytorch_result = subprocess.run(['pip', 'install', 'torch==1.10.2', 'torchvision==0.11.3', '-q'], stdout=subprocess.PIPE).stdout.decode('utf-8')\n", + " import sys\n", + " sys.path.append(\".\")\n", + " !git clone https://github.com/multimodalart/latent-diffusion\n", + " !git clone https://github.com/CompVis/taming-transformers\n", + " !git clone https://github.com/TencentARC/GFPGAN\n", + " !git clone https://github.com/multimodalart/majesty-diffusion\n", + " !git lfs clone https://github.com/LAION-AI/aesthetic-predictor\n", + " !pip install -e ./taming-transformers\n", + " !pip install omegaconf>=2.0.0 pytorch-lightning>=1.0.8 torch-fidelity einops\n", + " !pip install transformers\n", + " !pip install dotmap\n", + " !pip install resize-right\n", + " !pip install piq\n", + " !pip install lpips\n", + " !pip install basicsr\n", + " !pip install facexlib\n", + " !pip install realesrgan\n", + "\n", + " sys.path.append('./taming-transformers')\n", + " from taming.models import vqgan\n", + " from subprocess import Popen, PIPE\n", + " try:\n", + " import mmc\n", + " 
except:\n", + " # install mmc\n", + " !git clone https://github.com/apolinario/Multi-Modal-Comparators --branch gradient_checkpointing\n", + " !pip install poetry\n", + " !cd Multi-Modal-Comparators; poetry build\n", + " !cd Multi-Modal-Comparators; pip install dist/mmc*.whl\n", + " \n", + " # optional final step:\n", + " #poe napm_installs\n", + " !python Multi-Modal-Comparators/src/mmc/napm_installs/__init__.py\n", + " # suppress mmc warmup outputs\n", + " import mmc.loaders" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fNqCqQDoyZmq" + }, + "source": [ + "Now, download the checkpoint (~5.7 GB). This will usually take 3-6 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "cNHvQBhzyXCI" + }, + "outputs": [], + "source": [ + "#@title Download models\n", + "import os\n", + "if os.path.isfile(f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\"):\n", + " print(\"Using Latent Diffusion model saved from Google Drive\")\n", + "else: \n", + " !wget -O $model_path/latent_diffusion_txt2img_f8_large.ckpt https://ommer-lab.com/files/latent-diffusion/nitro/txt2img-f8-large/model.ckpt --no-check-certificate\n", + "\n", + "if os.path.isfile(f\"{model_path}/finetuned_state_dict.pt\"):\n", + " print(\"Using Latent Diffusion model saved from Google Drive\")\n", + "else: \n", + " !wget -O $model_path/finetuned_state_dict.pt https://huggingface.co/multimodalart/compvis-latent-diffusion-text2img-large/resolve/main/finetuned_state_dict.pt --no-check-certificate\n", + "\n", + "if os.path.isfile(f\"{model_path}/ava_vit_l_14_336_linear.pth\"):\n", + " print(\"Using ViT-L/14@336px aesthetic model from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/ava_vit_l_14_336_linear.pth https://multimodal.art/models/ava_vit_l_14_336_linear.pth\n", + "\n", + "if os.path.isfile(f\"{model_path}/sa_0_4_vit_l_14_linear.pth\"):\n", + " print(\"Using ViT-L/14 aesthetic model from Google Drive\")\n", + 
"else:\n", + " !wget -O $model_path/sa_0_4_vit_l_14_linear.pth https://multimodal.art/models/sa_0_4_vit_l_14_linear.pth\n", + "\n", + "if os.path.isfile(f\"{model_path}/ava_vit_l_14_linear.pth\"):\n", + " print(\"Using ViT-L/14 aesthetic model from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/ava_vit_l_14_linear.pth https://multimodal.art/models/ava_vit_l_14_linear.pth\n", + "\n", + "if os.path.isfile(f\"{model_path}/ava_vit_b_16_linear.pth\"):\n", + " print(\"Using ViT-B/16 aesthetic model from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/ava_vit_b_16_linear.pth http://batbot.tv/ai/models/v-diffusion/ava_vit_b_16_linear.pth\n", + "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_16_linear.pth\"):\n", + " print(\"Using ViT-B/16 sa aesthetic model already saved\")\n", + "else:\n", + " !wget -O $model_path/sa_0_4_vit_b_32_linear.pth https://multimodal.art/models/sa_0_4_vit_b_16_linear.pth\n", + "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"):\n", + " print(\"Using ViT-B/32 aesthetic model from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/sa_0_4_vit_b_32_linear.pth https://multimodal.art/models/sa_0_4_vit_b_32_linear.pth\n", + "if os.path.isfile(f\"{model_path}/openimages_512x_png_embed224.npz\"):\n", + " print(\"Using openimages png from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/openimages_512x_png_embed224.npz https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/openimages_512x_png_embed224.npz\n", + "if os.path.isfile(f\"{model_path}/imagenet_512x_jpg_embed224.npz\"):\n", + " print(\"Using imagenet antijpeg from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/imagenet_512x_jpg_embed224.npz https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/imagenet_512x_jpg_embed224.npz\n", + "if os.path.isfile(f\"{model_path}/GFPGANv1.3.pth\"):\n", + " print(\"Using GFPGAN v1.3 from Google Drive\")\n", + 
"else:\n", + " !wget -O $model_path/GFPGANv1.3.pth https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth\n", + "!cp $model_path/GFPGANv1.3.pth GFPGAN/experiments/pretrained_models/GFPGANv1.3.pth\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ThxmCePqt1mt" + }, + "source": [ + "Let's also check what type of GPU we've got." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jbL2zJ7Pt7Jl" + }, + "outputs": [], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "BPnyd-XUKbfE" + }, + "outputs": [], + "source": [ + "#@title Import stuff\n", + "import argparse, os, sys, glob\n", + "import torch\n", + "import numpy as np\n", + "from omegaconf import OmegaConf\n", + "from PIL import Image\n", + "from tqdm.auto import tqdm, trange\n", + "tqdm_auto_model = __import__(\"tqdm.auto\", fromlist=[None]) \n", + "sys.modules['tqdm'] = tqdm_auto_model\n", + "from einops import rearrange\n", + "from torchvision.utils import make_grid\n", + "import transformers\n", + "import gc\n", + "sys.path.append('./latent-diffusion')\n", + "from ldm.util import instantiate_from_config\n", + "from ldm.models.diffusion.ddim import DDIMSampler\n", + "from ldm.models.diffusion.plms import PLMSSampler\n", + "import tensorflow as tf\n", + "from dotmap import DotMap\n", + "import ipywidgets as widgets\n", + "from math import pi\n", + "\n", + "from subprocess import Popen, PIPE\n", + "\n", + "from dataclasses import dataclass\n", + "from functools import partial\n", + "import gc\n", + "import io\n", + "import math\n", + "import sys\n", + "import random\n", + "from piq import brisque\n", + "from itertools import product\n", + "from IPython import display\n", + "import lpips\n", + "from PIL import Image, ImageOps\n", + "import requests\n", + "import torch\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "from 
torchvision import models\n", + "from torchvision import transforms\n", + "from torchvision import transforms as T\n", + "from torchvision.transforms import functional as TF\n", + "from numpy import nan\n", + "from threading import Thread\n", + "import time\n", + "\n", + "#sys.path.append('../CLIP')\n", + "#Resizeright for better gradient when resizing\n", + "#sys.path.append('../ResizeRight/')\n", + "#sys.path.append('../cloob-training/')\n", + "\n", + "from resize_right import resize\n", + "\n", + "import clip\n", + "#from cloob_training import model_pt, pretrained\n", + "\n", + "#pretrained.list_configs()\n", + "from torch.utils.tensorboard import SummaryWriter\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "twG4nxYCrI8F" + }, + "outputs": [], + "source": [ + "#@title Load the model\n", + "torch.backends.cudnn.benchmark = True\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "def load_model_from_config(config, ckpt, verbose=False, latent_diffusion_model=\"original\"):\n", + " print(f\"Loading model from {ckpt}\")\n", + " print(latent_diffusion_model)\n", + " model = instantiate_from_config(config.model)\n", + " sd = torch.load(ckpt, map_location=\"cuda\")[\"state_dict\"]\n", + " m, u = model.load_state_dict(sd, strict = False)\n", + " if(latent_diffusion_model == \"finetuned\"): \n", + " del sd\n", + " sd_finetune = torch.load(f\"{model_path}/finetuned_state_dict.pt\",map_location=\"cuda\")\n", + " m, u = model.model.load_state_dict(sd_finetune, strict = False)\n", + " model.model = model.model.half().eval().to(device)\n", + " del sd_finetune\n", + " # sd = pl_sd[\"state_dict\"]\n", + " \n", + " if len(m) > 0 and verbose:\n", + " print(\"missing keys:\")\n", + " print(m)\n", + " if len(u) > 0 and verbose:\n", + " print(\"unexpected keys:\")\n", + " print(u)\n", + "\n", + " model.requires_grad_(False).half().eval().to('cuda')\n", + " return model\n", + "\n", + "config = 
OmegaConf.load(\"./latent-diffusion/configs/latent-diffusion/txt2img-1p4B-eval.yaml\") # TODO: Optionally download from same location as ckpt and chnage this logic\n", + "model = load_model_from_config(config, f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\",False, latent_diffusion_model) # TODO: check path\n", + "model = model.half().eval().to(device)\n", + "#if(latent_diffusion_model == \"finetuned\"):\n", + "# model.model = model.model.half().eval().to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "HY_7vvnPThzS" + }, + "outputs": [], + "source": [ + "#@title Load necessary functions\n", + "def set_custom_schedules(schedule):\n", + " custom_schedules = []\n", + " for schedule_item in schedule:\n", + " if(isinstance(schedule_item,list)):\n", + " custom_schedules.append(np.arange(*schedule_item))\n", + " else:\n", + " custom_schedules.append(schedule_item)\n", + " \n", + " return custom_schedules\n", + "\n", + "def parse_prompt(prompt):\n", + " if prompt.startswith('http://') or prompt.startswith('https://') or prompt.startswith(\"E:\") or prompt.startswith(\"C:\") or prompt.startswith(\"D:\"):\n", + " vals = prompt.rsplit(':', 2)\n", + " vals = [vals[0] + ':' + vals[1], *vals[2:]]\n", + " else:\n", + " vals = prompt.rsplit(':', 1)\n", + " vals = vals + ['', '1'][len(vals):]\n", + " return vals[0], float(vals[1])\n", + "\n", + "\n", + "class MakeCutouts(nn.Module):\n", + " def __init__(self, cut_size,\n", + " Overview=4, \n", + " WholeCrop = 0, WC_Allowance = 10, WC_Grey_P=0.2,\n", + " InnerCrop = 0, IC_Size_Pow=0.5, IC_Grey_P = 0.2\n", + " ):\n", + " super().__init__()\n", + " self.cut_size = cut_size\n", + " self.Overview = Overview\n", + " self.WholeCrop= WholeCrop\n", + " self.WC_Allowance = WC_Allowance\n", + " self.WC_Grey_P = WC_Grey_P\n", + " self.InnerCrop = InnerCrop\n", + " self.IC_Size_Pow = IC_Size_Pow\n", + " self.IC_Grey_P = IC_Grey_P\n", + " self.augs = T.Compose([\n", + 
" #T.RandomHorizontalFlip(p=0.5),\n", + " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", + " T.RandomAffine(degrees=0, \n", + " translate=(0.05, 0.05), \n", + " #scale=(0.9,0.95),\n", + " fill=-1, interpolation = T.InterpolationMode.BILINEAR, ),\n", + " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", + " #T.RandomPerspective(p=1, interpolation = T.InterpolationMode.BILINEAR, fill=-1,distortion_scale=0.2),\n", + " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", + " T.RandomGrayscale(p=0.1),\n", + " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", + " T.ColorJitter(brightness=0.05, contrast=0.05, saturation=0.05),\n", + " ])\n", + "\n", + " def forward(self, input):\n", + " gray = transforms.Grayscale(3)\n", + " sideY, sideX = input.shape[2:4]\n", + " max_size = min(sideX, sideY)\n", + " min_size = min(sideX, sideY, self.cut_size)\n", + " l_size = max(sideX, sideY)\n", + " output_shape = [input.shape[0],3,self.cut_size,self.cut_size] \n", + " output_shape_2 = [input.shape[0],3,self.cut_size+2,self.cut_size+2]\n", + " pad_input = F.pad(input,((sideY-max_size)//2+round(max_size*0.055),(sideY-max_size)//2+round(max_size*0.055),(sideX-max_size)//2+round(max_size*0.055),(sideX-max_size)//2+round(max_size*0.055)), **padargs)\n", + " cutouts_list = []\n", + " \n", + " if self.Overview>0:\n", + " cutouts = []\n", + " cutout = resize(pad_input, out_shape=output_shape, antialiasing=True)\n", + " output_shape_all = list(output_shape)\n", + " output_shape_all[0]=self.Overview*input.shape[0]\n", + " pad_input = pad_input.repeat(input.shape[0],1,1,1)\n", + " cutout = resize(pad_input, out_shape=output_shape_all)\n", + " if aug: cutout=self.augs(cutout)\n", + " cutouts_list.append(cutout)\n", + " \n", + " if self.InnerCrop >0:\n", + " cutouts=[]\n", + " for i in range(self.InnerCrop):\n", + " size = int(torch.rand([])**self.IC_Size_Pow * (max_size - min_size) + min_size)\n", + " offsetx = torch.randint(0, sideX - size + 1, ())\n", + " offsety = 
torch.randint(0, sideY - size + 1, ())\n", + " cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size]\n", + " if i <= int(self.IC_Grey_P * self.InnerCrop):\n", + " cutout = gray(cutout)\n", + " cutout = resize(cutout, out_shape=output_shape)\n", + " cutouts.append(cutout)\n", + " if cutout_debug:\n", + " TF.to_pil_image(cutouts[-1].add(1).div(2).clamp(0, 1).squeeze(0)).save(\"content/diff/cutouts/cutout_InnerCrop.jpg\",quality=99)\n", + " cutouts_tensor = torch.cat(cutouts)\n", + " cutouts=[]\n", + " cutouts_list.append(cutouts_tensor)\n", + " cutouts=torch.cat(cutouts_list)\n", + " return cutouts\n", + "\n", + "\n", + "def spherical_dist_loss(x, y):\n", + " x = F.normalize(x, dim=-1)\n", + " y = F.normalize(y, dim=-1)\n", + " return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)\n", + "\n", + "\n", + "def tv_loss(input):\n", + " \"\"\"L2 total variation loss, as in Mahendran et al.\"\"\"\n", + " input = F.pad(input, (0, 1, 0, 1), 'replicate')\n", + " x_diff = input[..., :-1, 1:] - input[..., :-1, :-1]\n", + " y_diff = input[..., 1:, :-1] - input[..., :-1, :-1]\n", + " return (x_diff**2 + y_diff**2).mean([1, 2, 3])\n", + "\n", + "\n", + "def range_loss(input, range_min, range_max):\n", + " return (input - input.clamp(range_min,range_max)).pow(2).mean([1, 2, 3])\n", + "\n", + "def symmetric_loss(x):\n", + " w = x.shape[3]\n", + " diff = (x - torch.flip(x,[3])).square().mean().sqrt()/(x.shape[2]*x.shape[3]/1e4)\n", + " return(diff)\n", + "\n", + "def fetch(url_or_path):\n", + " \"\"\"Fetches a file from an HTTP or HTTPS url, or opens the local file.\"\"\"\n", + " if str(url_or_path).startswith('http://') or str(url_or_path).startswith('https://'):\n", + " r = requests.get(url_or_path)\n", + " r.raise_for_status()\n", + " fd = io.BytesIO()\n", + " fd.write(r.content)\n", + " fd.seek(0)\n", + " return fd\n", + " return open(url_or_path, 'rb')\n", + "\n", + "\n", + "def to_pil_image(x):\n", + " \"\"\"Converts from a tensor to a PIL image.\"\"\"\n", + " 
if x.ndim == 4:\n", + " assert x.shape[0] == 1\n", + " x = x[0]\n", + " if x.shape[0] == 1:\n", + " x = x[0]\n", + " return TF.to_pil_image((x.clamp(-1, 1) + 1) / 2)\n", + "\n", + "\n", + "normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],\n", + " std=[0.26862954, 0.26130258, 0.27577711])\n", + "\n", + "def centralized_grad(x, use_gc=True, gc_conv_only=False):\n", + " if use_gc:\n", + " if gc_conv_only:\n", + " if len(list(x.size())) > 3:\n", + " x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))\n", + " else:\n", + " if len(list(x.size())) > 1:\n", + " x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))\n", + " return x\n", + "\n", + "def cond_fn(x, t):\n", + " t=1000-t\n", + " t=t[0]\n", + " with torch.enable_grad():\n", + " global clamp_start_, clamp_max\n", + " x = x.detach()\n", + " x = x.requires_grad_()\n", + " x_in = model.decode_first_stage(x)\n", + " display_handler(x_in,t,1,False)\n", + " n = x_in.shape[0]\n", + " clip_guidance_scale = clip_guidance_index[t]\n", + " make_cutouts = {}\n", + " #rx_in_grad = torch.zeros_like(x_in)\n", + " for i in clip_list:\n", + " make_cutouts[i] = MakeCutouts(clip_size[i],\n", + " Overview= cut_overview[t], \n", + " InnerCrop = cut_innercut[t], \n", + " IC_Size_Pow=cut_ic_pow, IC_Grey_P = cut_icgray_p[t]\n", + " )\n", + " cutn = cut_overview[t]+cut_innercut[t]\n", + " for j in range(cutn_batches):\n", + " losses=0\n", + " for i in clip_list:\n", + " clip_in = clip_normalize[i](make_cutouts[i](x_in.add(1).div(2)).to(\"cuda\"))\n", + " image_embeds = clip_model[i].encode_image(clip_in).float().unsqueeze(0).expand([target_embeds[i].shape[0],-1,-1])\n", + " target_embeds_temp = target_embeds[i]\n", + " if i == 'ViT-B-32--openai' and experimental_aesthetic_embeddings:\n", + " aesthetic_embedding = torch.from_numpy(np.load(f'aesthetic-predictor/vit_b_32_embeddings/rating{experimental_aesthetic_embeddings_score}.npy')).to(device) \n", + " aesthetic_query = 
target_embeds_temp + aesthetic_embedding * experimental_aesthetic_embeddings_weight\n", + " target_embeds_temp = (aesthetic_query) / torch.linalg.norm(aesthetic_query)\n", + " if i == 'ViT-L-14--openai' and experimental_aesthetic_embeddings:\n", + " aesthetic_embedding = torch.from_numpy(np.load(f'aesthetic-predictor/vit_l_14_embeddings/rating{experimental_aesthetic_embeddings_score}.npy')).to(device) \n", + " aesthetic_query = target_embeds_temp + aesthetic_embedding * experimental_aesthetic_embeddings_weight\n", + " target_embeds_temp = (aesthetic_query) / torch.linalg.norm(aesthetic_query)\n", + " target_embeds_temp = target_embeds_temp.unsqueeze(1).expand([-1,cutn*n,-1]) \n", + " dists = spherical_dist_loss(image_embeds, target_embeds_temp)\n", + " dists = dists.mean(1).mul(weights[i].squeeze()).mean()\n", + " losses+=dists*clip_guidance_scale * (2 if i in [\"ViT-L-14-336--openai\", \"RN50x64--openai\", \"ViT-B-32--laion2b_e16\"] else (.4 if \"cloob\" in i else 1))\n", + " if i == \"ViT-L-14-336--openai\" and aes_scale !=0:\n", + " aes_loss = (aesthetic_model_336(F.normalize(image_embeds, dim=-1))).mean() \n", + " losses -= aes_loss * aes_scale \n", + " if i == \"ViT-L-14--openai\" and aes_scale !=0:\n", + " aes_loss = (aesthetic_model_224(F.normalize(image_embeds, dim=-1))).mean() \n", + " losses -= aes_loss * aes_scale \n", + " if i == \"ViT-B-16--openai\" and aes_scale !=0:\n", + " aes_loss = (aesthetic_model_16(F.normalize(image_embeds, dim=-1))).mean() \n", + " losses -= aes_loss * aes_scale \n", + " if i == \"ViT-B-32--openai\" and aes_scale !=0:\n", + " aes_loss = (aesthetic_model_32(F.normalize(image_embeds, dim=-1))).mean()\n", + " losses -= aes_loss * aes_scale\n", + " #x_in_grad += torch.autograd.grad(losses, x_in)[0] / cutn_batches / len(clip_list)\n", + " #losses += dists\n", + " #losses = losses / len(clip_list) \n", + " #gc.collect()\n", + " \n", + " tv_losses = tv_loss(x).sum() * tv_scales[0] +\\\n", + " tv_loss(F.interpolate(x, scale_factor= 
1/2)).sum()* tv_scales[1] + \\\n", + " tv_loss(F.interpolate(x, scale_factor = 1/4)).sum()* tv_scales[2] + \\\n", + " tv_loss(F.interpolate(x, scale_factor = 1/8)).sum()* tv_scales[3] \n", + " range_scale= range_index[t]\n", + " range_losses = range_loss(x_in,RGB_min,RGB_max).sum() * range_scale\n", + " loss = tv_losses + range_losses + losses\n", + " #del losses\n", + " if symmetric_loss_scale != 0: loss += symmetric_loss(x_in) * symmetric_loss_scale\n", + " if init_image is not None and init_scale:\n", + " lpips_loss = (lpips_model(x_in, init) * init_scale).squeeze().mean()\n", + " #print(lpips_loss)\n", + " loss += lpips_loss\n", + " #loss_grad = torch.autograd.grad(loss, x_in, )[0]\n", + " #x_in_grad += loss_grad\n", + " #grad = -torch.autograd.grad(x_in, x, x_in_grad)[0]\n", + " loss.backward()\n", + " grad = -x.grad\n", + " grad = torch.nan_to_num(grad, nan=0.0, posinf=0, neginf=0)\n", + " if grad_center: grad = centralized_grad(grad, use_gc=True, gc_conv_only=False)\n", + " mag = grad.square().mean().sqrt()\n", + " if mag==0 or torch.isnan(mag):\n", + " print(\"ERROR\")\n", + " print(t)\n", + " return(grad)\n", + " if t>=0:\n", + " if active_function == \"softsign\":\n", + " grad = F.softsign(grad*grad_scale/mag)\n", + " if active_function == \"tanh\":\n", + " grad = (grad/mag*grad_scale).tanh()\n", + " if active_function==\"clamp\":\n", + " grad = grad.clamp(-mag*grad_scale*2,mag*grad_scale*2)\n", + " if grad.abs().max()>0:\n", + " grad=grad/grad.abs().max()*opt.mag_mul\n", + " magnitude = grad.square().mean().sqrt()\n", + " else:\n", + " return(grad)\n", + " clamp_max = clamp_index[t]\n", + " #print(magnitude, end = \"\\r\")\n", + " grad = grad* magnitude.clamp(max= clamp_max) /magnitude#0.2\n", + " grad = grad.detach()\n", + " return grad\n", + "\n", + "def null_fn(x_in):\n", + " return(torch.zeros_like(x_in))\n", + "\n", + "def display_handler(x,i,cadance = 5, decode = True):\n", + " global progress, image_grid, writer, img_tensor, im\n", + " img_tensor 
= x\n", + " if i%cadance==0:\n", + " if decode: \n", + " x = model.decode_first_stage(x)\n", + " grid = make_grid(torch.clamp((x+1.0)/2.0, min=0.0, max=1.0),round(x.shape[0]**0.5))\n", + " grid = 255. * rearrange(grid, 'c h w -> h w c').detach().cpu().numpy()\n", + " image_grid = grid.copy(order = \"C\") \n", + " with io.BytesIO() as output:\n", + " im = Image.fromarray(grid.astype(np.uint8))\n", + " im.save(output, format = \"PNG\")\n", + " progress.value = output.getvalue()\n", + " if generate_video:\n", + " im.save(p.stdin, 'PNG')\n", + "\n", + "\n", + " \n", + "def cond_clamp(image,t): \n", + " #if t >=0:\n", + " #mag=image.square().mean().sqrt()\n", + " #mag = (mag*cc).clamp(1.6,100)\n", + " image = image.clamp(-cc, cc)\n", + " image = torch.nan_to_num(image, nan=0.0, posinf=cc, neginf=-cc)\n", + " return(image)\n", + "\n", + "def make_schedule(t_start, t_end, step_size=1):\n", + " schedule = []\n", + " par_schedule = []\n", + " t = t_start\n", + " while t > t_end:\n", + " schedule.append(t)\n", + " t -= step_size\n", + " schedule.append(t_end)\n", + " return np.array(schedule)\n", + "\n", + "lpips_model = lpips.LPIPS(net='vgg').to(device)\n", + "\n", + "def list_mul_to_array(list_mul):\n", + " i = 0\n", + " mul_count = 0\n", + " mul_string = ''\n", + " full_list = list_mul\n", + " full_list_len = len(full_list)\n", + " for item in full_list:\n", + " if(i == 0):\n", + " last_item = item\n", + " if(item == last_item):\n", + " mul_count+=1\n", + " if(item != last_item or full_list_len == i+1):\n", + " mul_string = mul_string + f' [{last_item}]*{mul_count} +'\n", + " mul_count=1\n", + " last_item = item\n", + " i+=1\n", + " return(mul_string[1:-2])\n", + "\n", + "def generate_settings_file(add_prompts=False, add_dimensions=False):\n", + " \n", + " if(add_prompts):\n", + " prompts = f'''\n", + " clip_prompts = {clip_prompts}\n", + " latent_prompts = {latent_prompts}\n", + " latent_negatives = {latent_negatives}\n", + " image_prompts = {image_prompts}\n", + " 
'''\n", + " else:\n", + " prompts = ''\n", + "\n", + " if(add_dimensions):\n", + " dimensions = f'''width = {width}\n", + " height = {height}\n", + " '''\n", + " else:\n", + " dimensions = ''\n", + " settings = f'''\n", + " #This settings file can be loaded back to Latent Majesty Diffusion. If you like your setting consider sharing it to the settings library at https://github.com/multimodalart/MajestyDiffusion\n", + " [clip_list]\n", + " perceptors = {clip_load_list}\n", + " \n", + " [basic_settings]\n", + " #Perceptor things\n", + " {prompts}\n", + " {dimensions}\n", + " latent_diffusion_guidance_scale = {latent_diffusion_guidance_scale}\n", + " clip_guidance_scale = {clip_guidance_scale}\n", + " aesthetic_loss_scale = {aesthetic_loss_scale}\n", + " augment_cuts={augment_cuts}\n", + "\n", + " #Init image settings\n", + " starting_timestep = {starting_timestep}\n", + " init_scale = {init_scale} \n", + " init_brightness = {init_brightness}\n", + " init_noise = {init_noise}\n", + "\n", + " [advanced_settings]\n", + " #Add CLIP Guidance and all the flavors or just run normal Latent Diffusion\n", + " use_cond_fn = {use_cond_fn}\n", + "\n", + " #Custom schedules for cuts. Check out the schedules documentation here\n", + " custom_schedule_setting = {custom_schedule_setting}\n", + "\n", + " #Cut settings\n", + " clamp_index = {list_mul_to_array(clamp_index)}\n", + " cut_overview = {list_mul_to_array(cut_overview)}\n", + " cut_innercut = {list_mul_to_array(cut_innercut)}\n", + " cut_ic_pow = {cut_ic_pow}\n", + " cut_icgray_p = {list_mul_to_array(cut_icgray_p)}\n", + " cutn_batches = {cutn_batches}\n", + " range_index = {list_mul_to_array(range_index)}\n", + " active_function = \"{active_function}\"\n", + " tv_scales = {list_mul_to_array(tv_scales)}\n", + " latent_tv_loss = {latent_tv_loss}\n", + "\n", + " #If you uncomment this line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale will be used\n", + " clip_guidance_schedule = {list_mul_to_array(clip_guidance_index)}\n", + " \n", + " #Apply symmetric loss (force simmetry to your results)\n", + " symmetric_loss_scale = {symmetric_loss_scale} \n", + "\n", + " #Latent Diffusion Advanced Settings\n", + " #Use when latent upscale to correct satuation problem\n", + " scale_div = {scale_div}\n", + " #Magnify grad before clamping by how many times\n", + " opt_mag_mul = {opt_mag_mul}\n", + " opt_ddim_eta = {opt_ddim_eta}\n", + " opt_eta_end = {opt_eta_end}\n", + " opt_temperature = {opt_temperature}\n", + "\n", + " #Grad advanced settings\n", + " grad_center = {grad_center}\n", + " #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", + " grad_scale={grad_scale} \n", + "\n", + " #Init image advanced settings\n", + " init_rotate={init_rotate}\n", + " mask_rotate={mask_rotate}\n", + " init_magnitude = {init_magnitude}\n", + "\n", + " #More settings\n", + " RGB_min = {RGB_min}\n", + " RGB_max = {RGB_max}\n", + " #How to pad the image with cut_overview\n", + " padargs = {padargs} \n", + " flip_aug={flip_aug}\n", + " cc = {cc}\n", + " #Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", + " experimental_aesthetic_embeddings = {experimental_aesthetic_embeddings}\n", + " #How much you want this to influence your result\n", + " experimental_aesthetic_embeddings_weight = {experimental_aesthetic_embeddings_weight}\n", + " #9 are good aesthetic embeddings, 0 are bad ones\n", + " experimental_aesthetic_embeddings_score = {experimental_aesthetic_embeddings_score}\n", + " '''\n", + " return(settings)\n", + "\n", + "#Alstro's aesthetic model\n", + "aesthetic_model_336 = torch.nn.Linear(768,1).cuda()\n", + "aesthetic_model_336.load_state_dict(torch.load(f\"{model_path}/ava_vit_l_14_336_linear.pth\"))\n", + "\n", + "aesthetic_model_224 = torch.nn.Linear(768,1).cuda()\n", + 
"aesthetic_model_224.load_state_dict(torch.load(f\"{model_path}/ava_vit_l_14_linear.pth\"))\n", + "\n", + "aesthetic_model_16 = torch.nn.Linear(512,1).cuda()\n", + "aesthetic_model_16.load_state_dict(torch.load(f\"{model_path}/ava_vit_b_16_linear.pth\"))\n", + "\n", + "aesthetic_model_32 = torch.nn.Linear(512,1).cuda()\n", + "aesthetic_model_32.load_state_dict(torch.load(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"))\n", + "\n", + "from ldm.modules.diffusionmodules.util import noise_like\n", + "def do_run():\n", + " # with torch.cuda.amp.autocast():\n", + " global progress,target_embeds, weights, zero_embed, init, scale_factor\n", + " scale_factor = 1\n", + " make_cutouts = {}\n", + " for i in clip_list:\n", + " make_cutouts[i] = MakeCutouts(clip_size[i],Overview=1)\n", + " target_embeds, weights ,zero_embed = {}, {}, {}\n", + " for i in clip_list:\n", + " target_embeds[i] = []\n", + " weights[i]=[]\n", + "\n", + " for prompt in prompts:\n", + " txt, weight = parse_prompt(prompt)\n", + " for i in clip_list:\n", + " if \"cloob\" not in i:\n", + " with torch.cuda.amp.autocast():\n", + " embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device))\n", + " target_embeds[i].append(embeds)\n", + " weights[i].append(weight)\n", + " else:\n", + " embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device))\n", + " target_embeds[i].append(embeds)\n", + " weights[i].append(weight)\n", + "\n", + " for prompt in image_prompts:\n", + " print(f\"processing{prompt}\",end=\"\\r\")\n", + " path, weight = parse_prompt(prompt)\n", + " img = Image.open(fetch(path)).convert('RGB')\n", + " img = TF.resize(img, min(opt.W, opt.H, *img.size), transforms.InterpolationMode.LANCZOS)\n", + " for i in clip_list:\n", + " if \"cloob\" not in i:\n", + " with torch.cuda.amp.autocast():\n", + " batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device))\n", + " embed = clip_model[i].encode_image(clip_normalize[i](batch))\n", + " target_embeds[i].append(embed)\n", + " 
weights[i].extend([weight])\n", + " else:\n", + " batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device))\n", + " embed = clip_model[i].encode_image(clip_normalize[i](batch))\n", + " target_embeds[i].append(embed)\n", + " weights[i].extend([weight])\n", + " if anti_jpg != 0:\n", + " target_embeds[\"ViT-B-32--openai\"].append(torch.tensor([np.load(f\"{model_path}/openimages_512x_png_embed224.npz\")['arr_0']-np.load(f\"{model_path}/imagenet_512x_jpg_embed224.npz\")['arr_0']], device = device))\n", + " weights[\"ViT-B-32--openai\"].append(anti_jpg)\n", + "\n", + " for i in clip_list:\n", + " target_embeds[i] = torch.cat(target_embeds[i])\n", + " weights[i] = torch.tensor([weights[i]], device=device)\n", + " shape = [4, opt.H//8, opt.W//8]\n", + " init = None\n", + " mask = None\n", + " transform = T.GaussianBlur(kernel_size=3, sigma=0.4)\n", + " if init_image is not None:\n", + " init = Image.open(fetch(init_image)).convert('RGB')\n", + " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", + " if init_rotate: init = torch.rot90(init, 1, [3,2]) \n", + " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W])\n", + " init = init.mul(2).sub(1).half()\n", + " init_encoded = model.first_stage_model.encode(init).sample()* init_magnitude + init_brightness\n", + " init_encoded = init_encoded + noise_like(init_encoded.shape,device,False).mul(init_noise)\n", + " else:\n", + " init = None\n", + " init_encoded = None\n", + " if init_mask is not None:\n", + " mask = Image.open(fetch(init_mask)).convert('RGB')\n", + " mask = TF.to_tensor(mask).to(device).unsqueeze(0)\n", + " if mask_rotate: mask = torch.rot90(init, 1, [3,2]) \n", + " mask = resize(mask,out_shape = [opt.n_samples,1,opt.H//8, opt.W//8])\n", + " mask = transform(mask)\n", + " print(mask)\n", + "\n", + "\n", + " progress = widgets.Image(layout = widgets.Layout(max_width = \"400px\",max_height = \"512px\"))\n", + " display.display(progress)\n", + "\n", + " if opt.plms:\n", + " sampler = 
PLMSSampler(model)\n", + " else:\n", + " sampler = DDIMSampler(model)\n", + "\n", + " os.makedirs(opt.outdir, exist_ok=True)\n", + " outpath = opt.outdir\n", + "\n", + " prompt = opt.prompt\n", + " sample_path = os.path.join(outpath, \"samples\")\n", + " os.makedirs(sample_path, exist_ok=True)\n", + " base_count = len(os.listdir(sample_path))\n", + "\n", + " all_samples=list()\n", + " last_step_upscale = False\n", + " with torch.enable_grad():\n", + " with torch.cuda.amp.autocast():\n", + " with model.ema_scope():\n", + " uc = None\n", + " if opt.scale != 1.0:\n", + " uc = model.get_learned_conditioning(opt.n_samples * opt.uc).cuda()\n", + " \n", + " for n in trange(opt.n_iter, desc=\"Sampling\"):\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " c = model.get_learned_conditioning(opt.n_samples * prompt).cuda()\n", + " if init_encoded is None:\n", + " x_T = torch.randn([opt.n_samples,*shape], device=device)\n", + " else:\n", + " x_T = init_encoded\n", + " last_step_uspcale_list = []\n", + " \n", + " for custom_schedule in custom_schedules:\n", + " if type(custom_schedule) != type(\"\"):\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " last_step_upscale = False\n", + " samples_ddim, _ = sampler.sample(S=opt.ddim_steps,\n", + " conditioning=c,\n", + " batch_size=opt.n_samples,\n", + " shape=shape,\n", + " custom_schedule = custom_schedule,\n", + " verbose=False,\n", + " unconditional_guidance_scale=opt.scale,\n", + " unconditional_conditioning=uc,\n", + " eta=opt.ddim_eta,\n", + " eta_end = opt.eta_end,\n", + " img_callback=None if use_cond_fn else display_handler,\n", + " cond_fn=cond_fn, #if use_cond_fn else None,\n", + " temperature = opt.temperature,\n", + " x_adjust_fn=cond_clamp,\n", + " x_T = x_T,\n", + " x0=x_T,\n", + " mask=mask\n", + " )\n", + " x_T = samples_ddim.clamp(-6,6)\n", + " else:\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " method, scale_factor = custom_schedule.split(\":\")\n", + " scale_factor = 
float(scale_factor)\n", + " #clamp_index = np.array(clamp_index) * scale_factor\n", + " if method == \"latent\":\n", + " x_T = resize(samples_ddim, scale_factors=scale_factor, antialiasing=True)*scale_div\n", + " x_T += noise_like(x_T.shape,device,False)*init_noise\n", + " if method == \"gfpgan\":\n", + " last_step_upscale = True\n", + " temp_file_name = \"temp_\"+f\"{str(round(time.time()))}.png\"\n", + " temp_file = os.path.join(sample_path, temp_file_name)\n", + " im.save(temp_file, format = \"PNG\")\n", + " GFP_factor = 2 if scale_factor > 1 else 1\n", + " GFP_ver = 1.3 #if GFP_factor == 1 else 1.2\n", + " %cd GFPGAN\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " !python inference_gfpgan.py -i $temp_file -o results -v $GFP_ver -s $GFP_factor\n", + " %cd ..\n", + " face_corrected = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\"))\n", + " with io.BytesIO() as output:\n", + " face_corrected.save(output,format=\"PNG\")\n", + " progress.value = output.getvalue()\n", + " init = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\")).convert('RGB')\n", + " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", + " opt.H, opt.W = opt.H*scale_factor, opt.W*scale_factor\n", + " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W], antialiasing=True)\n", + " init = init.mul(2).sub(1).half()\n", + " x_T = (model.first_stage_model.encode(init).sample()*init_magnitude)\n", + " x_T += noise_like(x_T.shape,device,False)*init_noise\n", + " x_T = x_T.clamp(-6,6)\n", + "\n", + " #last_step_uspcale_list.append(last_step_upscale)\n", + " scale_factor = 1\n", + " current_time = str(round(time.time()))\n", + " if(last_step_upscale):\n", + " latest_upscale = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\")).convert('RGB')\n", + " latest_upscale.save(os.path.join(outpath, f'{current_time}.png'), format = \"PNG\")\n", + " else:\n", + " Image.fromarray(image_grid.astype(np.uint8)).save(os.path.join(outpath, 
f'{current_time}.png'), format = \"PNG\")\n", + " settings = generate_settings_file(add_prompts=True, add_dimensions=False)\n", + " text_file = open(f\"{outpath}/{current_time}.cfg\", \"w\")\n", + " text_file.write(settings)\n", + " text_file.close()\n", + " x_samples_ddim = model.decode_first_stage(samples_ddim)\n", + " x_samples_ddim = torch.clamp((x_samples_ddim+1.0)/2.0, min=0.0, max=1.0)\n", + " all_samples.append(x_samples_ddim)\n", + "\n", + "\n", + " if(len(all_samples) > 1):\n", + " # additionally, save as grid\n", + " grid = torch.stack(all_samples, 0)\n", + " grid = rearrange(grid, 'n b c h w -> (n b) c h w')\n", + " grid = make_grid(grid, nrow=opt.n_samples)\n", + "\n", + " # to image\n", + " grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()\n", + " Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid_{str(round(time.time()))}.png'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ILHGCEla2Rrm" + }, + "source": [ + "# Run!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VpR9JhyCu5iq" + }, + "source": [ + "#### Perceptors (Choose your CLIP and CLIP-like models) \n", + "Be careful: if you don't pay for Colab Pro, selecting more CLIPs might make you run out of memory. 
If you do have Pro, try adding ViT-L14 to your mix" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "8K7l_E2JvLWC" + }, + "outputs": [], + "source": [ + "#@title Choose your perceptor models\n", + "\n", + "# suppress mmc warmup outputs\n", + "import mmc.loaders\n", + "clip_load_list = []\n", + "#@markdown #### Open AI CLIP models\n", + "ViT_B32 = False #@param {type:\"boolean\"}\n", + "ViT_B16 = True #@param {type:\"boolean\"}\n", + "ViT_L14 = False #@param {type:\"boolean\"}\n", + "ViT_L14_336px = False #@param {type:\"boolean\"}\n", + "#RN101 = False #@param {type:\"boolean\"}\n", + "#RN50 = False #@param {type:\"boolean\"}\n", + "RN50x4 = False #@param {type:\"boolean\"}\n", + "RN50x16 = False #@param {type:\"boolean\"}\n", + "RN50x64 = False #@param {type:\"boolean\"}\n", + "\n", + "#@markdown #### OpenCLIP models\n", + "ViT_B16_plus = False #@param {type: \"boolean\"}\n", + "ViT_B32_laion2b = True #@param {type: \"boolean\"}\n", + "\n", + "#@markdown #### Multilingual CLIP models \n", + "clip_farsi = False #@param {type: \"boolean\"}\n", + "clip_korean = False #@param {type: \"boolean\"}\n", + "\n", + "#@markdown #### CLOOB models\n", + "cloob_ViT_B16 = False #@param {type: \"boolean\"}\n", + "\n", + "# @markdown Load even more CLIP and CLIP-like models (from [Multi-Modal-Comparators](https://github.com/dmarx/Multi-Modal-Comparators))\n", + "model1 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - 
ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "model2 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - 
ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "model3 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - 
ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "\n", + "if ViT_B32: \n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-B-32--openai]\")\n", + "if ViT_B16: \n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-B-16--openai]\")\n", + "if ViT_L14: \n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-L-14--openai]\")\n", + "if RN50x4: \n", + " clip_load_list.append(\"[clip - mlfoundations - RN50x4--openai]\")\n", + "if RN50x64: \n", + " clip_load_list.append(\"[clip - mlfoundations - RN50x64--openai]\")\n", + "if RN50x16: \n", + " clip_load_list.append(\"[clip - mlfoundations - RN50x16--openai]\")\n", + "if ViT_L14_336px:\n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-L-14-336--openai]\")\n", + "if ViT_B16_plus:\n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-B-16-plus-240--laion400m_e32]\")\n", + "if ViT_B32_laion2b:\n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-B-32--laion2b_e16]\")\n", + "if clip_farsi:\n", + " clip_load_list.append(\"[clip - sajjjadayobi - clipfa]\")\n", + 
"if clip_korean:\n", + " clip_load_list.append(\"[clip - navervision - kelip_ViT-B/32]\")\n", + "if cloob_ViT_B16:\n", + " clip_load_list.append(\"[cloob - crowsonkb - cloob_laion_400m_vit_b_16_32_epochs]\")\n", + "\n", + "if model1:\n", + " clip_load_list.append(model1)\n", + "if model2:\n", + " clip_load_list.append(model2)\n", + "if model3:\n", + " clip_load_list.append(model3)\n", + "\n", + "\n", + "i = 0\n", + "from mmc.multimmc import MultiMMC\n", + "from mmc.modalities import TEXT, IMAGE\n", + "temp_perceptor = MultiMMC(TEXT, IMAGE)\n", + "\n", + "def get_mmc_models(clip_load_list):\n", + " mmc_models = []\n", + " for model_key in clip_load_list:\n", + " if not model_key:\n", + " continue\n", + " arch, pub, m_id = model_key[1:-1].split(' - ')\n", + " mmc_models.append({\n", + " 'architecture':arch,\n", + " 'publisher':pub,\n", + " 'id':m_id,\n", + " })\n", + " return mmc_models\n", + "mmc_models = get_mmc_models(clip_load_list)\n", + "\n", + "import mmc\n", + "from mmc.registry import REGISTRY\n", + "import mmc.loaders # force trigger model registrations\n", + "from mmc.mock.openai import MockOpenaiClip\n", + "\n", + "normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],\n", + " std=[0.26862954, 0.26130258, 0.27577711])\n", + "\n", + "\n", + "def load_clip_models(mmc_models):\n", + " clip_model, clip_size, clip_tokenize, clip_normalize= {},{},{},{}\n", + " clip_list = []\n", + " for item in mmc_models:\n", + " print(\"Loaded \", item[\"id\"])\n", + " clip_list.append(item[\"id\"])\n", + " model_loaders = REGISTRY.find(**item)\n", + " for model_loader in model_loaders:\n", + " clip_model_loaded = model_loader.load()\n", + " clip_model[item[\"id\"]] = MockOpenaiClip(clip_model_loaded)\n", + " clip_size[item[\"id\"]] = clip_model[item[\"id\"]].visual.input_resolution\n", + " clip_tokenize[item[\"id\"]] = clip_model[item[\"id\"]].preprocess_text()\n", + " if(item[\"architecture\"] == 'cloob'):\n", + " clip_normalize[item[\"id\"]] = 
clip_model[item[\"id\"]].normalize\n", + " else:\n", + " clip_normalize[item[\"id\"]] = normalize\n", + " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", + "\n", + "\n", + "def full_clip_load(clip_load_list):\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " try:\n", + " del clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", + " except:\n", + " pass\n", + " mmc_models = get_mmc_models(clip_load_list)\n", + " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = load_clip_models(mmc_models)\n", + " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", + "\n", + "clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", + "\n", + "torch.cuda.empty_cache()\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N_Di3xFSXGWe" + }, + "source": [ + "#### Advanced settings for the generation\n", + "##### Access [our guide](https://multimodal.art/majesty-diffusion) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pAALegoCXEbm" + }, + "outputs": [], + "source": [ + "opt = DotMap()\n", + "\n", + "#Change it to false to not use CLIP Guidance at all \n", + "use_cond_fn = True\n", + "\n", + "#Custom cut schedules and super-resolution. 
Check out the guide on how to use it a https://multimodal.art/majestydiffusion\n", + "custom_schedule_setting = [\n", + " [200,1000,8],\n", + " [50,200,5],\n", + " #\"gfpgan:1.5\",\n", + " #[50,200,5],\n", + "]\n", + " \n", + "#Cut settings\n", + "clamp_index = [1]*1000 \n", + "cut_overview = [8]*500 + [4]*500\n", + "cut_innercut = [0]*500 + [4]*500\n", + "cut_ic_pow = .1\n", + "cut_icgray_p = [.1]*300+[0]*1000\n", + "cutn_batches = 1\n", + "range_index = [0]*300 + [0]*1000 \n", + "active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", + "tv_scales = [1000]*1+[600]*3\n", + "latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", + "\n", + "#If you uncomment next line you can schedule the CLIP guidance across the steps. Otherwise the clip_guidance_scale basic setting will be used\n", + "#clip_guidance_schedule = [10000]*300 + [500]*700\n", + "\n", + "symmetric_loss_scale = 0 #Apply symmetric loss\n", + "\n", + "#Latent Diffusion Advanced Settings\n", + "scale_div = 0.5 # Use when latent upscale to correct satuation problem\n", + "opt_mag_mul = 10 #Magnify grad before clamping\n", + "#PLMS Currently not working, working on a fix\n", + "#opt.plms = False #Won;=t work with clip guidance\n", + "opt_ddim_eta, opt_eta_end = [1.4,1] # linear variation of eta\n", + "opt_temperature = .975 \n", + "\n", + "#Grad advanced settings\n", + "grad_center = False\n", + "grad_scale= 0.5 #5 Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", + "anti_jpg = 0 #not working\n", + "\n", + "#Init image advanced settings\n", + "init_rotate, mask_rotate=[False, False]\n", + "init_magnitude = 0.15\n", + "\n", + "#More settings\n", + "RGB_min, RGB_max = [-0.95,0.95]\n", + "padargs = {\"mode\":\"constant\", \"value\": -1} #How to pad the image with cut_overview\n", + "flip_aug=False\n", + "cc = 60\n", + "cutout_debug = False\n", + 
"opt.outdir = outputs_path\n", + "\n", + "#Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", + "experimental_aesthetic_embeddings = False\n", + "#How much you want this to influence your result\n", + "experimental_aesthetic_embeddings_weight = 0.5\n", + "#9 are good aesthetic embeddings, 0 are bad ones\n", + "experimental_aesthetic_embeddings_score = 9" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZUu_pyTkuxiT" + }, + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wo1tM270ryit" + }, + "source": [ + "### Prompts\n", + "The main prompt is the CLIP prompt. The Latent Prompts usually help with style and composition, you can turn them off by setting `latent_diffsion_guidance_scale=0` " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rRIC0eQervDN" + }, + "outputs": [], + "source": [ + "#Amp up your prompt game with prompt engineering, check out this guide: https://matthewmcateer.me/blog/clip-prompt-engineering/\n", + "#Prompt for CLIP Guidance\n", + "clip_prompts = [\"portrait of a princess in sanctuary, hyperrealistic painting trending on artstation\"]\n", + "\n", + "#Prompt for Latent Diffusion\n", + "latent_prompts = [\"portrait of a princess in sanctuary, hyperrealistic painting trending on artstation\"]\n", + "\n", + "#Negative prompts for Latent Diffusion\n", + "latent_negatives = [\"low quality image\"]\n", + "\n", + "image_prompts = []" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iv8-gEvUsADL" + }, + "source": [ + "### Diffuse!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "fmafGmcyT1mZ" + }, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "#@markdown ### Basic settings \n", + "#@markdown We're still figuring out default settings. 
Experiment and share your settings with us\n", + "width = 256#@param{type: 'integer'}\n", + "height = 256#@param{type: 'integer'}\n", + "latent_diffusion_guidance_scale = 2 #@param {type:\"number\"}\n", + "clip_guidance_scale = 5000 #@param{type: 'integer'}\n", + "how_many_batches = 1 #@param{type: 'integer'}\n", + "aesthetic_loss_scale = 200 #@param{type: 'integer'}\n", + "augment_cuts=True #@param{type:'boolean'}\n", + "\n", + "#@markdown\n", + "\n", + "#@markdown ### Init image settings\n", + "#@markdown `init_image` requires the path of an image to use as init to the model\n", + "init_image = None #@param{type: 'string'}\n", + "if(init_image == '' or init_image == 'None'):\n", + " init_image = None\n", + "#@markdown `starting_timestep`: How much noise do you want to add to your init image for it to then be difused by the model\n", + "starting_timestep = 0.9 #@param{type: 'number'}\n", + "#@markdown `init_mask` is a mask same width and height as the original image with the color black indicating where to inpaint\n", + "init_mask = None #@param{type: 'string'}\n", + "#@markdown `init_scale` controls how much the init image should influence the final result. Experiment with values around `1000`\n", + "init_scale = 1000 #@param{type: 'integer'}\n", + "init_brightness = 0.0 #@param{type: 'number'}\n", + "#@markdown How much extra noise to add to the init image, independently from skipping timesteps (use it also if you are upscaling)\n", + "init_noise = 0.6 #@param{type: 'number'}\n", + "\n", + "#@markdown\n", + "\n", + "#@markdown ### Custom saved settings\n", + "#@markdown If you choose custom saved settings, the settings set by the preset overrule some of your choices. You can still modify the settings not in the preset. 
Check what each preset modifies here\n", + "custom_settings = 'path/to/settings.cfg' #@param{type:'string'}\n", + "settings_library = 'None (use settings defined above)' #@param [\"None (use settings defined above)\", \"default (optimized for colab free)\", \"dango233_princesses\", \"the_other_zippy_defaults\", \"makeitrad_defaults\"]\n", + "if(settings_library != 'None (use settings defined above)'):\n", + " if(settings_library == 'default (optimized for colab free)'):\n", + " custom_settings = f'majesty-diffusion/latent_settings_library/default.cfg'\n", + " else:\n", + " custom_settings = f'majesty-diffusion/latent_settings_library/{settings_library}.cfg'\n", + "\n", + "global_var_scope = globals()\n", + "if(custom_settings is not None and custom_settings != '' and custom_settings != 'path/to/settings.cfg'):\n", + " print('Loaded ', custom_settings)\n", + " try:\n", + " from configparser import ConfigParser\n", + " except ImportError:\n", + " from ConfigParser import ConfigParser\n", + " import configparser\n", + " \n", + " config = ConfigParser()\n", + " config.read(custom_settings)\n", + " #custom_settings_stream = fetch(custom_settings)\n", + " #Load CLIP models from config\n", + " if(config.has_section('clip_list')):\n", + " clip_incoming_list = config.items('clip_list')\n", + " clip_incoming_models = clip_incoming_list[0]\n", + " incoming_perceptors = eval(clip_incoming_models[1])\n", + " if((len(incoming_perceptors) != len(clip_load_list)) or not all(elem in incoming_perceptors for elem in clip_load_list)):\n", + " clip_load_list = incoming_perceptors\n", + " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", + "\n", + " #Load settings from config and replace variables\n", + " if(config.has_section('basic_settings')):\n", + " basic_settings = config.items('basic_settings')\n", + " for basic_setting in basic_settings:\n", + " global_var_scope[basic_setting[0]] = eval(basic_setting[1])\n", + " \n", + " 
if(config.has_section('advanced_settings')):\n", + " advanced_settings = config.items('advanced_settings')\n", + " for advanced_setting in advanced_settings:\n", + " global_var_scope[advanced_setting[0]] = eval(advanced_setting[1])\n", + "\n", + "if(((init_image is not None) and (init_image != 'None') and (init_image != '')) and starting_timestep != 1 and custom_schedule_setting[0][1] == 1000):\n", + " custom_schedule_setting[0] = [custom_schedule_setting[0][0], int(custom_schedule_setting[0][1]*starting_timestep), custom_schedule_setting[0][2]]\n", + "\n", + "prompts = clip_prompts\n", + "opt.prompt = latent_prompts\n", + "opt.uc = latent_negatives\n", + "custom_schedules = set_custom_schedules(custom_schedule_setting)\n", + "aes_scale = aesthetic_loss_scale\n", + "try: \n", + " clip_guidance_schedule\n", + " clip_guidance_index = clip_guidance_schedule\n", + "except:\n", + " clip_guidance_index = [clip_guidance_scale]*1000\n", + "\n", + "opt.W = (width//64)*64;\n", + "opt.H = (height//64)*64;\n", + "if opt.W != width or opt.H != height:\n", + " print(f'Changing output size to {opt.W}x{opt.H}. 
Dimensions must by multiples of 64.')\n", + "\n", + "opt.mag_mul = opt_mag_mul \n", + "opt.ddim_eta = opt_ddim_eta\n", + "opt.eta_end = opt_eta_end\n", + "opt.temperature = opt_temperature\n", + "opt.n_iter = how_many_batches\n", + "opt.n_samples = 1\n", + "#opt.W, opt.H = [width,height]\n", + "opt.scale = latent_diffusion_guidance_scale\n", + "aug = augment_cuts\n", + "\n", + "torch.cuda.empty_cache()\n", + "gc.collect()\n", + "generate_video = False\n", + "if generate_video: \n", + " fps = 24\n", + " p = Popen(['ffmpeg', '-y', '-f', 'image2pipe', '-vcodec', 'png', '-r', str(fps), '-i', '-', '-vcodec', 'libx264', '-r', str(fps), '-pix_fmt', 'yuv420p', '-crf', '17', '-preset', 'veryslow', 'video.mp4'], stdin=PIPE)\n", + "do_run()\n", + "if generate_video: \n", + " p.stdin.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4cvUzcO9FeMT" + }, + "source": [ + "### Save your own settings\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "LGLUCX_UGqka" + }, + "outputs": [], + "source": [ + "\n", + "#@markdown ### Save current settings\n", + "#@markdown If you would like to save your current settings, uncheck `skip_saving` and run this cell. You will get a `custom_settings.cfg` file you can reuse and share. 
If you like your results, send us a pull request to add your settings to the selectable library\n", + "skip_saving = True #@param{type:'boolean'}\n", + "if(not skip_saving):\n", + " data = generate_settings_file(add_prompts=False, add_dimensions=True)\n", + " text_file = open(\"custom_settings.cfg\", \"w\")\n", + " text_file.write(data)\n", + " text_file.close()\n", + " from google.colab import files\n", + " files.download('custom_settings.cfg')\n", + " print(\"Downloaded as custom_settings.cfg\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fzd-2mVMWHV0" + }, + "source": [ + "### Biases acknowledgment\n", + "Despite how impressive being able to turn text into image is, beware to the fact that this model may output content that reinforces or exarcbates societal biases. According to the Latent Diffusion paper: \\\"Deep learning modules tend to reproduce or exacerbate biases that are already present in the data\\\". \n", + "\n", + "The model was trained on an unfiltered version the LAION-400M dataset, which scrapped non-curated image-text-pairs from the internet (the exception being the the removal of illegal content) and is meant to be used for research purposes, such as this one. 
You can read more on LAION's website" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "xEVSOJ4f0B21", + "VpR9JhyCu5iq", + "N_Di3xFSXGWe", + "xEVSOJ4f0B21" + ], + "machine_shape": "hm", + "name": "Latent Majesty Diffusion v1.3", + "private_outputs": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From e1e935686649962c42e00a66bb0519a2cd2fdcc5 Mon Sep 17 00:00:00 2001 From: apolinario Date: Mon, 6 Jun 2022 01:10:10 +0200 Subject: [PATCH 07/13] Fix clip_load_list --- latent.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/latent.ipynb b/latent.ipynb index 8535d36..8c5f4b3 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -29,6 +29,7 @@ "The LAION-400M-trained model and the modified inference code are from [CompVis Latent Diffusion](https://github.com/CompVis/latent-diffusion). The guided-diffusion method is modified by Dango233 based on [Katherine Crowson](https://twitter.com/RiversHaveWings)'s guided diffusion notebook. multimodalart savable settings, MMC and assembled the Colab. Check the complete list on our GitHub. 
Some functions and methods are from various code masters (nsheppard, DanielRussRuss and others)\n", "\n", "Changelog: 1.3 - better upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion))\n", + "\n", "Changelog: 1.4 - better defaults, added ViT-L/14 LAION-400M trained, fix CLOOB, adds modified dynamic thresholding, removes latent upscaler (was broken), adds RGB upscaler\n" ] }, @@ -1024,13 +1025,12 @@ " x_T += noise_like(x_T.shape,device,False)*init_noise\n", " x_T = x_T.clamp(-6,6)\n", " if method == \"purge\":\n", - " global has_purged\n", " has_purged = True\n", " for i in scale_factor.split(\",\"):\n", - " if i in clip_list:\n", + " if i in clip_load_list:\n", " arch, pub, m_id = i[1:-1].split(' - ')\n", " print(\"Purge \",i)\n", - " del clip_list[i]\n", + " del clip_list[clip_list.index(m_id)]\n", " del clip_model[m_id]\n", " del clip_size[m_id]\n", " del clip_tokenize[m_id]\n", @@ -1299,11 +1299,11 @@ "opt.outdir = outputs_path\n", "\n", "#Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", - "experimental_aesthetic_embeddings = False\n", + "experimental_aesthetic_embeddings = True\n", "#How much you want this to influence your result\n", - "experimental_aesthetic_embeddings_weight = 0.5\n", + "experimental_aesthetic_embeddings_weight = 0.3\n", "#9 are good aesthetic embeddings, 0 are bad ones\n", - "experimental_aesthetic_embeddings_score = 9\n", + "experimental_aesthetic_embeddings_score = 8\n", "\n", "# For fun dont change except if you really know what your are doing\n", "cut_blur_n = 0\n", From 13ef7f1569b30a284e2fe5cdf5d32d138141775a Mon Sep 17 00:00:00 2001 From: apolinario Date: Mon, 6 Jun 2022 01:46:07 +0200 Subject: [PATCH 08/13] Fix more details --- latent.ipynb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/latent.ipynb b/latent.ipynb index 8c5f4b3..1b15341 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -842,6 +842,7 @@ "\n", "has_purged = False\n", 
"def do_run():\n", + " global has_purged\n", " if(has_purged):\n", " global clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", @@ -986,9 +987,8 @@ " torch.cuda.empty_cache()\n", " gc.collect()\n", " method, scale_factor = custom_schedule.split(\":\")\n", - " scale_factor = float(scale_factor)\n", - " #clamp_index = np.array(clamp_index) * scale_factor\n", " if method == \"RGB\":\n", + " scale_factor = float(scale_factor)\n", " temp_file_name = \"temp_\"+f\"{str(round(time.time()))}.png\"\n", " temp_file = os.path.join(sample_path, temp_file_name)\n", " im.save(temp_file, format = \"PNG\")\n", @@ -1001,6 +1001,7 @@ " x_T += noise_like(x_T.shape,device,False)*init_noise\n", " x_T = x_T.clamp(-6,6)\n", " if method == \"gfpgan\":\n", + " scale_factor = float(scale_factor)\n", " last_step_upscale = True\n", " temp_file_name = \"temp_\"+f\"{str(round(time.time()))}.png\"\n", " temp_file = os.path.join(sample_path, temp_file_name)\n", @@ -1482,7 +1483,7 @@ "\n", "\n", "def modify_score(e_t, e_t_uncond):\n", - " if(!score_modifier):\n", + " if(not score_modifier):\n", " return e_t\n", " else:\n", " e_t_d = (e_t - e_t_uncond)\n", From 69e9320e2cd8110d1185fc43c3502fed56535ae7 Mon Sep 17 00:00:00 2001 From: apolinario Date: Mon, 6 Jun 2022 10:38:38 +0200 Subject: [PATCH 09/13] cut_blur fix --- latent.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/latent.ipynb b/latent.ipynb index 1b15341..9881fa1 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -554,7 +554,8 @@ " make_cutouts[i] = MakeCutouts(clip_size[i],\n", " Overview= cut_overview[t], \n", " InnerCrop = cut_innercut[t], \n", - " IC_Size_Pow=cut_ic_pow, IC_Grey_P = cut_icgray_p[t]\n", + " IC_Size_Pow=cut_ic_pow, IC_Grey_P = cut_icgray_p[t],\n", + " cut_blur_n = cut_blur_n[t]\n", " )\n", " cutn = cut_overview[t]+cut_innercut[t]\n", " for j in range(cutn_batches):\n", @@ 
-765,6 +766,8 @@ " clamp_index = {clamp_index}\n", " cut_overview = {list_mul_to_array(cut_overview)}\n", " cut_innercut = {list_mul_to_array(cut_innercut)}\n", + " cut_blur_n = {list_mul_to_array(cut_blur_n)}\n", + " cut_blur_kernel = {cut_blur_kernel}\n", " cut_ic_pow = {cut_ic_pow}\n", " cut_icgray_p = {list_mul_to_array(cut_icgray_p)}\n", " cutn_batches = {cutn_batches}\n", @@ -817,8 +820,6 @@ " experimental_aesthetic_embeddings_score = {experimental_aesthetic_embeddings_score}\n", "\n", " # For fun dont change except if you really know what your are doing\n", - " cut_blur_n = {cut_blur_n}\n", - " cut_blur_kernel = {cut_blur_kernel}\n", " grad_blur = {grad_blur}\n", " compress_steps = {compress_steps}\n", " compress_factor = {compress_factor}\n", @@ -1259,6 +1260,7 @@ "cut_icgray_p = [.1]*300+[0]*1000\n", "cutn_batches = 1\n", "cut_blur_n = [0]*400 + [0]*600\n", + "cut_blur_kernel = 3\n", "range_index = [0]*1000\n", "active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", "tv_scales = [800]*1+[100]*3\n", @@ -1307,8 +1309,6 @@ "experimental_aesthetic_embeddings_score = 8\n", "\n", "# For fun dont change except if you really know what your are doing\n", - "cut_blur_n = 0\n", - "cut_blur_kernel = 3\n", "grad_blur = False\n", "compress_steps = 200\n", "compress_factor = 0.1\n", From 7555b895635e08ea02baf10fc43d7f9e1739a22b Mon Sep 17 00:00:00 2001 From: apolinario Date: Tue, 7 Jun 2022 07:44:29 +0200 Subject: [PATCH 10/13] Fix Score Modifier --- latent.ipynb | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/latent.ipynb b/latent.ipynb index 9881fa1..4050502 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -773,6 +773,7 @@ " cutn_batches = {cutn_batches}\n", " range_index = {list_mul_to_array(range_index)}\n", " active_function = \"{active_function}\"\n", + " ths_method= \"{ths_method}\"\n", " tv_scales = {list_mul_to_array(tv_scales)}\n", " latent_tv_loss = {latent_tv_loss}\n", "\n", @@ -1263,6 
+1264,7 @@ "cut_blur_kernel = 3\n", "range_index = [0]*1000\n", "active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", + "ths_method = \"softsign\"\n", "tv_scales = [800]*1+[100]*3\n", "latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", "\n", @@ -1483,7 +1485,7 @@ "\n", "\n", "def modify_score(e_t, e_t_uncond):\n", - " if(not score_modifier):\n", + " if(score_modifier is False):\n", " return e_t\n", " else:\n", " e_t_d = (e_t - e_t_uncond)\n", @@ -1495,12 +1497,13 @@ "\n", " s.clamp_(min = 1.)\n", " s = s.view(-1, *((1,) * (e_t_d.ndim - 1)))\n", - " e_t_d = F.softsign(e_t_d) / s / 3\n", - " \n", - " #e_t_d = e_t_d.clamp(-s,s) / s\n", + " if ths_method == \"softsign\":\n", + " e_t_d = F.softsign(e_t_d*3) / s / 3\n", + " elif ths_method == \"clamp\":\n", + " e_t_d = e_t_d.clamp(-s,s) / s\n", " e_t = e_t_uncond + e_t_d\n", " return(e_t)\n", - "\n", + " \n", "score_corrector.modify_score = modify_score\n", "\n", "torch.cuda.empty_cache()\n", From 048b36344ebfe7d81f5b419d2c290aab0f5d02af Mon Sep 17 00:00:00 2001 From: apolinario Date: Tue, 7 Jun 2022 07:59:36 +0200 Subject: [PATCH 11/13] More settings tuning --- latent.ipynb | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/latent.ipynb b/latent.ipynb index 4050502..9423d7a 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -532,17 +532,6 @@ " with torch.enable_grad():\n", " global clamp_start_, clamp_max\n", " x = x.detach()\n", - " #if dynamic_decode and t<=350 :\n", - " # x_recon = x\n", - " # s = torch.quantile(\n", - " # rearrange(x_recon, 'b ... 
-> b (...)').abs(),\n", - " # threshold_percentile,\n", - " # dim = -1\n", - " # )\n", - "\n", - " # s.clamp_(min = 1.)\n", - " # s = s.view(-1, *((1,) * (x_recon.ndim - 1)))\n", - " # x = x_recon.clamp(-s, s) / s\n", " x = x.requires_grad_()\n", " x_in = model.decode_first_stage(x)\n", " display_handler(x_in,t,1,False)\n", @@ -685,7 +674,7 @@ " if t<= compress_steps:\n", " image = image / (ths/threshold)**compress_factor\n", " image += noise_like(image.shape,device,False) * ((ths/threshold)**compress_factor - 1)\n", - " return(image) \n", + " return(image)\n", "\n", "def make_schedule(t_start, t_end, step_size=1):\n", " schedule = []\n", @@ -1254,7 +1243,7 @@ "]\n", " \n", "#Cut settings\n", - "clamp_index = [2,1] #linear variation of the index for clamping the gradient \n", + "clamp_index = [2,1.4] #linear variation of the index for clamping the gradient \n", "cut_overview = [8]*500 + [4]*500\n", "cut_innercut = [0]*500 + [4]*500\n", "cut_ic_pow = .2\n", @@ -1265,7 +1254,7 @@ "range_index = [0]*1000\n", "active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", "ths_method = \"softsign\"\n", - "tv_scales = [800]*1+[100]*3\n", + "tv_scales = [600]*1+[50]*1 +[0]*2\n", "latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", "\n", "#If you uncomment next line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale basic setting will be used\n", @@ -1278,12 +1267,12 @@ "opt_mag_mul = 15 #Magnify grad before clamping\n", "#PLMS Currently not working, working on a fix\n", "#opt.plms = False #Won;=t work with clip guidance\n", - "opt_ddim_eta, opt_eta_end = [1.6,1] # linear variation of eta\n", + "opt_ddim_eta, opt_eta_end = [1.5,1.2] # linear variation of eta\n", "opt_temperature = .95\n", "\n", "#Grad advanced settings\n", "grad_center = False\n", - "grad_scale= 0.5 #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", + "grad_scale= 0.75 #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", "\n", "#Restraints the model from explodign despite larger clamp\n", "score_modifier = True\n", @@ -1312,9 +1301,9 @@ "\n", "# For fun dont change except if you really know what your are doing\n", "grad_blur = False\n", - "compress_steps = 200\n", + "compress_steps = 0\n", "compress_factor = 0.1\n", - "punish_steps = 200\n", + "punish_steps = 0\n", "punish_factor = 0.8" ] }, From 100e709c551b5a2667c8fddba8f3adcb254ceedd Mon Sep 17 00:00:00 2001 From: Stephan Auerhahn Date: Wed, 8 Jun 2022 07:00:53 -0700 Subject: [PATCH 12/13] Port Docker/CLI changes to 1.4 --- .dockerignore | 1 + .github/workflows/docker-publish.yml | 93 ++ .gitignore | 1 + Dockerfile | 40 + latent.ipynb | 1073 ++------------------ latent.py | 318 ++++++ majesty.py | 1374 ++++++++++++++++++++++++++ 7 files changed, 1916 insertions(+), 984 deletions(-) create mode 100644 .dockerignore create mode 100644 .github/workflows/docker-publish.yml create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 latent.py create mode 100644 majesty.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d6d95cf --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +models \ No newline at end of file diff --git 
a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 0000000..5d793d4 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,93 @@ +name: Docker + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +on: + schedule: + - cron: '31 4 * * *' + push: + branches: [ main ] + # Publish semver tags as releases. + tags: [ 'v*.*.*' ] + pull_request: + branches: [ main ] + +env: + # Use docker.io for Docker Hub if empty + REGISTRY: ghcr.io + # github.repository as / + IMAGE_NAME: ${{ github.repository }} + + +jobs: + build: + + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + # This is used to complete the identity challenge + # with sigstore/fulcio when running outside of PRs. + id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Install the cosign tool except on PR + # https://github.com/sigstore/cosign-installer +# - name: Install cosign +# if: github.event_name != 'pull_request' +# uses: sigstore/cosign-installer@d6a3abf1bdea83574e28d40543793018b6035605 +# with: +# cosign-release: 'v1.7.1' + + + # Workaround: https://github.com/docker/build-push-action/issues/461 + - name: Setup Docker buildx + uses: docker/setup-buildx-action@79abd3f86f79a9d68a23c75a09a9a85889262adf + + # Login against a Docker registry except on PR + # https://github.com/docker/login-action + - name: Log into registry ${{ env.REGISTRY }} + if: github.event_name != 'pull_request' + uses: docker/login-action@28218f9b04b4f3f62068d7b6ce6ca5b26e35336c + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Extract metadata (tags, labels) for Docker + # https://github.com/docker/metadata-action + - name: Extract Docker metadata + id: meta + uses: 
docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + + # Build and push Docker image with Buildx (don't push on PR) + # https://github.com/docker/build-push-action + - name: Build and push Docker image + id: build-and-push + uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a + with: + context: . + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + # Sign the resulting Docker image digest except on PRs. + # This will only write to the public Rekor transparency log when the Docker + # repository is public to avoid leaking data. If you would like to publish + # transparency data even for private images, pass --force to cosign below. + # https://github.com/sigstore/cosign +# - name: Sign the published Docker image +# if: ${{ github.event_name != 'pull_request' }} +# env: +# COSIGN_EXPERIMENTAL: "true" + # This step uses the identity token to provision an ephemeral certificate + # against the sigstore community Fulcio instance. 
+# run: cosign sign ${{ steps.meta.outputs.tags }}@${{ steps.build-and-push.outputs.digest }} \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d6d95cf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +models \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..71a5a73 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,40 @@ +FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime +RUN apt-get update \ + && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata build-essential wget git git-lfs \ + && apt-get clean + +RUN mkdir -p /src +WORKDIR /src + +RUN git clone https://github.com/multimodalart/latent-diffusion --branch 1.4 +RUN git clone https://github.com/CompVis/taming-transformers +RUN git clone https://github.com/TencentARC/GFPGAN +RUN git lfs clone https://github.com/LAION-AI/aesthetic-predictor + +RUN pip install tensorflow==2.9.1 +RUN pip install -e ./taming-transformers +RUN pip install omegaconf>=2.0.0 pytorch-lightning>=1.0.8 torch-fidelity einops +RUN pip install transformers +RUN pip install dotmap +RUN pip install resize-right +RUN pip install piq +RUN pip install lpips +RUN pip install basicsr +RUN pip install facexlib +RUN pip install realesrgan +RUN pip install ipywidgets + +RUN git clone https://github.com/apolinario/Multi-Modal-Comparators --branch gradient_checkpointing +RUN pip install poetry +WORKDIR /src/Multi-Modal-Comparators +RUN poetry build; pip install dist/mmc*.whl +WORKDIR /src +RUN python Multi-Modal-Comparators/src/mmc/napm_installs/__init__.py + +VOLUME [ "/src/models" ] +VOLUME [ "/root/.cache" ] + +COPY majesty.py . +COPY latent.py . +COPY latent_settings_library . 
+ENTRYPOINT ["python", "latent.py"] \ No newline at end of file diff --git a/latent.ipynb b/latent.ipynb index 9423d7a..9ebb837 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -125,7 +125,7 @@ " !git clone https://github.com/multimodalart/latent-diffusion --branch 1.4\n", " !git clone https://github.com/CompVis/taming-transformers\n", " !git clone https://github.com/TencentARC/GFPGAN\n", - " !git clone https://github.com/multimodalart/majesty-diffusion\n", + " !git clone https://github.com/NightmareAI/majesty-diffusion --branch 1.4\n", " !git lfs clone https://github.com/LAION-AI/aesthetic-predictor\n", " !pip install -e ./taming-transformers\n", " !pip install omegaconf>=2.0.0 pytorch-lightning>=1.0.8 torch-fidelity einops\n", @@ -176,57 +176,11 @@ "outputs": [], "source": [ "#@title Download models\n", - "import os\n", - "if os.path.isfile(f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\"):\n", - " print(\"Using Latent Diffusion model saved from Google Drive\")\n", - "else: \n", - " !wget -O $model_path/latent_diffusion_txt2img_f8_large.ckpt https://ommer-lab.com/files/latent-diffusion/nitro/txt2img-f8-large/model.ckpt --no-check-certificate\n", - "\n", - "if os.path.isfile(f\"{model_path}/finetuned_state_dict.pt\"):\n", - " print(\"Using Latent Diffusion model saved from Google Drive\")\n", - "else: \n", - " !wget -O $model_path/finetuned_state_dict.pt https://huggingface.co/multimodalart/compvis-latent-diffusion-text2img-large/resolve/main/finetuned_state_dict.pt --no-check-certificate\n", - "\n", - "if os.path.isfile(f\"{model_path}/ava_vit_l_14_336_linear.pth\"):\n", - " print(\"Using ViT-L/14@336px aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/ava_vit_l_14_336_linear.pth https://multimodal.art/models/ava_vit_l_14_336_linear.pth\n", - "\n", - "if os.path.isfile(f\"{model_path}/sa_0_4_vit_l_14_linear.pth\"):\n", - " print(\"Using ViT-L/14 aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O 
$model_path/sa_0_4_vit_l_14_linear.pth https://multimodal.art/models/sa_0_4_vit_l_14_linear.pth\n", - "\n", - "if os.path.isfile(f\"{model_path}/ava_vit_l_14_linear.pth\"):\n", - " print(\"Using ViT-L/14 aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/ava_vit_l_14_linear.pth https://multimodal.art/models/ava_vit_l_14_linear.pth\n", - "\n", - "if os.path.isfile(f\"{model_path}/ava_vit_b_16_linear.pth\"):\n", - " print(\"Using ViT-B/16 aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/ava_vit_b_16_linear.pth http://batbot.tv/ai/models/v-diffusion/ava_vit_b_16_linear.pth\n", - "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_16_linear.pth\"):\n", - " print(\"Using ViT-B/16 sa aesthetic model already saved\")\n", - "else:\n", - " !wget -O $model_path/sa_0_4_vit_b_16_linear.pth https://multimodal.art/models/sa_0_4_vit_b_16_linear.pth\n", - "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"):\n", - " print(\"Using ViT-B/32 aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/sa_0_4_vit_b_32_linear.pth https://multimodal.art/models/sa_0_4_vit_b_32_linear.pth\n", - "if os.path.isfile(f\"{model_path}/openimages_512x_png_embed224.npz\"):\n", - " print(\"Using openimages png from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/openimages_512x_png_embed224.npz https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/openimages_512x_png_embed224.npz\n", - "if os.path.isfile(f\"{model_path}/imagenet_512x_jpg_embed224.npz\"):\n", - " print(\"Using imagenet antijpeg from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/imagenet_512x_jpg_embed224.npz https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/imagenet_512x_jpg_embed224.npz\n", - "if os.path.isfile(f\"{model_path}/GFPGANv1.3.pth\"):\n", - " print(\"Using GFPGAN v1.3 from Google Drive\")\n", - "else:\n", - " !wget -O 
$model_path/GFPGANv1.3.pth https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth\n", - "!cp $model_path/GFPGANv1.3.pth GFPGAN/experiments/pretrained_models/GFPGANv1.3.pth\n" + "import sys\n", + "sys.path.append('./majesty-diffusion')\n", + "import majesty\n", + "majesty.model_path = model_path\n", + "majesty.download_models()\n" ] }, { @@ -260,6 +214,8 @@ "source": [ "#@title Import stuff\n", "import argparse, os, sys, glob\n", + "sys.path.append('./majesty-diffusion')\n", + "import majesty\n", "import torch\n", "import numpy as np\n", "from omegaconf import OmegaConf\n", @@ -330,729 +286,19 @@ "outputs": [], "source": [ "#@title Load the model\n", + "majesty.model_path = model_path\n", + "majesty.outputs_path = outputs_path\n", "torch.backends.cudnn.benchmark = True\n", "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", - "def load_model_from_config(config, ckpt, verbose=False, latent_diffusion_model=\"original\"):\n", - " print(f\"Loading model from {ckpt}\")\n", - " print(latent_diffusion_model)\n", - " model = instantiate_from_config(config.model)\n", - " sd = torch.load(ckpt, map_location=\"cuda\")[\"state_dict\"]\n", - " m, u = model.load_state_dict(sd, strict = False)\n", - " if(latent_diffusion_model == \"finetuned\"): \n", - " del sd\n", - " sd_finetune = torch.load(f\"{model_path}/finetuned_state_dict.pt\",map_location=\"cuda\")\n", - " m, u = model.model.load_state_dict(sd_finetune, strict = False)\n", - " model.model = model.model.half().eval().to(device)\n", - " del sd_finetune\n", - " # sd = pl_sd[\"state_dict\"]\n", - " \n", - " if len(m) > 0 and verbose:\n", - " print(\"missing keys:\")\n", - " print(m)\n", - " if len(u) > 0 and verbose:\n", - " print(\"unexpected keys:\")\n", - " print(u)\n", - "\n", - " model.requires_grad_(False).half().eval().to('cuda')\n", - " return model\n", + "majesty.device = device\n", "\n", "config = 
OmegaConf.load(\"./latent-diffusion/configs/latent-diffusion/txt2img-1p4B-eval.yaml\") # TODO: Optionally download from same location as ckpt and chnage this logic\n", - "model = load_model_from_config(config, f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\",False, latent_diffusion_model) # TODO: check path\n", - "model = model.half().eval().to(device)\n", + "model = majesty.load_model_from_config(config, f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\",False, latent_diffusion_model) # TODO: check path\n", + "majesty.model = model.half().eval().to(device)\n", "#if(latent_diffusion_model == \"finetuned\"):\n", - "# model.model = model.model.half().eval().to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "HY_7vvnPThzS" - }, - "outputs": [], - "source": [ - "#@title Load necessary functions\n", - "def set_custom_schedules(schedule):\n", - " custom_schedules = []\n", - " for schedule_item in schedule:\n", - " if(isinstance(schedule_item,list)):\n", - " custom_schedules.append(np.arange(*schedule_item))\n", - " else:\n", - " custom_schedules.append(schedule_item)\n", - " \n", - " return custom_schedules\n", - "\n", - "def parse_prompt(prompt):\n", - " if prompt.startswith('http://') or prompt.startswith('https://') or prompt.startswith(\"E:\") or prompt.startswith(\"C:\") or prompt.startswith(\"D:\"):\n", - " vals = prompt.rsplit(':', 2)\n", - " vals = [vals[0] + ':' + vals[1], *vals[2:]]\n", - " else:\n", - " vals = prompt.rsplit(':', 1)\n", - " vals = vals + ['', '1'][len(vals):]\n", - " return vals[0], float(vals[1])\n", - "\n", - "\n", - "class MakeCutouts(nn.Module):\n", - " def __init__(self, cut_size,\n", - " Overview=4, \n", - " WholeCrop = 0, WC_Allowance = 10, WC_Grey_P=0.2,\n", - " InnerCrop = 0, IC_Size_Pow=0.5, IC_Grey_P = 0.2,\n", - " cut_blur_n = 0\n", - " ):\n", - " super().__init__()\n", - " self.cut_size = cut_size\n", - " self.Overview = Overview\n", - " 
self.WholeCrop= WholeCrop\n", - " self.WC_Allowance = WC_Allowance\n", - " self.WC_Grey_P = WC_Grey_P\n", - " self.InnerCrop = InnerCrop\n", - " self.IC_Size_Pow = IC_Size_Pow\n", - " self.IC_Grey_P = IC_Grey_P\n", - " self.cut_blur_n = cut_blur_n\n", - " self.augs = T.Compose([\n", - " #T.RandomHorizontalFlip(p=0.5),\n", - " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", - " T.RandomAffine(degrees=0, \n", - " translate=(0.05, 0.05), \n", - " #scale=(0.9,0.95),\n", - " fill=-1, interpolation = T.InterpolationMode.BILINEAR, ),\n", - " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", - " #T.RandomPerspective(p=1, interpolation = T.InterpolationMode.BILINEAR, fill=-1,distortion_scale=0.2),\n", - " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", - " T.RandomGrayscale(p=0.1),\n", - " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", - " T.ColorJitter(brightness=0.05, contrast=0.05, saturation=0.05),\n", - " ])\n", - "\n", - " def forward(self, input):\n", - " gray = transforms.Grayscale(3)\n", - " sideY, sideX = input.shape[2:4]\n", - " max_size = min(sideX, sideY)\n", - " min_size = min(sideX, sideY, self.cut_size)\n", - " l_size = max(sideX, sideY)\n", - " output_shape = [input.shape[0],3,self.cut_size,self.cut_size] \n", - " output_shape_2 = [input.shape[0],3,self.cut_size+2,self.cut_size+2]\n", - " pad_input = F.pad(input,((sideY-max_size)//2+round(max_size*0.055),(sideY-max_size)//2+round(max_size*0.055),(sideX-max_size)//2+round(max_size*0.055),(sideX-max_size)//2+round(max_size*0.055)), **padargs)\n", - " cutouts_list = []\n", - " \n", - " if self.Overview>0:\n", - " cutouts = []\n", - " cutout = resize(pad_input, out_shape=output_shape, antialiasing=True)\n", - " output_shape_all = list(output_shape)\n", - " output_shape_all[0]=self.Overview*input.shape[0]\n", - " pad_input = pad_input.repeat(input.shape[0],1,1,1)\n", - " cutout = resize(pad_input, out_shape=output_shape_all)\n", - " if aug: cutout=self.augs(cutout)\n", - " if 
self.cut_blur_n > 0: cutout[0:self.cut_blur_n,:,:,:] = TF.gaussian_blur(cutout[0:self.cut_blur_n,:,:,:],cut_blur_kernel)\n", - " cutouts_list.append(cutout)\n", - " \n", - " if self.InnerCrop >0:\n", - " cutouts=[]\n", - " for i in range(self.InnerCrop):\n", - " size = int(torch.rand([])**self.IC_Size_Pow * (max_size - min_size) + min_size)\n", - " offsetx = torch.randint(0, sideX - size + 1, ())\n", - " offsety = torch.randint(0, sideY - size + 1, ())\n", - " cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size]\n", - " if i <= int(self.IC_Grey_P * self.InnerCrop):\n", - " cutout = gray(cutout)\n", - " cutout = resize(cutout, out_shape=output_shape)\n", - " cutouts.append(cutout)\n", - " if cutout_debug:\n", - " TF.to_pil_image(cutouts[-1].add(1).div(2).clamp(0, 1).squeeze(0)).save(\"content/diff/cutouts/cutout_InnerCrop.jpg\",quality=99)\n", - " cutouts_tensor = torch.cat(cutouts)\n", - " cutouts=[]\n", - " cutouts_list.append(cutouts_tensor)\n", - " cutouts=torch.cat(cutouts_list)\n", - " return cutouts\n", - "\n", - "\n", - "def spherical_dist_loss(x, y):\n", - " x = F.normalize(x, dim=-1)\n", - " y = F.normalize(y, dim=-1)\n", - " return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)\n", - "\n", - "\n", - "def tv_loss(input):\n", - " \"\"\"L2 total variation loss, as in Mahendran et al.\"\"\"\n", - " input = F.pad(input, (0, 1, 0, 1), 'replicate')\n", - " x_diff = input[..., :-1, 1:] - input[..., :-1, :-1]\n", - " y_diff = input[..., 1:, :-1] - input[..., :-1, :-1]\n", - " return (x_diff**2 + y_diff**2).mean([1, 2, 3])\n", - "\n", - "\n", - "def range_loss(input, range_min, range_max):\n", - " return (input - input.clamp(range_min,range_max)).pow(2).mean([1, 2, 3])\n", - "\n", - "def symmetric_loss(x):\n", - " w = x.shape[3]\n", - " diff = (x - torch.flip(x,[3])).square().mean().sqrt()/(x.shape[2]*x.shape[3]/1e4)\n", - " return(diff)\n", - "\n", - "def fetch(url_or_path):\n", - " \"\"\"Fetches a file from an HTTP or HTTPS url, or opens the 
local file.\"\"\"\n", - " if str(url_or_path).startswith('http://') or str(url_or_path).startswith('https://'):\n", - " r = requests.get(url_or_path)\n", - " r.raise_for_status()\n", - " fd = io.BytesIO()\n", - " fd.write(r.content)\n", - " fd.seek(0)\n", - " return fd\n", - " return open(url_or_path, 'rb')\n", - "\n", - "\n", - "def to_pil_image(x):\n", - " \"\"\"Converts from a tensor to a PIL image.\"\"\"\n", - " if x.ndim == 4:\n", - " assert x.shape[0] == 1\n", - " x = x[0]\n", - " if x.shape[0] == 1:\n", - " x = x[0]\n", - " return TF.to_pil_image((x.clamp(-1, 1) + 1) / 2)\n", - "\n", - "\n", - "normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],\n", - " std=[0.26862954, 0.26130258, 0.27577711])\n", - "\n", - "def centralized_grad(x, use_gc=True, gc_conv_only=False):\n", - " if use_gc:\n", - " if gc_conv_only:\n", - " if len(list(x.size())) > 3:\n", - " x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))\n", - " else:\n", - " if len(list(x.size())) > 1:\n", - " x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))\n", - " return x\n", - "\n", - "def cond_fn(x, t):\n", - " global cur_step\n", - " cur_step += 1\n", - " t=1000-t\n", - " t=t[0]\n", - " with torch.enable_grad():\n", - " global clamp_start_, clamp_max\n", - " x = x.detach()\n", - " x = x.requires_grad_()\n", - " x_in = model.decode_first_stage(x)\n", - " display_handler(x_in,t,1,False)\n", - " n = x_in.shape[0]\n", - " clip_guidance_scale = clip_guidance_index[t]\n", - " make_cutouts = {}\n", - " #rx_in_grad = torch.zeros_like(x_in)\n", - " for i in clip_list:\n", - " make_cutouts[i] = MakeCutouts(clip_size[i],\n", - " Overview= cut_overview[t], \n", - " InnerCrop = cut_innercut[t], \n", - " IC_Size_Pow=cut_ic_pow, IC_Grey_P = cut_icgray_p[t],\n", - " cut_blur_n = cut_blur_n[t]\n", - " )\n", - " cutn = cut_overview[t]+cut_innercut[t]\n", - " for j in range(cutn_batches):\n", - " losses=0\n", - " for i in clip_list:\n", - " clip_in = 
clip_normalize[i](make_cutouts[i](x_in.add(1).div(2)).to(\"cuda\"))\n", - " image_embeds = clip_model[i].encode_image(clip_in).float().unsqueeze(0).expand([target_embeds[i].shape[0],-1,-1])\n", - " target_embeds_temp = target_embeds[i]\n", - " if i == 'ViT-B-32--openai' and experimental_aesthetic_embeddings:\n", - " aesthetic_embedding = torch.from_numpy(np.load(f'aesthetic-predictor/vit_b_32_embeddings/rating{experimental_aesthetic_embeddings_score}.npy')).to(device) \n", - " aesthetic_query = target_embeds_temp + aesthetic_embedding * experimental_aesthetic_embeddings_weight\n", - " target_embeds_temp = (aesthetic_query) / torch.linalg.norm(aesthetic_query)\n", - " if i == 'ViT-L-14--openai' and experimental_aesthetic_embeddings:\n", - " aesthetic_embedding = torch.from_numpy(np.load(f'aesthetic-predictor/vit_l_14_embeddings/rating{experimental_aesthetic_embeddings_score}.npy')).to(device) \n", - " aesthetic_query = target_embeds_temp + aesthetic_embedding * experimental_aesthetic_embeddings_weight\n", - " target_embeds_temp = (aesthetic_query) / torch.linalg.norm(aesthetic_query)\n", - " target_embeds_temp = target_embeds_temp.unsqueeze(1).expand([-1,cutn*n,-1]) \n", - " dists = spherical_dist_loss(image_embeds, target_embeds_temp)\n", - " dists = dists.mean(1).mul(weights[i].squeeze()).mean()\n", - " losses+=dists*clip_guidance_scale * (2 if i in [\"ViT-L-14-336--openai\", \"RN50x64--openai\", \"ViT-B-32--laion2b_e16\"] else (.4 if \"cloob\" in i else 1))\n", - " if i == \"ViT-L-14-336--openai\" and aes_scale !=0:\n", - " aes_loss = (aesthetic_model_336(F.normalize(image_embeds, dim=-1))).mean() \n", - " losses -= aes_loss * aes_scale \n", - " if i == \"ViT-L-14--openai\" and aes_scale !=0:\n", - " aes_loss = (aesthetic_model_224(F.normalize(image_embeds, dim=-1))).mean() \n", - " losses -= aes_loss * aes_scale \n", - " if i == \"ViT-B-16--openai\" and aes_scale !=0:\n", - " aes_loss = (aesthetic_model_16(F.normalize(image_embeds, dim=-1))).mean() \n", - " 
losses -= aes_loss * aes_scale \n", - " if i == \"ViT-B-32--openai\" and aes_scale !=0:\n", - " aes_loss = (aesthetic_model_32(F.normalize(image_embeds, dim=-1))).mean()\n", - " losses -= aes_loss * aes_scale\n", - " #x_in_grad += torch.autograd.grad(losses, x_in)[0] / cutn_batches / len(clip_list)\n", - " #losses += dists\n", - " #losses = losses / len(clip_list) \n", - " #gc.collect()\n", - " \n", - " tv_losses = tv_loss(x).sum() * tv_scales[0] +\\\n", - " tv_loss(F.interpolate(x, scale_factor= 1/2)).sum()* tv_scales[1] + \\\n", - " tv_loss(F.interpolate(x, scale_factor = 1/4)).sum()* tv_scales[2] + \\\n", - " tv_loss(F.interpolate(x, scale_factor = 1/8)).sum()* tv_scales[3] \n", - " range_scale= range_index[t]\n", - " range_losses = range_loss(x_in,RGB_min,RGB_max).sum() * range_scale\n", - " var_scale = var_index[t]\n", - " loss = tv_losses + range_losses + losses\n", - " #del losses\n", - " if symmetric_loss_scale != 0: loss += symmetric_loss(x_in) * symmetric_loss_scale\n", - " if init_image is not None and init_scale:\n", - " lpips_loss = (lpips_model(x_in, init) * init_scale).squeeze().mean()\n", - " #print(lpips_loss)\n", - " loss += lpips_loss\n", - " #loss_grad = torch.autograd.grad(loss, x_in, )[0]\n", - " #x_in_grad += loss_grad\n", - " #grad = -torch.autograd.grad(x_in, x, x_in_grad)[0]\n", - " loss.backward()\n", - " grad = -x.grad\n", - " grad = torch.nan_to_num(grad, nan=0.0, posinf=0, neginf=0)\n", - " if grad_center: grad = centralized_grad(grad, use_gc=True, gc_conv_only=False)\n", - " mag = grad.square().mean().sqrt()\n", - " if mag==0 or torch.isnan(mag):\n", - " print(\"ERROR\")\n", - " print(t)\n", - " return(grad)\n", - " if t>=0:\n", - " if active_function == \"softsign\":\n", - " grad = F.softsign(grad*grad_scale/mag)\n", - " if active_function == \"tanh\":\n", - " grad = (grad/mag*grad_scale).tanh()\n", - " if active_function==\"clamp\":\n", - " grad = grad.clamp(-mag*grad_scale*2,mag*grad_scale*2)\n", - " if grad.abs().max()>0:\n", - " 
grad=grad/grad.abs().max()*opt.mag_mul\n", - " magnitude = grad.square().mean().sqrt()\n", - " else:\n", - " return(grad)\n", - " clamp_max = clamp_index_variation[t]\n", - " #print(magnitude, end = \"\\r\")\n", - " grad = grad* magnitude.clamp(max= clamp_max) /magnitude#0.2\n", - " grad = grad.detach()\n", - " grad = grad_fn(grad,t)\n", - " x = x.detach()\n", - " x = x.requires_grad_()\n", - " var = x.var()\n", - " var_losses = (var.pow(2).clamp(min = 1)- 1) * var_scale\n", - " var_losses.backward()\n", - " grad -= x.grad\n", - " print(grad.abs().mean(), x.grad.abs().mean(), end = \"\\r\")\n", - " return grad\n", - "\n", - "def null_fn(x_in):\n", - " return(torch.zeros_like(x_in))\n", - "\n", - "def display_handler(x,i,cadance = 5, decode = True):\n", - " global progress, image_grid, writer, img_tensor, im\n", - " img_tensor = x\n", - " if i%cadance==0:\n", - " if decode: \n", - " x = model.decode_first_stage(x)\n", - " grid = make_grid(torch.clamp((x+1.0)/2.0, min=0.0, max=1.0),round(x.shape[0]**0.5))\n", - " grid = 255. * rearrange(grid, 'c h w -> h w c').detach().cpu().numpy()\n", - " image_grid = grid.copy(order = \"C\") \n", - " with io.BytesIO() as output:\n", - " im = Image.fromarray(grid.astype(np.uint8))\n", - " im.save(output, format = \"PNG\")\n", - " progress.value = output.getvalue()\n", - " if generate_video:\n", - " im.save(p.stdin, 'PNG')\n", - "\n", - "\n", - "def grad_fn(x,t):\n", - " if t <= 500 and grad_blur: x = TF.gaussian_blur(x, 2*round(int(max(grad_blur-t/150, 1)))-1, 1.5)\n", - " return x\n", - "def cond_clamp(image,t): \n", - " t = 1000-t[0]\n", - " if t<= max(punish_steps, compress_steps):\n", - " s = torch.quantile(\n", - " rearrange(image, 'b ... 
-> b (...)').abs(),\n", - " threshold_percentile,\n", - " dim = -1\n", - " )\n", - " s = s.view(-1, *((1,) * (image.ndim - 1)))\n", - " ths = s.clamp(min = threshold)\n", - " im_max = image.clamp(min = ths) - image.clamp(min = ths, max = ths)\n", - " im_min = image.clamp(max = -ths, min = -ths) - image.clamp(max = -ths)\n", - " if t<=punish_steps:\n", - " image = image.clamp(min = -ths, max = ths)+(im_max-im_min) * punish_factor #((im_max-im_min)*punish_factor).tanh()/punish_factor \n", - " if t<= compress_steps:\n", - " image = image / (ths/threshold)**compress_factor\n", - " image += noise_like(image.shape,device,False) * ((ths/threshold)**compress_factor - 1)\n", - " return(image)\n", - "\n", - "def make_schedule(t_start, t_end, step_size=1):\n", - " schedule = []\n", - " par_schedule = []\n", - " t = t_start\n", - " while t > t_end:\n", - " schedule.append(t)\n", - " t -= step_size\n", - " schedule.append(t_end)\n", - " return np.array(schedule)\n", - "\n", - "lpips_model = lpips.LPIPS(net='vgg').to(device)\n", - "\n", - "def list_mul_to_array(list_mul):\n", - " i = 0\n", - " mul_count = 0\n", - " mul_string = ''\n", - " full_list = list_mul\n", - " full_list_len = len(full_list)\n", - " for item in full_list:\n", - " if(i == 0):\n", - " last_item = item\n", - " if(item == last_item):\n", - " mul_count+=1\n", - " if(item != last_item or full_list_len == i+1):\n", - " mul_string = mul_string + f' [{last_item}]*{mul_count} +'\n", - " mul_count=1\n", - " last_item = item\n", - " i+=1\n", - " return(mul_string[1:-2])\n", - "\n", - "def generate_settings_file(add_prompts=False, add_dimensions=False):\n", - " \n", - " if(add_prompts):\n", - " prompts = f'''\n", - " clip_prompts = {clip_prompts}\n", - " latent_prompts = {latent_prompts}\n", - " latent_negatives = {latent_negatives}\n", - " image_prompts = {image_prompts}\n", - " '''\n", - " else:\n", - " prompts = ''\n", - "\n", - " if(add_dimensions):\n", - " dimensions = f'''width = {width}\n", - " height = 
{height}\n", - " '''\n", - " else:\n", - " dimensions = ''\n", - " settings = f'''\n", - " #This settings file can be loaded back to Latent Majesty Diffusion. If you like your setting consider sharing it to the settings library at https://github.com/multimodalart/MajestyDiffusion\n", - " [clip_list]\n", - " perceptors = {clip_load_list}\n", - " \n", - " [basic_settings]\n", - " #Perceptor things\n", - " {prompts}\n", - " {dimensions}\n", - " latent_diffusion_guidance_scale = {latent_diffusion_guidance_scale}\n", - " clip_guidance_scale = {clip_guidance_scale}\n", - " aesthetic_loss_scale = {aesthetic_loss_scale}\n", - " augment_cuts={augment_cuts}\n", - "\n", - " #Init image settings\n", - " starting_timestep = {starting_timestep}\n", - " init_scale = {init_scale} \n", - " init_brightness = {init_brightness}\n", - " init_noise = {init_noise}\n", - "\n", - " [advanced_settings]\n", - " #Add CLIP Guidance and all the flavors or just run normal Latent Diffusion\n", - " use_cond_fn = {use_cond_fn}\n", - "\n", - " #Custom schedules for cuts. Check out the schedules documentation here\n", - " custom_schedule_setting = {custom_schedule_setting}\n", - "\n", - " #Cut settings\n", - " clamp_index = {clamp_index}\n", - " cut_overview = {list_mul_to_array(cut_overview)}\n", - " cut_innercut = {list_mul_to_array(cut_innercut)}\n", - " cut_blur_n = {list_mul_to_array(cut_blur_n)}\n", - " cut_blur_kernel = {cut_blur_kernel}\n", - " cut_ic_pow = {cut_ic_pow}\n", - " cut_icgray_p = {list_mul_to_array(cut_icgray_p)}\n", - " cutn_batches = {cutn_batches}\n", - " range_index = {list_mul_to_array(range_index)}\n", - " active_function = \"{active_function}\"\n", - " ths_method= \"{ths_method}\"\n", - " tv_scales = {list_mul_to_array(tv_scales)}\n", - " latent_tv_loss = {latent_tv_loss}\n", - "\n", - " #If you uncomment this line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale will be used\n", - " clip_guidance_schedule = {list_mul_to_array(clip_guidance_index)}\n", - " \n", - " #Apply symmetric loss (force simmetry to your results)\n", - " symmetric_loss_scale = {symmetric_loss_scale} \n", - "\n", - " #Latent Diffusion Advanced Settings\n", - " #Use when latent upscale to correct satuation problem\n", - " scale_div = {scale_div}\n", - " #Magnify grad before clamping by how many times\n", - " opt_mag_mul = {opt_mag_mul}\n", - " opt_ddim_eta = {opt_ddim_eta}\n", - " opt_eta_end = {opt_eta_end}\n", - " opt_temperature = {opt_temperature}\n", - "\n", - " #Grad advanced settings\n", - " grad_center = {grad_center}\n", - " #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", - " grad_scale={grad_scale} \n", - " score_modifier = {score_modifier}\n", - " threshold_percentile = {threshold_percentile}\n", - " threshold = {threshold}\n", - " var_index = {list_mul_to_array(var_index)}\n", - " \n", - " #Init image advanced settings\n", - " init_rotate={init_rotate}\n", - " mask_rotate={mask_rotate}\n", - " init_magnitude = {init_magnitude}\n", - "\n", - " #More settings\n", - " RGB_min = {RGB_min}\n", - " RGB_max = {RGB_max}\n", - " #How to pad the image with cut_overview\n", - " padargs = {padargs} \n", - " flip_aug={flip_aug}\n", - " \n", - " #Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", - " experimental_aesthetic_embeddings = {experimental_aesthetic_embeddings}\n", - " #How much you want this to influence your result\n", - " experimental_aesthetic_embeddings_weight = {experimental_aesthetic_embeddings_weight}\n", - " #9 are good aesthetic embeddings, 0 are bad ones\n", - " experimental_aesthetic_embeddings_score = {experimental_aesthetic_embeddings_score}\n", - "\n", - " # For fun dont change except if you really know what your are doing\n", - " grad_blur = {grad_blur}\n", - " compress_steps = {compress_steps}\n", 
- " compress_factor = {compress_factor}\n", - " punish_steps = {punish_steps}\n", - " punish_factor = {punish_factor}\n", - " '''\n", - " return(settings)\n", - "\n", - "#Alstro's aesthetic model\n", - "aesthetic_model_336 = torch.nn.Linear(768,1).cuda()\n", - "aesthetic_model_336.load_state_dict(torch.load(f\"{model_path}/ava_vit_l_14_336_linear.pth\"))\n", - "\n", - "aesthetic_model_224 = torch.nn.Linear(768,1).cuda()\n", - "aesthetic_model_224.load_state_dict(torch.load(f\"{model_path}/ava_vit_l_14_linear.pth\"))\n", - "\n", - "aesthetic_model_16 = torch.nn.Linear(512,1).cuda()\n", - "aesthetic_model_16.load_state_dict(torch.load(f\"{model_path}/ava_vit_b_16_linear.pth\"))\n", - "\n", - "aesthetic_model_32 = torch.nn.Linear(512,1).cuda()\n", - "aesthetic_model_32.load_state_dict(torch.load(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"))\n", - "\n", - "has_purged = False\n", - "def do_run():\n", - " global has_purged\n", - " if(has_purged):\n", - " global clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", - " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", - " has_purged = False\n", - " # with torch.cuda.amp.autocast():\n", - " global progress,target_embeds, weights, zero_embed, init, scale_factor, cur_step\n", - " cur_step = 0\n", - " scale_factor = 1\n", - " make_cutouts = {}\n", - " for i in clip_list:\n", - " make_cutouts[i] = MakeCutouts(clip_size[i],Overview=1)\n", - " target_embeds, weights ,zero_embed = {}, {}, {}\n", - " for i in clip_list:\n", - " target_embeds[i] = []\n", - " weights[i]=[]\n", - "\n", - " for prompt in prompts:\n", - " txt, weight = parse_prompt(prompt)\n", - " for i in clip_list:\n", - " if \"cloob\" not in i:\n", - " with torch.cuda.amp.autocast():\n", - " embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device))\n", - " target_embeds[i].append(embeds)\n", - " weights[i].append(weight)\n", - " else:\n", - " embeds = 
clip_model[i].encode_text(clip_tokenize[i](txt).to(device))\n", - " target_embeds[i].append(embeds)\n", - " weights[i].append(weight)\n", - "\n", - " for prompt in image_prompts:\n", - " print(f\"processing{prompt}\",end=\"\\r\")\n", - " path, weight = parse_prompt(prompt)\n", - " img = Image.open(fetch(path)).convert('RGB')\n", - " img = TF.resize(img, min(opt.W, opt.H, *img.size), transforms.InterpolationMode.LANCZOS)\n", - " for i in clip_list:\n", - " if \"cloob\" not in i:\n", - " with torch.cuda.amp.autocast():\n", - " batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device))\n", - " embed = clip_model[i].encode_image(clip_normalize[i](batch))\n", - " target_embeds[i].append(embed)\n", - " weights[i].extend([weight])\n", - " else:\n", - " batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device))\n", - " embed = clip_model[i].encode_image(clip_normalize[i](batch))\n", - " target_embeds[i].append(embed)\n", - " weights[i].extend([weight])\n", - " #if anti_jpg != 0:\n", - " # target_embeds[\"ViT-B-32--openai\"].append(torch.tensor([np.load(f\"{model_path}/openimages_512x_png_embed224.npz\")['arr_0']-np.load(f\"{model_path}/imagenet_512x_jpg_embed224.npz\")['arr_0']], device = device))\n", - " # weights[\"ViT-B-32--openai\"].append(anti_jpg)\n", - "\n", - " for i in clip_list:\n", - " target_embeds[i] = torch.cat(target_embeds[i])\n", - " weights[i] = torch.tensor([weights[i]], device=device)\n", - " shape = [4, opt.H//8, opt.W//8]\n", - " init = None\n", - " mask = None\n", - " transform = T.GaussianBlur(kernel_size=3, sigma=0.4)\n", - " if init_image is not None:\n", - " init = Image.open(fetch(init_image)).convert('RGB')\n", - " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", - " if init_rotate: init = torch.rot90(init, 1, [3,2]) \n", - " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W])\n", - " init = init.mul(2).sub(1).half()\n", - " init_encoded = model.first_stage_model.encode(init).sample()* init_magnitude + 
init_brightness\n", - " init_encoded = init_encoded + noise_like(init_encoded.shape,device,False).mul(init_noise)\n", - " else:\n", - " init = None\n", - " init_encoded = None\n", - " if init_mask is not None:\n", - " mask = Image.open(fetch(init_mask)).convert('RGB')\n", - " mask = TF.to_tensor(mask).to(device).unsqueeze(0)\n", - " if mask_rotate: mask = torch.rot90(init, 1, [3,2]) \n", - " mask = resize(mask,out_shape = [opt.n_samples,1,opt.H//8, opt.W//8])\n", - " mask = transform(mask)\n", - " print(mask)\n", - "\n", - "\n", - " progress = widgets.Image(layout = widgets.Layout(max_width = \"400px\",max_height = \"512px\"))\n", - " display.display(progress)\n", - "\n", - " if opt.plms:\n", - " sampler = PLMSSampler(model)\n", - " else:\n", - " sampler = DDIMSampler(model)\n", - "\n", - " os.makedirs(opt.outdir, exist_ok=True)\n", - " outpath = opt.outdir\n", - "\n", - " prompt = opt.prompt\n", - " sample_path = os.path.join(outpath, \"samples\")\n", - " os.makedirs(sample_path, exist_ok=True)\n", - " base_count = len(os.listdir(sample_path))\n", - "\n", - " all_samples=list()\n", - " last_step_upscale = False\n", - " eta1 = opt.ddim_eta\n", - " eta2 = opt.eta_end\n", - " with torch.enable_grad():\n", - " with torch.cuda.amp.autocast():\n", - " with model.ema_scope():\n", - " uc = None\n", - " if opt.scale != 1.0:\n", - " uc = model.get_learned_conditioning(opt.n_samples * opt.uc).cuda()\n", - " \n", - " for n in trange(opt.n_iter, desc=\"Sampling\"):\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " c = model.get_learned_conditioning(opt.n_samples * prompt).cuda()\n", - " if init_encoded is None:\n", - " x_T = torch.randn([opt.n_samples,*shape], device=device)\n", - " else:\n", - " x_T = init_encoded\n", - " last_step_uspcale_list = []\n", - " \n", - " for custom_schedule in custom_schedules:\n", - " if type(custom_schedule) != type(\"\"):\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " last_step_upscale = False\n", - " samples_ddim, _ 
= sampler.sample(S=opt.ddim_steps,\n", - " conditioning=c,\n", - " batch_size=opt.n_samples,\n", - " shape=shape,\n", - " custom_schedule = custom_schedule,\n", - " verbose=False,\n", - " unconditional_guidance_scale=opt.scale,\n", - " unconditional_conditioning=uc,\n", - " eta=eta1,\n", - " eta_end = eta2,\n", - " img_callback=None if use_cond_fn else display_handler,\n", - " cond_fn=cond_fn if use_cond_fn else None,\n", - " temperature = opt.temperature,\n", - " x_adjust_fn=cond_clamp,\n", - " x_T = x_T,\n", - " x0=x_T,\n", - " mask=mask,\n", - " score_corrector = score_corrector,\n", - " corrector_kwargs = {}\n", - " )\n", - " x_T = samples_ddim.clamp(-6,6)\n", - " else:\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " method, scale_factor = custom_schedule.split(\":\")\n", - " if method == \"RGB\":\n", - " scale_factor = float(scale_factor)\n", - " temp_file_name = \"temp_\"+f\"{str(round(time.time()))}.png\"\n", - " temp_file = os.path.join(sample_path, temp_file_name)\n", - " im.save(temp_file, format = \"PNG\")\n", - " init = Image.open(fetch(temp_file)).convert('RGB')\n", - " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", - " opt.H, opt.W = opt.H*scale_factor, opt.W*scale_factor\n", - " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W], antialiasing=True)\n", - " init = init.mul(2).sub(1).half()\n", - " x_T = (model.first_stage_model.encode(init).sample()*init_magnitude)\n", - " x_T += noise_like(x_T.shape,device,False)*init_noise\n", - " x_T = x_T.clamp(-6,6)\n", - " if method == \"gfpgan\":\n", - " scale_factor = float(scale_factor)\n", - " last_step_upscale = True\n", - " temp_file_name = \"temp_\"+f\"{str(round(time.time()))}.png\"\n", - " temp_file = os.path.join(sample_path, temp_file_name)\n", - " im.save(temp_file, format = \"PNG\")\n", - " GFP_factor = 2 if scale_factor > 1 else 1\n", - " GFP_ver = 1.3 #if GFP_factor == 1 else 1.2\n", - " %cd GFPGAN\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " 
!python inference_gfpgan.py -i $temp_file -o results -v $GFP_ver -s $GFP_factor\n", - " %cd ..\n", - " face_corrected = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\"))\n", - " with io.BytesIO() as output:\n", - " face_corrected.save(output,format=\"PNG\")\n", - " progress.value = output.getvalue()\n", - " init = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\")).convert('RGB')\n", - " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", - " opt.H, opt.W = opt.H*scale_factor, opt.W*scale_factor\n", - " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W], antialiasing=True)\n", - " init = init.mul(2).sub(1).half()\n", - " x_T = (model.first_stage_model.encode(init).sample()*init_magnitude)\n", - " x_T += noise_like(x_T.shape,device,False)*init_noise\n", - " x_T = x_T.clamp(-6,6)\n", - " if method == \"purge\":\n", - " has_purged = True\n", - " for i in scale_factor.split(\",\"):\n", - " if i in clip_load_list:\n", - " arch, pub, m_id = i[1:-1].split(' - ')\n", - " print(\"Purge \",i)\n", - " del clip_list[clip_list.index(m_id)]\n", - " del clip_model[m_id]\n", - " del clip_size[m_id]\n", - " del clip_tokenize[m_id]\n", - " del clip_normalize[m_id]\n", - " #last_step_uspcale_list.append(last_step_upscale)\n", - " scale_factor = 1\n", - " current_time = str(round(time.time()))\n", - " if(last_step_upscale):\n", - " latest_upscale = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\")).convert('RGB')\n", - " latest_upscale.save(os.path.join(outpath, f'{current_time}.png'), format = \"PNG\")\n", - " else:\n", - " Image.fromarray(image_grid.astype(np.uint8)).save(os.path.join(outpath, f'{current_time}.png'), format = \"PNG\")\n", - " settings = generate_settings_file(add_prompts=True, add_dimensions=False)\n", - " text_file = open(f\"{outpath}/{current_time}.cfg\", \"w\")\n", - " text_file.write(settings)\n", - " text_file.close()\n", - " x_samples_ddim = model.decode_first_stage(samples_ddim)\n", - " 
x_samples_ddim = torch.clamp((x_samples_ddim+1.0)/2.0, min=0.0, max=1.0)\n", - " all_samples.append(x_samples_ddim)\n", - "\n", - "\n", - " if(len(all_samples) > 1):\n", - " # additionally, save as grid\n", - " grid = torch.stack(all_samples, 0)\n", - " grid = rearrange(grid, 'n b c h w -> (n b) c h w')\n", - " grid = make_grid(grid, nrow=opt.n_samples)\n", - "\n", - " # to image\n", - " grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()\n", - " Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid_{str(round(time.time()))}.png'))" + "# model.model = model.model.half().eval().to(device)\n", + "majesty.load_lpips_model()\n", + "majesty.load_aesthetic_model()" ] }, { @@ -1151,63 +397,7 @@ " clip_load_list.append(model3)\n", "\n", "\n", - "i = 0\n", - "from mmc.multimmc import MultiMMC\n", - "from mmc.modalities import TEXT, IMAGE\n", - "temp_perceptor = MultiMMC(TEXT, IMAGE)\n", - "\n", - "def get_mmc_models(clip_load_list):\n", - " mmc_models = []\n", - " for model_key in clip_load_list:\n", - " if not model_key:\n", - " continue\n", - " arch, pub, m_id = model_key[1:-1].split(' - ')\n", - " mmc_models.append({\n", - " 'architecture':arch,\n", - " 'publisher':pub,\n", - " 'id':m_id,\n", - " })\n", - " return mmc_models\n", - "mmc_models = get_mmc_models(clip_load_list)\n", - "\n", - "import mmc\n", - "from mmc.registry import REGISTRY\n", - "import mmc.loaders # force trigger model registrations\n", - "from mmc.mock.openai import MockOpenaiClip\n", - "\n", - "normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],\n", - " std=[0.26862954, 0.26130258, 0.27577711])\n", - "\n", - "\n", - "def load_clip_models(mmc_models):\n", - " clip_model, clip_size, clip_tokenize, clip_normalize= {},{},{},{}\n", - " clip_list = []\n", - " for item in mmc_models:\n", - " print(\"Loaded \", item[\"id\"])\n", - " clip_list.append(item[\"id\"])\n", - " model_loaders = REGISTRY.find(**item)\n", - " for model_loader in model_loaders:\n", - 
" clip_model_loaded = model_loader.load()\n", - " clip_model[item[\"id\"]] = MockOpenaiClip(clip_model_loaded)\n", - " clip_size[item[\"id\"]] = clip_model[item[\"id\"]].visual.input_resolution\n", - " clip_tokenize[item[\"id\"]] = clip_model[item[\"id\"]].preprocess_text()\n", - " clip_normalize[item[\"id\"]] = normalize\n", - " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", - "\n", - "\n", - "def full_clip_load(clip_load_list):\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " try:\n", - " del clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", - " except:\n", - " pass\n", - " mmc_models = get_mmc_models(clip_load_list)\n", - " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = load_clip_models(mmc_models)\n", - " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", - "\n", - "clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", - "clip_load_list_universal = clip_load_list\n", + "majesty.clip_load_list = clip_load_list\n", "torch.cuda.empty_cache()\n", "gc.collect()" ] @@ -1233,78 +423,78 @@ "opt = DotMap()\n", "\n", "#Change it to false to not use CLIP Guidance at all \n", - "use_cond_fn = True\n", + "majesty.use_cond_fn = True\n", "\n", "#Custom cut schedules and super-resolution. 
Check out the guide on how to use it a https://multimodal.art/majestydiffusion\n", - "custom_schedule_setting = [\n", + "majesty.custom_schedule_setting = [\n", " [50,1000,8],\n", " \"gfpgan:1.5\",\n", " [5,200,5],\n", "]\n", " \n", "#Cut settings\n", - "clamp_index = [2,1.4] #linear variation of the index for clamping the gradient \n", - "cut_overview = [8]*500 + [4]*500\n", - "cut_innercut = [0]*500 + [4]*500\n", - "cut_ic_pow = .2\n", - "cut_icgray_p = [.1]*300+[0]*1000\n", - "cutn_batches = 1\n", - "cut_blur_n = [0]*400 + [0]*600\n", - "cut_blur_kernel = 3\n", - "range_index = [0]*1000\n", - "active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", - "ths_method = \"softsign\"\n", - "tv_scales = [600]*1+[50]*1 +[0]*2\n", - "latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", + "majesty.clamp_index = [2,1.4] #linear variation of the index for clamping the gradient \n", + "majesty.cut_overview = [8]*500 + [4]*500\n", + "majesty.cut_innercut = [0]*500 + [4]*500\n", + "majesty.cut_ic_pow = .2\n", + "majesty.cut_icgray_p = [.1]*300+[0]*1000\n", + "majesty.cutn_batches = 1\n", + "majesty.cut_blur_n = [0]*400 + [0]*600\n", + "majesty.cut_blur_kernel = 3\n", + "majesty.range_index = [0]*1000\n", + "majesty.active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", + "majesty.ths_method = \"softsign\"\n", + "majesty.tv_scales = [600]*1+[50]*1 +[0]*2\n", + "majesty.latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", "\n", "#If you uncomment next line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale basic setting will be used\n", "#clip_guidance_schedule = [10000]*300 + [500]*700\n", "\n", - "symmetric_loss_scale = 0 #Apply symmetric loss\n", + "majesty.symmetric_loss_scale = 0 #Apply symmetric loss\n", "\n", "#Latent Diffusion Advanced Settings\n", - "scale_div = 1 # Use when latent upscale to correct satuation problem\n", - "opt_mag_mul = 15 #Magnify grad before clamping\n", + "majesty.scale_div = 1 # Use when latent upscale to correct satuation problem\n", + "majesty.opt_mag_mul = 15 #Magnify grad before clamping\n", "#PLMS Currently not working, working on a fix\n", "#opt.plms = False #Won;=t work with clip guidance\n", - "opt_ddim_eta, opt_eta_end = [1.5,1.2] # linear variation of eta\n", - "opt_temperature = .95\n", + "majesty.opt_ddim_eta, majesty.opt_eta_end = [1.5,1.2] # linear variation of eta\n", + "majesty.opt_temperature = .95\n", "\n", "#Grad advanced settings\n", - "grad_center = False\n", - "grad_scale= 0.75 #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", + "majesty.grad_center = False\n", + "majesty.grad_scale= 0.75 #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", "\n", "#Restraints the model from explodign despite larger clamp\n", - "score_modifier = True\n", - "threshold_percentile = .9\n", - "threshold = 1.2\n", - "var_index = [0]*1000\n", + "majesty.score_modifier = True\n", + "majesty.threshold_percentile = .9\n", + "majesty.threshold = 1.2\n", + "majesty.var_index = [0]*1000\n", "\n", "\n", "#Init image advanced settings\n", - "init_rotate, mask_rotate=[False, False]\n", - "init_magnitude = 0.15\n", + "majesty.init_rotate, majesty.mask_rotate=[False, False]\n", + "majesty.init_magnitude = 0.15\n", "\n", "#More settings\n", - "RGB_min, RGB_max = [-1,1]\n", - "padargs = {\"mode\":\"constant\", \"value\": -1} #How to pad the image with cut_overview\n", - "flip_aug=False\n", - 
"cutout_debug = False\n", - "opt.outdir = outputs_path\n", + "majesty.RGB_min, majesty.RGB_max = [-1,1]\n", + "majesty.padargs = {\"mode\":\"constant\", \"value\": -1} #How to pad the image with cut_overview\n", + "majesty.flip_aug=False\n", + "majesty.cutout_debug = False\n", + "majesty.opt.outdir = outputs_path\n", "\n", "#Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", - "experimental_aesthetic_embeddings = True\n", + "majesty.experimental_aesthetic_embeddings = True\n", "#How much you want this to influence your result\n", - "experimental_aesthetic_embeddings_weight = 0.3\n", + "majesty.experimental_aesthetic_embeddings_weight = 0.3\n", "#9 are good aesthetic embeddings, 0 are bad ones\n", - "experimental_aesthetic_embeddings_score = 8\n", + "majesty.experimental_aesthetic_embeddings_score = 8\n", "\n", "# For fun dont change except if you really know what your are doing\n", - "grad_blur = False\n", - "compress_steps = 0\n", - "compress_factor = 0.1\n", - "punish_steps = 0\n", - "punish_factor = 0.8" + "majesty.grad_blur = False\n", + "majesty.compress_steps = 0\n", + "majesty.compress_factor = 0.1\n", + "majesty.punish_steps = 0\n", + "majesty.punish_factor = 0.8" ] }, { @@ -1342,7 +532,7 @@ "#Negative prompts for Latent Diffusion\n", "latent_negatives = [\"\"]\n", "\n", - "image_prompts = []" + "majesty.image_prompts = []" ] }, { @@ -1367,143 +557,58 @@ "warnings.filterwarnings('ignore')\n", "#@markdown ### Basic settings \n", "#@markdown We're still figuring out default settings. 
Experiment and share your settings with us\n", - "width = 256#@param{type: 'integer'}\n", - "height = 256#@param{type: 'integer'}\n", - "latent_diffusion_guidance_scale = 15 #@param {type:\"number\"}\n", - "clip_guidance_scale = 5000#@param{type: 'integer'}\n", - "how_many_batches = 1 #@param{type: 'integer'}\n", - "aesthetic_loss_scale = 400 #@param{type: 'integer'}\n", - "augment_cuts=True #@param{type:'boolean'}\n", + "majesty.width = 256#@param{type: 'integer'}\n", + "majesty.height = 256#@param{type: 'integer'}\n", + "majesty.latent_diffusion_guidance_scale = 15 #@param {type:\"number\"}\n", + "majesty.clip_guidance_scale = 5000#@param{type: 'integer'}\n", + "majesty.how_many_batches = 1 #@param{type: 'integer'}\n", + "majesty.aesthetic_loss_scale = 400 #@param{type: 'integer'}\n", + "majesty.augment_cuts=True #@param{type:'boolean'}\n", "\n", "#@markdown\n", "\n", "#@markdown ### Init image settings\n", "#@markdown `init_image` requires the path of an image to use as init to the model\n", - "init_image = None #@param{type: 'string'}\n", - "if(init_image == '' or init_image == 'None'):\n", - " init_image = None\n", + "majesty.init_image = None #@param{type: 'string'}\n", + "if(majesty.init_image == '' or majesty.init_image == 'None'):\n", + " majesty.init_image = None\n", "#@markdown `starting_timestep`: How much noise do you want to add to your init image for it to then be difused by the model\n", - "starting_timestep = 0.9 #@param{type: 'number'}\n", + "majesty.starting_timestep = 0.9 #@param{type: 'number'}\n", "#@markdown `init_mask` is a mask same width and height as the original image with the color black indicating where to inpaint\n", - "init_mask = None #@param{type: 'string'}\n", + "majesty.init_mask = None #@param{type: 'string'}\n", "#@markdown `init_scale` controls how much the init image should influence the final result. 
Experiment with values around `1000`\n", - "init_scale = 1000 #@param{type: 'integer'}\n", - "init_brightness = 0.0 #@param{type: 'number'}\n", + "majesty.init_scale = 1000 #@param{type: 'integer'}\n", + "majesty.init_brightness = 0.0 #@param{type: 'number'}\n", "#@markdown How much extra noise to add to the init image, independently from skipping timesteps (use it also if you are upscaling)\n", - "init_noise = 0.57 #@param{type: 'number'}\n", + "majesty.init_noise = 0.57 #@param{type: 'number'}\n", "\n", "#@markdown\n", "\n", "#@markdown ### Custom saved settings\n", "#@markdown If you choose custom saved settings, the settings set by the preset overrule some of your choices. You can still modify the settings not in the preset. Check what each preset modifies here\n", - "custom_settings = 'path/to/settings.cfg' #@param{type:'string'}\n", + "majesty.custom_settings = 'path/to/settings.cfg' #@param{type:'string'}\n", "settings_library = 'None (use settings defined above)' #@param [\"None (use settings defined above)\", \"default (optimized for colab free)\", \"dango233_princesses\", \"the_other_zippy_defaults\", \"makeitrad_defaults\"]\n", "if(settings_library != 'None (use settings defined above)'):\n", " if(settings_library == 'default (optimized for colab free)'):\n", " custom_settings = f'majesty-diffusion/latent_settings_library/default.cfg'\n", " else:\n", " custom_settings = f'majesty-diffusion/latent_settings_library/{settings_library}.cfg'\n", - "\n", - "global_var_scope = globals()\n", - "if(custom_settings is not None and custom_settings != '' and custom_settings != 'path/to/settings.cfg'):\n", - " print('Loaded ', custom_settings)\n", - " try:\n", - " from configparser import ConfigParser\n", - " except ImportError:\n", - " from ConfigParser import ConfigParser\n", - " import configparser\n", - " \n", - " config = ConfigParser()\n", - " config.read(custom_settings)\n", - " #custom_settings_stream = fetch(custom_settings)\n", - " #Load CLIP models from 
config\n", - " if(config.has_section('clip_list')):\n", - " clip_incoming_list = config.items('clip_list')\n", - " clip_incoming_models = clip_incoming_list[0]\n", - " incoming_perceptors = eval(clip_incoming_models[1])\n", - " if((len(incoming_perceptors) != len(clip_load_list)) or not all(elem in incoming_perceptors for elem in clip_load_list)):\n", - " clip_load_list = incoming_perceptors\n", - " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", - "\n", - " #Load settings from config and replace variables\n", - " if(config.has_section('basic_settings')):\n", - " basic_settings = config.items('basic_settings')\n", - " for basic_setting in basic_settings:\n", - " global_var_scope[basic_setting[0]] = eval(basic_setting[1])\n", - " \n", - " if(config.has_section('advanced_settings')):\n", - " advanced_settings = config.items('advanced_settings')\n", - " for advanced_setting in advanced_settings:\n", - " global_var_scope[advanced_setting[0]] = eval(advanced_setting[1])\n", - "\n", - "if(((init_image is not None) and (init_image != 'None') and (init_image != '')) and starting_timestep != 1 and custom_schedule_setting[0][1] == 1000):\n", - " custom_schedule_setting[0] = [custom_schedule_setting[0][0], int(custom_schedule_setting[0][1]*starting_timestep), custom_schedule_setting[0][2]]\n", - "\n", - "prompts = clip_prompts\n", - "opt.prompt = latent_prompts\n", - "opt.uc = latent_negatives\n", - "custom_schedules = set_custom_schedules(custom_schedule_setting)\n", - "aes_scale = aesthetic_loss_scale\n", - "try: \n", - " clip_guidance_schedule\n", - " clip_guidance_index = clip_guidance_schedule\n", - "except:\n", - " clip_guidance_index = [clip_guidance_scale]*1000\n", - "\n", - "opt.W = (width//64)*64;\n", - "opt.H = (height//64)*64;\n", - "if opt.W != width or opt.H != height:\n", - " print(f'Changing output size to {opt.W}x{opt.H}. 
Dimensions must by multiples of 64.')\n", - "\n", - "opt.mag_mul = opt_mag_mul \n", - "opt.ddim_eta = opt_ddim_eta\n", - "opt.eta_end = opt_eta_end\n", - "opt.temperature = opt_temperature\n", - "opt.n_iter = how_many_batches\n", - "opt.n_samples = 1\n", - "#opt.W, opt.H = [width,height]\n", - "opt.scale = latent_diffusion_guidance_scale\n", - "aug = augment_cuts\n", - "#Checks if it's not a normal schedule (legacy purposes to keep old configs compatible)\n", - "if(len(clamp_index) == 2): \n", - " clamp_index_variation = np.linspace(clamp_index[0],clamp_index[1],1000) \n", - "\n", - "else:\n", - " clamp_index_variation = clamp_index\n", - "score_corrector = DotMap()\n", - "\n", - "\n", - "def modify_score(e_t, e_t_uncond):\n", - " if(score_modifier is False):\n", - " return e_t\n", - " else:\n", - " e_t_d = (e_t - e_t_uncond)\n", - " s = torch.quantile(\n", - " rearrange(e_t_d, 'b ... -> b (...)').abs().float(),\n", - " threshold_percentile,\n", - " dim = -1\n", - " )\n", - "\n", - " s.clamp_(min = 1.)\n", - " s = s.view(-1, *((1,) * (e_t_d.ndim - 1)))\n", - " if ths_method == \"softsign\":\n", - " e_t_d = F.softsign(e_t_d*3) / s / 3\n", - " elif ths_method == \"clamp\":\n", - " e_t_d = e_t_d.clamp(-s,s) / s\n", - " e_t = e_t_uncond + e_t_d\n", - " return(e_t)\n", - " \n", - "score_corrector.modify_score = modify_score\n", + "majesty.load_custom_settings()\n", + "majesty.full_clip_load()\n", + "majesty.config_init_image()\n", + "\n", + "majesty.prompts = clip_prompts\n", + "majesty.opt.prompt = latent_prompts\n", + "majesty.opt.uc = latent_negatives\n", + "majesty.set_custom_schedules()\n", + "majesty.config_clip_guidance()\n", + "majesty.config_output_size()\n", + "majesty.config_options()\n", "\n", "torch.cuda.empty_cache()\n", "gc.collect()\n", - "generate_video = False\n", - "if generate_video: \n", - " fps = 24\n", - " p = Popen(['ffmpeg', '-y', '-f', 'image2pipe', '-vcodec', 'png', '-r', str(fps), '-i', '-', '-vcodec', 'libx264', '-r', str(fps), '-pix_fmt', 
'yuv420p', '-crf', '17', '-preset', 'veryslow', 'video.mp4'], stdin=PIPE)\n", - "do_run()\n", - "if generate_video: \n", - " p.stdin.close()" + "majesty.generate_video = False\n", + "majesty.do_run()\n" ] }, { @@ -1529,7 +634,7 @@ "#@markdown If you would like to save your current settings, uncheck `skip_saving` and run this cell. You will get a `custom_settings.cfg` file you can reuse and share. If you like your results, send us a pull request to add your settings to the selectable library\n", "skip_saving = True #@param{type:'boolean'}\n", "if(not skip_saving):\n", - " data = generate_settings_file(add_prompts=False, add_dimensions=True)\n", + " data = majesty.generate_settings_file(add_prompts=False, add_dimensions=True)\n", " text_file = open(\"custom_settings.cfg\", \"w\")\n", " text_file.write(data)\n", " text_file.close()\n", diff --git a/latent.py b/latent.py new file mode 100644 index 0000000..e053fd2 --- /dev/null +++ b/latent.py @@ -0,0 +1,318 @@ +import argparse, sys +import torch +from omegaconf import OmegaConf +from subprocess import Popen, PIPE +import gc + +import torch +import json +import majesty as majesty + + +def main(argv): + + custom_settings = None + + parser = argparse.ArgumentParser( + description="Generate images from text with majesty" + ) + parser.add_argument( + "-p", + "--clip_prompts", + type=str, + help="CLIP prompts", + default=[ + "portrait of a princess in sanctuary, hyperrealistic painting trending on artstation" + ], + dest="clip_prompts", + ) + parser.add_argument( + "--latent_prompts", + type=str, + help="Latent prompts", + default=None, + dest="latent_prompts", + ) + parser.add_argument( + "--latent_negatives", + type=str, + help="Negative prompts", + default=["low quality image"], + dest="latent_negatives", + ) + parser.add_argument( + "--image_prompts", + type=str, + help="Image prompts", + default=[], + dest="image_prompts", + ) + parser.add_argument( + "-m", + "--model_path", + type=str, + help="Model path", + 
default="models", + dest="model_path", + ) + parser.add_argument( + "--model_source", + type=str, + help="Source URL prefix for a local HTTP server with model downloads to use instead of authoritative URLs (useful in ephemeral stups)", + default=None, + dest="model_source", + ) + parser.add_argument( + "-o", + "--outputs_path", + type=str, + help="Outputs path", + default="outputs", + dest="outputs_path", + ) + parser.add_argument( + "-c", + "--custom_settings", + type=str, + help="Custom settings file", + default=None, + dest="custom_settings", + ) + parser.add_argument( + "-W", "--width", type=int, help="Output width", default=256, dest="width" + ) + parser.add_argument( + "-H", "--height", type=int, help="Output height", default=256, dest="height" + ) + parser.add_argument( + "-ls", + "--latent_scale", + type=float, + help="Latent diffusion guidance scale", + default=2, + dest="latent_diffusion_guidance_scale", + ) + parser.add_argument( + "-cs", + "--clip_scale", + type=int, + help="CLIP guidance scale", + default=5000, + dest="clip_guidance_scale", + ) + parser.add_argument( + "-b", + "--batches", + type=int, + help="Number of batches", + default=1, + dest="how_many_batches", + ) + parser.add_argument( + "--aesthetic_loss_scale", + type=int, + help="Aesthetic loss scale", + default=200, + dest="aesthetic_loss_scale", + ) + parser.add_argument( + "--disable_augment_cuts", + help="Disable Augment cuts", + dest="augment_cuts", + action="store_false", + ) + parser.add_argument( + "-ns", + "--n_samples", + type=int, + help="Number of samples", + default=1, + dest="n_samples", + ) + parser.add_argument( + "--init_image", + type=str, + help="Initial image", + default=None, + dest="init_image", + ) + parser.add_argument( + "--starting_timestep", + type=float, + help="Starting timestep", + default=0.9, + dest="starting_timestep", + ) + parser.add_argument( + "--init_mask", + type=str, + help="A mask same width and height as the original image with the color black 
indicating where to inpaint", + default=None, + dest="init_mask", + ) + parser.add_argument( + "--init_scale", + type=int, + help="Controls how much the init image should influence the final result. Experiment with values around 1000", + default=1000, + dest="init_scale", + ) + parser.add_argument( + "--init_brightness", + type=float, + help="Init image brightness", + default=0.0, + dest="init_brightness", + ) + parser.add_argument( + "--init_noise", + type=float, + help="How much extra noise to add to the init image, independently from skipping timesteps (use it also if you are upscaling)", + default=0.6, + dest="init_noise", + ) + parser.add_argument( + "--enable_aesthetic_embeddings", + help="Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14", + dest="experimental_aesthetic_embeddings", + action="store_true", + ) + parser.add_argument( + "--aesthetic_embeddings_weight", + help="How much you want experimental aesthetic embeddings to influence your result", + type=float, + default=0.5, + dest="experimental_aesthetic_embeddings_weight", + ) + parser.add_argument( + "--aesthetic_embeddings_score", + help="9 are good aesthetic embeddings, 0 are bad ones", + type=int, + default=9, + dest="experimental_aesthetic_embeddings_score", + ) + + args = parser.parse_args() + majesty.use_args(args) + + majesty.download_models() + + torch.backends.cudnn.benchmark = True + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + majesty.device = device + + latent_diffusion_model = "finetuned" + config = OmegaConf.load( + "./latent-diffusion/configs/latent-diffusion/txt2img-1p4B-eval.yaml" + ) # TODO: Optionally download from same location as ckpt and chnage this logic + model = majesty.load_model_from_config( + config, + f"{majesty.model_path}/latent_diffusion_txt2img_f8_large.ckpt", + False, + latent_diffusion_model, + ) # TODO: check path + majesty.model = model.half().eval().to(device) + # if(latent_diffusion_model == 
"finetuned"): + # model.model = model.model.half().eval().to(device) + + majesty.load_lpips_model() + # Alstro's aesthetic model + majesty.load_aesthetic_model() + + clip_load_list = [] + # @markdown #### Open AI CLIP models + ViT_B32 = False # @param {type:"boolean"} + ViT_B16 = True # @param {type:"boolean"} + ViT_L14 = False # @param {type:"boolean"} + ViT_L14_336px = False # @param {type:"boolean"} + # RN101 = False #@param {type:"boolean"} + # RN50 = False #@param {type:"boolean"} + RN50x4 = False # @param {type:"boolean"} + RN50x16 = False # @param {type:"boolean"} + RN50x64 = False # @param {type:"boolean"} + + # @markdown #### OpenCLIP models + ViT_B16_plus = False # @param {type: "boolean"} + ViT_B32_laion2b = True # @param {type: "boolean"} + + # @markdown #### Multilangual CLIP models + clip_farsi = False # @param {type: "boolean"} + clip_korean = False # @param {type: "boolean"} + + # @markdown #### CLOOB models + cloob_ViT_B16 = False # @param {type: "boolean"} + + # @markdown Load even more CLIP and CLIP-like models (from [Multi-Modal-Comparators](https://github.com/dmarx/Multi-Modal-Comparators)) + model1 = "" # @param ["[clip - openai - RN50]","[clip - openai - RN101]","[clip - mlfoundations - RN50--yfcc15m]","[clip - mlfoundations - RN50--cc12m]","[clip - mlfoundations - RN50-quickgelu--yfcc15m]","[clip - mlfoundations - RN50-quickgelu--cc12m]","[clip - mlfoundations - RN101--yfcc15m]","[clip - mlfoundations - RN101-quickgelu--yfcc15m]","[clip - mlfoundations - ViT-B-32--laion400m_e31]","[clip - mlfoundations - ViT-B-32--laion400m_e32]","[clip - mlfoundations - ViT-B-32--laion400m_avg]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]","[clip - mlfoundations - ViT-B-16--laion400m_e31]","[clip - mlfoundations - ViT-B-16--laion400m_e32]","[clip - sbert - ViT-B-32-multilingual-v1]","[clip - facebookresearch - 
clip_small_25ep]","[simclr - facebookresearch - simclr_small_25ep]","[slip - facebookresearch - slip_small_25ep]","[slip - facebookresearch - slip_small_50ep]","[slip - facebookresearch - slip_small_100ep]","[clip - facebookresearch - clip_base_25ep]","[simclr - facebookresearch - simclr_base_25ep]","[slip - facebookresearch - slip_base_25ep]","[slip - facebookresearch - slip_base_50ep]","[slip - facebookresearch - slip_base_100ep]","[clip - facebookresearch - clip_large_25ep]","[simclr - facebookresearch - simclr_large_25ep]","[slip - facebookresearch - slip_large_25ep]","[slip - facebookresearch - slip_large_50ep]","[slip - facebookresearch - slip_large_100ep]","[clip - facebookresearch - clip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc12m_35ep]","[clip - facebookresearch - clip_base_cc12m_35ep]"] {allow-input: true} + model2 = "" # @param ["[clip - openai - RN50]","[clip - openai - RN101]","[clip - mlfoundations - RN50--yfcc15m]","[clip - mlfoundations - RN50--cc12m]","[clip - mlfoundations - RN50-quickgelu--yfcc15m]","[clip - mlfoundations - RN50-quickgelu--cc12m]","[clip - mlfoundations - RN101--yfcc15m]","[clip - mlfoundations - RN101-quickgelu--yfcc15m]","[clip - mlfoundations - ViT-B-32--laion400m_e31]","[clip - mlfoundations - ViT-B-32--laion400m_e32]","[clip - mlfoundations - ViT-B-32--laion400m_avg]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]","[clip - mlfoundations - ViT-B-16--laion400m_e31]","[clip - mlfoundations - ViT-B-16--laion400m_e32]","[clip - sbert - ViT-B-32-multilingual-v1]","[clip - facebookresearch - clip_small_25ep]","[simclr - facebookresearch - simclr_small_25ep]","[slip - facebookresearch - slip_small_25ep]","[slip - facebookresearch - slip_small_50ep]","[slip - facebookresearch - slip_small_100ep]","[clip - facebookresearch - 
clip_base_25ep]","[simclr - facebookresearch - simclr_base_25ep]","[slip - facebookresearch - slip_base_25ep]","[slip - facebookresearch - slip_base_50ep]","[slip - facebookresearch - slip_base_100ep]","[clip - facebookresearch - clip_large_25ep]","[simclr - facebookresearch - simclr_large_25ep]","[slip - facebookresearch - slip_large_25ep]","[slip - facebookresearch - slip_large_50ep]","[slip - facebookresearch - slip_large_100ep]","[clip - facebookresearch - clip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc12m_35ep]","[clip - facebookresearch - clip_base_cc12m_35ep]"] {allow-input: true} + model3 = "" # @param ["[clip - openai - RN50]","[clip - openai - RN101]","[clip - mlfoundations - RN50--yfcc15m]","[clip - mlfoundations - RN50--cc12m]","[clip - mlfoundations - RN50-quickgelu--yfcc15m]","[clip - mlfoundations - RN50-quickgelu--cc12m]","[clip - mlfoundations - RN101--yfcc15m]","[clip - mlfoundations - RN101-quickgelu--yfcc15m]","[clip - mlfoundations - ViT-B-32--laion400m_e31]","[clip - mlfoundations - ViT-B-32--laion400m_e32]","[clip - mlfoundations - ViT-B-32--laion400m_avg]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]","[clip - mlfoundations - ViT-B-16--laion400m_e31]","[clip - mlfoundations - ViT-B-16--laion400m_e32]","[clip - sbert - ViT-B-32-multilingual-v1]","[clip - facebookresearch - clip_small_25ep]","[simclr - facebookresearch - simclr_small_25ep]","[slip - facebookresearch - slip_small_25ep]","[slip - facebookresearch - slip_small_50ep]","[slip - facebookresearch - slip_small_100ep]","[clip - facebookresearch - clip_base_25ep]","[simclr - facebookresearch - simclr_base_25ep]","[slip - facebookresearch - slip_base_25ep]","[slip - facebookresearch - slip_base_50ep]","[slip - facebookresearch - slip_base_100ep]","[clip - facebookresearch - 
clip_large_25ep]","[simclr - facebookresearch - simclr_large_25ep]","[slip - facebookresearch - slip_large_25ep]","[slip - facebookresearch - slip_large_50ep]","[slip - facebookresearch - slip_large_100ep]","[clip - facebookresearch - clip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc12m_35ep]","[clip - facebookresearch - clip_base_cc12m_35ep]"] {allow-input: true} + + if ViT_B32: + clip_load_list.append("[clip - mlfoundations - ViT-B-32--openai]") + if ViT_B16: + clip_load_list.append("[clip - mlfoundations - ViT-B-16--openai]") + if ViT_L14: + clip_load_list.append("[clip - mlfoundations - ViT-L-14--openai]") + if RN50x4: + clip_load_list.append("[clip - mlfoundations - RN50x4--openai]") + if RN50x64: + clip_load_list.append("[clip - mlfoundations - RN50x64--openai]") + if RN50x16: + clip_load_list.append("[clip - mlfoundations - RN50x16--openai]") + if ViT_L14_336px: + clip_load_list.append("[clip - mlfoundations - ViT-L-14-336--openai]") + if ViT_B16_plus: + clip_load_list.append( + "[clip - mlfoundations - ViT-B-16-plus-240--laion400m_e32]" + ) + if ViT_B32_laion2b: + clip_load_list.append("[clip - mlfoundations - ViT-B-32--laion2b_e16]") + if clip_farsi: + clip_load_list.append("[clip - sajjjadayobi - clipfa]") + if clip_korean: + clip_load_list.append("[clip - navervision - kelip_ViT-B/32]") + if cloob_ViT_B16: + clip_load_list.append( + "[cloob - crowsonkb - cloob_laion_400m_vit_b_16_32_epochs]" + ) + + if model1: + clip_load_list.append(model1) + if model2: + clip_load_list.append(model2) + if model3: + clip_load_list.append(model3) + + torch.cuda.empty_cache() + gc.collect() + + majesty.opt.outdir = majesty.outputs_path + + majesty.clip_load_list = clip_load_list + + majesty.load_custom_settings() + + majesty.full_clip_load() + + majesty.config_init_image() + + majesty.prompts = majesty.clip_prompts + if majesty.latent_prompts == [] or majesty.latent_prompts == None: + majesty.opt.prompt = 
majesty.prompts + else: + majesty.opt.prompt = majesty.latent_prompts + majesty.opt.uc = majesty.latent_negatives + majesty.set_custom_schedules() + + majesty.config_clip_guidance() + majesty.config_output_size() + majesty.config_options() + + torch.cuda.empty_cache() + gc.collect() + + majesty.do_run() + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/majesty.py b/majesty.py new file mode 100644 index 0000000..6514179 --- /dev/null +++ b/majesty.py @@ -0,0 +1,1374 @@ +import argparse, os, sys, glob +import shutil +import torch +import numpy as np +from omegaconf import OmegaConf +from PIL import Image +from tqdm.auto import tqdm, trange + +tqdm_auto_model = __import__("tqdm.auto", fromlist="") +sys.modules["tqdm"] = tqdm_auto_model +from einops import rearrange +from torchvision.utils import make_grid +import transformers +import gc + +sys.path.append("./latent-diffusion") +from ldm.util import instantiate_from_config +from ldm.models.diffusion.ddim import DDIMSampler +from ldm.models.diffusion.plms import PLMSSampler +from ldm.modules.diffusionmodules.util import noise_like +import tensorflow as tf +from dotmap import DotMap +import ipywidgets as widgets +from math import pi + +from resize_right import resize + +import subprocess +from subprocess import Popen, PIPE + +from dataclasses import dataclass +from functools import partial +import gc +import io +import math +import sys +import random +from piq import brisque +from itertools import product +from IPython import display +import lpips +from PIL import Image, ImageOps +import requests +import torch +from torch import nn +from torch.nn import functional as F +from torchvision import models +from torchvision import transforms +from torchvision import transforms as T +from torchvision.transforms import functional as TF +from numpy import nan +from threading import Thread +import time +import json +import warnings + +import mmc +from mmc.registry import REGISTRY +import mmc.loaders # force trigger 
model registrations +from mmc.mock.openai import MockOpenaiClip + +model_path = "models" +outputs_path = "results" +device = None +opt = DotMap() + +# Change it to false to not use CLIP Guidance at all +use_cond_fn = True + +# Custom cut schedules and super-resolution. Check out the guide on how to use it a https://multimodal.art/majestydiffusion +custom_schedule_setting = [ + [50, 1000, 8], + "gfpgan:1.5", + [5, 200, 5], + # "gfpgan:1.5", + # [50,200,5], +] + +# Cut settings +clamp_index = [2, 1.4] # linear variation of the index for clamping the gradient +cut_overview = [8] * 500 + [4] * 500 +cut_innercut = [0] * 500 + [4] * 500 +cut_ic_pow = 0.2 +cut_icgray_p = [0.1] * 300 + [0] * 1000 +cutn_batches = 1 +cut_blur_n = [0] * 400 + [0] * 600 +cut_blur_kernel = 3 +range_index = [0] * 1000 +active_function = ( + "softsign" # function to manipulate the gradient - help things to stablize +) +ths_method = "softsign" +tv_scales = [600] * 1 + [50] * 1 + [0] * 2 +latent_tv_loss = True # Applies the TV Loss in the Latent space instead of pixel, improves generation quality + +# If you uncomment next line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale basic setting will be used +# clip_guidance_schedule = [10000]*300 + [500]*700 + +symmetric_loss_scale = 0 # Apply symmetric loss + +# Latent Diffusion Advanced Settings +scale_div = 1 # Use when latent upscale to correct satuation problem +opt_mag_mul = 15 # Magnify grad before clamping +# PLMS Currently not working, working on a fix +# opt.plms = False #Won;=t work with clip guidance +opt_ddim_eta, opt_eta_end = [1.5, 1.2] # linear variation of eta +opt_temperature = 0.95 + +# Grad advanced settings +grad_center = False +grad_scale = 0.75 # Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept + +# Restraints the model from exploding despite larger clamp +score_modifier = True +threshold_percentile = 0.9 +threshold = 1.2 +var_index = [0] * 1000 + +# Init image advanced settings +init_rotate, mask_rotate = [False, False] +init_magnitude = 0.15 + +# More settings +RGB_min, RGB_max = [-1, 1] +padargs = {"mode": "constant", "value": -1} # How to pad the image with cut_overview +flip_aug = False +cutout_debug = False + +# Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14 +experimental_aesthetic_embeddings = True +# How much you want this to influence your result +experimental_aesthetic_embeddings_weight = 0.3 +# 9 are good aesthetic embeddings, 0 are bad ones +experimental_aesthetic_embeddings_score = 8 + +# For fun dont change except if you really know what your are doing +grad_blur = False +compress_steps = 0 +compress_factor = 0.1 +punish_steps = 0 +punish_factor = 0.8 + +# Amp up your prompt game with prompt engineering, check out this guide: https://matthewmcateer.me/blog/clip-prompt-engineering/ +# Prompt for CLIP Guidance +clip_prompts = ["portrait of a Majestic Princess, trending on artstation"] + +# Prompt for Latent Diffusion +latent_prompts = ["portrait of a Majestic Princess, trending on artstation"] + +# Negative prompts for Latent 
Diffusion +latent_negatives = [""] + +image_prompts = [] + +width = 256 +height = 256 +latent_diffusion_guidance_scale = 15 +clip_guidance_scale = 5000 +how_many_batches = 1 +aesthetic_loss_scale = 400 +augment_cuts = True +n_samples = 1 + +init_image = None +starting_timestep = 0.9 +init_mask = None +init_scale = 1000 +init_brightness = 0.0 +init_noise = 0.57 + +normalize = transforms.Normalize( + mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711] +) + +# Globals +custom_settings = None +generate_video = False +model = {} +aes_scale = None +aug = None + +clip_model, clip_size, clip_tokenize, clip_normalize = {}, {}, {}, {} +clip_list, clip_load_list, clip_guidance_index = [], [], [] + + +aesthetic_model_336, aesthetic_model_224, aesthetic_model_16, aesthetic_model_32 = ( + {}, + {}, + {}, + {}, +) +custom_schedules = [] + +progress = None +image_grid, writer, img_tensor, im = {}, {}, {}, {} +target_embeds, weights, zero_embed, init = {}, {}, {}, {} +make_cutouts = {} +scale_factor = 1 +clamp_start_, clamp_max = None, None +clip_guidance_schedule = None +prompts = [] +mmc_models = [] +last_step_uspcale_list = [] + +has_purged = False + +# Used to override download locations, allows rehosting models in a bucket for ephemeral servers to download +model_source = None + + +def download_models(): + # download models as needed + models = [ + [ + "latent_diffusion_txt2img_f8_large.ckpt", + "https://ommer-lab.com/files/latent-diffusion/nitro/txt2img-f8-large/model.ckpt", + ], + [ + "finetuned_state_dict.pt", + "https://huggingface.co/multimodalart/compvis-latent-diffusion-text2img-large/resolve/main/finetuned_state_dict.pt", + ], + [ + "ava_vit_l_14_336_linear.pth", + "https://multimodal.art/models/ava_vit_l_14_336_linear.pth", + ], + [ + "sa_0_4_vit_l_14_linear.pth", + "https://multimodal.art/models/sa_0_4_vit_l_14_linear.pth", + ], + [ + "ava_vit_l_14_linear.pth", + "https://multimodal.art/models/ava_vit_l_14_linear.pth", + ], + [ + 
"ava_vit_b_16_linear.pth", + "http://batbot.tv/ai/models/v-diffusion/ava_vit_b_16_linear.pth", + ], + [ + "sa_0_4_vit_b_16_linear.pth", + "https://multimodal.art/models/sa_0_4_vit_b_16_linear.pth", + ], + [ + "sa_0_4_vit_b_32_linear.pth", + "https://multimodal.art/models/sa_0_4_vit_b_32_linear.pth", + ], + [ + "openimages_512x_png_embed224.npz", + "https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/openimages_512x_png_embed224.npz", + ], + [ + "imagenet_512x_jpg_embed224.npz", + "https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/imagenet_512x_jpg_embed224.npz", + ], + [ + "GFPGANv1.3.pth", + "https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth", + ], + ] + + if not os.path.exists(model_path): + os.makedirs(model_path) + + for item in models: + model_file = f"{model_path}/{item[0]}" + if not os.path.exists(model_file): + if model_source: + url = f"{model_source}/{item[0]}" + else: + url = item[1] + print(f"Downloading {url}") + subprocess.call( + ["wget", "-nv", "-O", model_file, url, "--no-check-certificate"], + shell=False, + ) + if not os.path.exists("GFPGAN/experiments/pretrained_models/GFPGANv1.3.pth"): + shutil.copyfile( + f"{model_path}/GFPGANv1.3.pth", + "GFPGAN/experiments/pretrained_models/GFPGANv1.3.pth", + ) + + +def load_model_from_config( + config, ckpt, verbose=False, latent_diffusion_model="original" +): + print(f"Loading model from {ckpt}") + print(latent_diffusion_model) + model = instantiate_from_config(config.model) + sd = torch.load(ckpt, map_location="cuda")["state_dict"] + m, u = model.load_state_dict(sd, strict=False) + if latent_diffusion_model == "finetuned": + del sd + sd_finetune = torch.load( + f"{model_path}/finetuned_state_dict.pt", map_location="cuda" + ) + m, u = model.model.load_state_dict(sd_finetune, strict=False) + model.model = model.model.half().eval().to(device) + del sd_finetune + # sd = 
pl_sd["state_dict"] + + if len(m) > 0 and verbose: + print("missing keys:") + print(m) + if len(u) > 0 and verbose: + print("unexpected keys:") + print(u) + + model.requires_grad_(False).half().eval().to("cuda") + return model + + +def get_mmc_models(): + global mmc_models + mmc_models = [] + for model_key in clip_load_list: + if not model_key: + continue + arch, pub, m_id = model_key[1:-1].split(" - ") + mmc_models.append( + { + "architecture": arch, + "publisher": pub, + "id": m_id, + } + ) + + +def set_custom_schedules(): + global custom_schedules + custom_schedules = [] + for schedule_item in custom_schedule_setting: + if isinstance(schedule_item, list): + custom_schedules.append(np.arange(*schedule_item)) + else: + custom_schedules.append(schedule_item) + + +def parse_prompt(prompt): + if ( + prompt.startswith("http://") + or prompt.startswith("https://") + or prompt.startswith("E:") + or prompt.startswith("C:") + or prompt.startswith("D:") + ): + vals = prompt.rsplit(":", 2) + vals = [vals[0] + ":" + vals[1], *vals[2:]] + else: + vals = prompt.rsplit(":", 1) + vals = vals + ["", "1"][len(vals) :] + return vals[0], float(vals[1]) + + +class MakeCutouts(nn.Module): + def __init__( + self, + cut_size, + Overview=4, + WholeCrop=0, + WC_Allowance=10, + WC_Grey_P=0.2, + InnerCrop=0, + IC_Size_Pow=0.5, + IC_Grey_P=0.2, + cut_blur_n=0, + ): + super().__init__() + self.cut_size = cut_size + self.Overview = Overview + self.WholeCrop = WholeCrop + self.WC_Allowance = WC_Allowance + self.WC_Grey_P = WC_Grey_P + self.InnerCrop = InnerCrop + self.IC_Size_Pow = IC_Size_Pow + self.IC_Grey_P = IC_Grey_P + self.cut_blur_n = cut_blur_n + self.augs = T.Compose( + [ + # T.RandomHorizontalFlip(p=0.5), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomAffine( + degrees=0, + translate=(0.05, 0.05), + # scale=(0.9,0.95), + fill=-1, + interpolation=T.InterpolationMode.BILINEAR, + ), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + # T.RandomPerspective(p=1, 
interpolation = T.InterpolationMode.BILINEAR, fill=-1,distortion_scale=0.2), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomGrayscale(p=0.1), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.ColorJitter(brightness=0.05, contrast=0.05, saturation=0.05), + ] + ) + + def forward(self, input): + gray = transforms.Grayscale(3) + sideY, sideX = input.shape[2:4] + max_size = min(sideX, sideY) + min_size = min(sideX, sideY, self.cut_size) + l_size = max(sideX, sideY) + output_shape = [input.shape[0], 3, self.cut_size, self.cut_size] + output_shape_2 = [input.shape[0], 3, self.cut_size + 2, self.cut_size + 2] + pad_input = F.pad( + input, + ( + (sideY - max_size) // 2 + round(max_size * 0.055), + (sideY - max_size) // 2 + round(max_size * 0.055), + (sideX - max_size) // 2 + round(max_size * 0.055), + (sideX - max_size) // 2 + round(max_size * 0.055), + ), + **padargs, + ) + cutouts_list = [] + + if self.Overview > 0: + cutouts = [] + cutout = resize(pad_input, out_shape=output_shape, antialiasing=True) + output_shape_all = list(output_shape) + output_shape_all[0] = self.Overview * input.shape[0] + pad_input = pad_input.repeat(input.shape[0], 1, 1, 1) + cutout = resize(pad_input, out_shape=output_shape_all) + if aug: + cutout = self.augs(cutout) + if self.cut_blur_n > 0: + cutout[0 : self.cut_blur_n, :, :, :] = TF.gaussian_blur( + cutout[0 : self.cut_blur_n, :, :, :], cut_blur_kernel + ) + cutouts_list.append(cutout) + + if self.InnerCrop > 0: + cutouts = [] + for i in range(self.InnerCrop): + size = int( + torch.rand([]) ** self.IC_Size_Pow * (max_size - min_size) + + min_size + ) + offsetx = torch.randint(0, sideX - size + 1, ()) + offsety = torch.randint(0, sideY - size + 1, ()) + cutout = input[:, :, offsety : offsety + size, offsetx : offsetx + size] + if i <= int(self.IC_Grey_P * self.InnerCrop): + cutout = gray(cutout) + cutout = resize(cutout, out_shape=output_shape) + cutouts.append(cutout) + if cutout_debug: + 
TF.to_pil_image(cutouts[-1].add(1).div(2).clamp(0, 1).squeeze(0)).save( + "content/diff/cutouts/cutout_InnerCrop.jpg", quality=99 + ) + cutouts_tensor = torch.cat(cutouts) + cutouts = [] + cutouts_list.append(cutouts_tensor) + cutouts = torch.cat(cutouts_list) + return cutouts + + +def spherical_dist_loss(x, y): + x = F.normalize(x, dim=-1) + y = F.normalize(y, dim=-1) + return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2) + + +def tv_loss(input): + """L2 total variation loss, as in Mahendran et al.""" + input = F.pad(input, (0, 1, 0, 1), "replicate") + x_diff = input[..., :-1, 1:] - input[..., :-1, :-1] + y_diff = input[..., 1:, :-1] - input[..., :-1, :-1] + return (x_diff**2 + y_diff**2).mean([1, 2, 3]) + + +def range_loss(input, range_min, range_max): + return (input - input.clamp(range_min, range_max)).pow(2).mean([1, 2, 3]) + + +def symmetric_loss(x): + w = x.shape[3] + diff = (x - torch.flip(x, [3])).square().mean().sqrt() / ( + x.shape[2] * x.shape[3] / 1e4 + ) + return diff + + +def fetch(url_or_path): + """Fetches a file from an HTTP or HTTPS url, or opens the local file.""" + if str(url_or_path).startswith("http://") or str(url_or_path).startswith( + "https://" + ): + r = requests.get(url_or_path) + r.raise_for_status() + fd = io.BytesIO() + fd.write(r.content) + fd.seek(0) + return fd + return open(url_or_path, "rb") + + +def to_pil_image(x): + """Converts from a tensor to a PIL image.""" + if x.ndim == 4: + assert x.shape[0] == 1 + x = x[0] + if x.shape[0] == 1: + x = x[0] + return TF.to_pil_image((x.clamp(-1, 1) + 1) / 2) + + +def centralized_grad(x, use_gc=True, gc_conv_only=False): + if use_gc: + if gc_conv_only: + if len(list(x.size())) > 3: + x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True)) + else: + if len(list(x.size())) > 1: + x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True)) + return x + + +def cond_fn(x, t): + global cur_step + cur_step += 1 + t = 1000 - t + t = t[0] + with torch.enable_grad(): 
+ x = x.detach() + x = x.requires_grad_() + x_in = model.decode_first_stage(x) + display_handler(x_in, t, 1, False) + n = x_in.shape[0] + clip_guidance_scale = clip_guidance_index[t] + make_cutouts = {} + # rx_in_grad = torch.zeros_like(x_in) + for i in clip_list: + make_cutouts[i] = MakeCutouts( + clip_size[i], + Overview=cut_overview[t], + InnerCrop=cut_innercut[t], + IC_Size_Pow=cut_ic_pow, + IC_Grey_P=cut_icgray_p[t], + cut_blur_n=cut_blur_n[t], + ) + cutn = cut_overview[t] + cut_innercut[t] + for j in range(cutn_batches): + losses = 0 + for i in clip_list: + clip_in = clip_normalize[i]( + make_cutouts[i](x_in.add(1).div(2)).to("cuda") + ) + image_embeds = ( + clip_model[i] + .encode_image(clip_in) + .float() + .unsqueeze(0) + .expand([target_embeds[i].shape[0], -1, -1]) + ) + target_embeds_temp = target_embeds[i] + if i == "ViT-B-32--openai" and experimental_aesthetic_embeddings: + aesthetic_embedding = torch.from_numpy( + np.load( + f"aesthetic-predictor/vit_b_32_embeddings/rating{experimental_aesthetic_embeddings_score}.npy" + ) + ).to(device) + aesthetic_query = ( + target_embeds_temp + + aesthetic_embedding * experimental_aesthetic_embeddings_weight + ) + target_embeds_temp = (aesthetic_query) / torch.linalg.norm( + aesthetic_query + ) + if i == "ViT-L-14--openai" and experimental_aesthetic_embeddings: + aesthetic_embedding = torch.from_numpy( + np.load( + f"aesthetic-predictor/vit_l_14_embeddings/rating{experimental_aesthetic_embeddings_score}.npy" + ) + ).to(device) + aesthetic_query = ( + target_embeds_temp + + aesthetic_embedding * experimental_aesthetic_embeddings_weight + ) + target_embeds_temp = (aesthetic_query) / torch.linalg.norm( + aesthetic_query + ) + target_embeds_temp = target_embeds_temp.unsqueeze(1).expand( + [-1, cutn * n, -1] + ) + dists = spherical_dist_loss(image_embeds, target_embeds_temp) + dists = dists.mean(1).mul(weights[i].squeeze()).mean() + losses += ( + dists + * clip_guidance_scale + * ( + 2 + if i + in [ + 
"ViT-L-14-336--openai", + "RN50x64--openai", + "ViT-B-32--laion2b_e16", + ] + else (0.4 if "cloob" in i else 1) + ) + ) + if i == "ViT-L-14-336--openai" and aes_scale != 0: + aes_loss = ( + aesthetic_model_336(F.normalize(image_embeds, dim=-1)) + ).mean() + losses -= aes_loss * aes_scale + if i == "ViT-L-14--openai" and aes_scale != 0: + aes_loss = ( + aesthetic_model_224(F.normalize(image_embeds, dim=-1)) + ).mean() + losses -= aes_loss * aes_scale + if i == "ViT-B-16--openai" and aes_scale != 0: + aes_loss = ( + aesthetic_model_16(F.normalize(image_embeds, dim=-1)) + ).mean() + losses -= aes_loss * aes_scale + if i == "ViT-B-32--openai" and aes_scale != 0: + aes_loss = ( + aesthetic_model_32(F.normalize(image_embeds, dim=-1)) + ).mean() + losses -= aes_loss * aes_scale + # x_in_grad += torch.autograd.grad(losses, x_in)[0] / cutn_batches / len(clip_list) + # losses += dists + # losses = losses / len(clip_list) + # gc.collect() + + tv_losses = ( + tv_loss(x).sum() * tv_scales[0] + + tv_loss(F.interpolate(x, scale_factor=1 / 2)).sum() * tv_scales[1] + + tv_loss(F.interpolate(x, scale_factor=1 / 4)).sum() * tv_scales[2] + + tv_loss(F.interpolate(x, scale_factor=1 / 8)).sum() * tv_scales[3] + ) + range_scale = range_index[t] + range_losses = range_loss(x_in, RGB_min, RGB_max).sum() * range_scale + var_scale = var_index[t] + loss = tv_losses + range_losses + losses + # del losses + if symmetric_loss_scale != 0: + loss += symmetric_loss(x_in) * symmetric_loss_scale + if init_image is not None and init_scale: + lpips_loss = (lpips_model(x_in, init) * init_scale).squeeze().mean() + # print(lpips_loss) + loss += lpips_loss + # loss_grad = torch.autograd.grad(loss, x_in, )[0] + # x_in_grad += loss_grad + # grad = -torch.autograd.grad(x_in, x, x_in_grad)[0] + loss.backward() + grad = -x.grad + grad = torch.nan_to_num(grad, nan=0.0, posinf=0, neginf=0) + if grad_center: + grad = centralized_grad(grad, use_gc=True, gc_conv_only=False) + mag = grad.square().mean().sqrt() + if 
mag == 0 or torch.isnan(mag): + print("ERROR") + print(t) + return grad + if t >= 0: + if active_function == "softsign": + grad = F.softsign(grad * grad_scale / mag) + if active_function == "tanh": + grad = (grad / mag * grad_scale).tanh() + if active_function == "clamp": + grad = grad.clamp(-mag * grad_scale * 2, mag * grad_scale * 2) + if grad.abs().max() > 0: + grad = grad / grad.abs().max() * opt.mag_mul + magnitude = grad.square().mean().sqrt() + else: + return grad + clamp_max = clamp_index_variation[t] + # print(magnitude, end = "\r") + grad = grad * magnitude.clamp(max=clamp_max) / magnitude # 0.2 + grad = grad.detach() + grad = grad_fn(grad, t) + x = x.detach() + x = x.requires_grad_() + var = x.var() + var_losses = (var.pow(2).clamp(min=1) - 1) * var_scale + var_losses.backward() + grad -= x.grad + print(grad.abs().mean(), x.grad.abs().mean(), end="\r") + return grad + + +def null_fn(x_in): + return torch.zeros_like(x_in) + + +def display_handler(x, i, cadance=5, decode=True): + global img_tensor, image_grid, p, progress + img_tensor = x + if i % cadance == 0: + if decode: + x = model.decode_first_stage(x) + grid = make_grid( + torch.clamp((x + 1.0) / 2.0, min=0.0, max=1.0), round(x.shape[0] ** 0.5) + ) + grid = 255.0 * rearrange(grid, "c h w -> h w c").detach().cpu().numpy() + image_grid = grid.copy(order="C") + with io.BytesIO() as output: + im = Image.fromarray(grid.astype(np.uint8)) + im.save(output, format="PNG") + if progress: + progress.value = output.getvalue() + if generate_video: + im.save(p.stdin, "PNG") + + +def grad_fn(x, t): + if t <= 500 and grad_blur: + x = TF.gaussian_blur(x, 2 * round(int(max(grad_blur - t / 150, 1))) - 1, 1.5) + return x + + +def cond_clamp(image, t): + t = 1000 - t[0] + if t <= max(punish_steps, compress_steps): + s = torch.quantile( + rearrange(image, "b ... 
-> b (...)").abs(), threshold_percentile, dim=-1 + ) + s = s.view(-1, *((1,) * (image.ndim - 1))) + ths = s.clamp(min=threshold) + im_max = image.clamp(min=ths) - image.clamp(min=ths, max=ths) + im_min = image.clamp(max=-ths, min=-ths) - image.clamp(max=-ths) + if t <= punish_steps: + image = ( + image.clamp(min=-ths, max=ths) + (im_max - im_min) * punish_factor + ) # ((im_max-im_min)*punish_factor).tanh()/punish_factor + if t <= compress_steps: + image = image / (ths / threshold) ** compress_factor + image += noise_like(image.shape, device, False) * ( + (ths / threshold) ** compress_factor - 1 + ) + return image + + +def make_schedule(t_start, t_end, step_size=1): + schedule = [] + par_schedule = [] + t = t_start + while t > t_end: + schedule.append(t) + t -= step_size + schedule.append(t_end) + return np.array(schedule) + + +def list_mul_to_array(list_mul): + i = 0 + mul_count = 0 + mul_string = "" + full_list = list_mul + full_list_len = len(full_list) + for item in full_list: + if i == 0: + last_item = item + if item == last_item: + mul_count += 1 + if item != last_item or full_list_len == i + 1: + mul_string = mul_string + f" [{last_item}]*{mul_count} +" + mul_count = 1 + last_item = item + i += 1 + return mul_string[1:-2] + + +def generate_settings_file(add_prompts=False, add_dimensions=False): + + if add_prompts: + prompts = f""" + clip_prompts = {clip_prompts} + latent_prompts = {latent_prompts} + latent_negatives = {latent_negatives} + image_prompts = [] + """ + else: + prompts = "" + + if add_dimensions: + dimensions = f"""width = {width} + height = {height} + """ + else: + dimensions = "" + settings = f""" + #This settings file can be loaded back to Latent Majesty Diffusion. 
If you like your setting consider sharing it to the settings library at https://github.com/multimodalart/MajestyDiffusion + [clip_list] + perceptors = {clip_load_list} + + [basic_settings] + #Perceptor things + {prompts} + {dimensions} + latent_diffusion_guidance_scale = {latent_diffusion_guidance_scale} + clip_guidance_scale = {clip_guidance_scale} + aesthetic_loss_scale = {aesthetic_loss_scale} + augment_cuts={augment_cuts} + + #Init image settings + starting_timestep = {starting_timestep} + init_scale = {init_scale} + init_brightness = {init_brightness} + init_noise = {init_noise} + + [advanced_settings] + #Add CLIP Guidance and all the flavors or just run normal Latent Diffusion + use_cond_fn = {use_cond_fn} + + #Custom schedules for cuts. Check out the schedules documentation here + custom_schedule_setting = {custom_schedule_setting} + + #Cut settings + clamp_index = {clamp_index} + cut_overview = {list_mul_to_array(cut_overview)} + cut_innercut = {list_mul_to_array(cut_innercut)} + cut_blur_n = {list_mul_to_array(cut_blur_n)} + cut_blur_kernel = {cut_blur_kernel} + cut_ic_pow = {cut_ic_pow} + cut_icgray_p = {list_mul_to_array(cut_icgray_p)} + cutn_batches = {cutn_batches} + range_index = {list_mul_to_array(range_index)} + active_function = "{active_function}" + ths_method= "{ths_method}" + tv_scales = {list_mul_to_array(tv_scales)} + latent_tv_loss = {latent_tv_loss} + + #If you uncomment this line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale will be used + clip_guidance_schedule = {list_mul_to_array(clip_guidance_index)} + + #Apply symmetric loss (force simmetry to your results) + symmetric_loss_scale = {symmetric_loss_scale} + + #Latent Diffusion Advanced Settings + #Use when latent upscale to correct satuation problem + scale_div = {scale_div} + #Magnify grad before clamping by how many times + opt_mag_mul = {opt_mag_mul} + opt_ddim_eta = {opt_ddim_eta} + opt_eta_end = {opt_eta_end} + opt_temperature = {opt_temperature} + + #Grad advanced settings + grad_center = {grad_center} + #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept + grad_scale={grad_scale} + score_modifier = {score_modifier} + threshold_percentile = {threshold_percentile} + threshold = {threshold} + var_index = {list_mul_to_array(var_index)} + + #Init image advanced settings + init_rotate={init_rotate} + mask_rotate={mask_rotate} + init_magnitude = {init_magnitude} + + #More settings + RGB_min = {RGB_min} + RGB_max = {RGB_max} + #How to pad the image with cut_overview + padargs = {padargs} + flip_aug={flip_aug} + + #Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14 + experimental_aesthetic_embeddings = {experimental_aesthetic_embeddings} + #How much you want this to influence your result + experimental_aesthetic_embeddings_weight = {experimental_aesthetic_embeddings_weight} + #9 are good aesthetic embeddings, 0 are bad ones + experimental_aesthetic_embeddings_score = {experimental_aesthetic_embeddings_score} + + # For fun dont change except if you really know what your are doing + grad_blur = {grad_blur} + compress_steps = {compress_steps} + compress_factor = {compress_factor} + punish_steps = {punish_steps} + punish_factor = {punish_factor} + """ + return settings + + +def load_clip_models(): + global clip_model, clip_size, clip_tokenize, clip_normalize, clip_list + for item in mmc_models: + print("Loaded ", 
item["id"]) + clip_list.append(item["id"]) + model_loaders = REGISTRY.find(**item) + for model_loader in model_loaders: + clip_model_loaded = model_loader.load() + clip_model[item["id"]] = MockOpenaiClip(clip_model_loaded) + clip_size[item["id"]] = clip_model[item["id"]].visual.input_resolution + clip_tokenize[item["id"]] = clip_model[item["id"]].preprocess_text() + clip_normalize[item["id"]] = normalize + + +def full_clip_load(): + torch.cuda.empty_cache() + gc.collect() + get_mmc_models() + load_clip_models() + + +# Alstro's aesthetic model +def load_aesthetic_model(): + global aesthetic_model_336, aesthetic_model_224, aesthetic_model_16, aesthetic_model_32 + aesthetic_model_336 = torch.nn.Linear(768, 1).cuda() + aesthetic_model_336.load_state_dict( + torch.load(f"{model_path}/ava_vit_l_14_336_linear.pth") + ) + + aesthetic_model_224 = torch.nn.Linear(768, 1).cuda() + aesthetic_model_224.load_state_dict( + torch.load(f"{model_path}/ava_vit_l_14_linear.pth") + ) + + aesthetic_model_16 = torch.nn.Linear(512, 1).cuda() + aesthetic_model_16.load_state_dict( + torch.load(f"{model_path}/ava_vit_b_16_linear.pth") + ) + + aesthetic_model_32 = torch.nn.Linear(512, 1).cuda() + aesthetic_model_32.load_state_dict( + torch.load(f"{model_path}/sa_0_4_vit_b_32_linear.pth") + ) + + +def load_lpips_model(): + global lpips_model + lpips_model = lpips.LPIPS(net="vgg").to(device) + + +def config_init_image(): + global custom_schedule_setting + if ( + ((init_image is not None) and (init_image != "None") and (init_image != "")) + and starting_timestep != 1 + and custom_schedule_setting[0][1] == 1000 + ): + custom_schedule_setting[0] = [ + custom_schedule_setting[0][0], + int(custom_schedule_setting[0][1] * starting_timestep), + custom_schedule_setting[0][2], + ] + + +def config_clip_guidance(): + global clip_guidance_index, clip_guidance_schedule, clip_guidance_scale + if clip_guidance_schedule: + clip_guidance_index = clip_guidance_schedule + else: + clip_guidance_index = 
[clip_guidance_scale] * 1000 + + +def config_output_size(): + global opt + opt.W = (width // 64) * 64 + opt.H = (height // 64) * 64 + if opt.W != width or opt.H != height: + print( + f"Changing output size to {opt.W}x{opt.H}. Dimensions must by multiples of 64." + ) + + +def config_options(): + global aes_scale, opt, aug, clamp_index_variation, score_corrector + aes_scale = aesthetic_loss_scale + opt.mag_mul = opt_mag_mul + opt.ddim_eta = opt_ddim_eta + opt.eta_end = opt_eta_end + opt.temperature = opt_temperature + opt.n_iter = how_many_batches + opt.n_samples = n_samples + opt.scale = latent_diffusion_guidance_scale + aug = augment_cuts + if len(clamp_index) == 2: + clamp_index_variation = np.linspace(clamp_index[0], clamp_index[1], 1000) + else: + clamp_index_variation = clamp_index + score_corrector = DotMap() + score_corrector.modify_score = modify_score + + +def modify_score(e_t, e_t_uncond): + if score_modifier is False: + return e_t + else: + e_t_d = e_t - e_t_uncond + s = torch.quantile( + rearrange(e_t_d, "b ... 
-> b (...)").abs().float(), + threshold_percentile, + dim=-1, + ) + + s.clamp_(min=1.0) + s = s.view(-1, *((1,) * (e_t_d.ndim - 1))) + if ths_method == "softsign": + e_t_d = F.softsign(e_t_d * 3) / s / 3 + elif ths_method == "clamp": + e_t_d = e_t_d.clamp(-s, s) / s + e_t = e_t_uncond + e_t_d + return e_t + + +def use_args(args: argparse.Namespace): + global_var_scope = globals() + warnings.filterwarnings("ignore") + for k, v in vars(args).items(): + global_var_scope[k] = v + + +def load_custom_settings(): + global_var_scope = globals() + global clip_load_list + warnings.filterwarnings("ignore") + if ( + custom_settings is not None + and custom_settings != "" + and custom_settings != "path/to/settings.cfg" + ): + print("Loaded ", custom_settings) + try: + from configparser import ConfigParser + except ImportError: + from ConfigParser import ConfigParser + import configparser + + config = ConfigParser() + config.read(custom_settings) + # custom_settings_stream = fetch(custom_settings) + # Load CLIP models from config + if config.has_section("clip_list"): + clip_incoming_list = config.items("clip_list") + clip_incoming_models = clip_incoming_list[0] + incoming_perceptors = eval(clip_incoming_models[1]) + if (len(incoming_perceptors) != len(clip_load_list)) or not all( + elem in incoming_perceptors for elem in clip_load_list + ): + clip_load_list = incoming_perceptors + + # Load settings from config and replace variables + if config.has_section("basic_settings"): + basic_settings = config.items("basic_settings") + for basic_setting in basic_settings: + global_var_scope[basic_setting[0]] = eval(basic_setting[1]) + + if config.has_section("advanced_settings"): + advanced_settings = config.items("advanced_settings") + for advanced_setting in advanced_settings: + global_var_scope[advanced_setting[0]] = eval(advanced_setting[1]) + + +def do_run(): + global has_purged + if has_purged: + global clip_model, clip_size, clip_tokenize, clip_normalize, clip_list + 
full_clip_load() + has_purged = False + global opt, model, p, base_count, make_cutouts, progress, target_embeds, weights, zero_embed, init, scale_factor, cur_step + if generate_video: + fps = 24 + p = Popen( + [ + "ffmpeg", + "-y", + "-f", + "image2pipe", + "-vcodec", + "png", + "-r", + str(fps), + "-i", + "-", + "-vcodec", + "libx264", + "-r", + str(fps), + "-pix_fmt", + "yuv420p", + "-crf", + "17", + "-preset", + "veryslow", + "video.mp4", + ], + stdin=PIPE, + ) + # with torch.cuda.amp.autocast(): + cur_step = 0 + scale_factor = 1 + make_cutouts = {} + for i in clip_list: + make_cutouts[i] = MakeCutouts(clip_size[i], Overview=1) + for i in clip_list: + target_embeds[i] = [] + weights[i] = [] + + for prompt in prompts: + txt, weight = parse_prompt(prompt) + for i in clip_list: + if "cloob" not in i: + with torch.cuda.amp.autocast(): + embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device)) + target_embeds[i].append(embeds) + weights[i].append(weight) + else: + embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device)) + target_embeds[i].append(embeds) + weights[i].append(weight) + + for prompt in image_prompts: + print(f"processing{prompt}", end="\r") + path, weight = parse_prompt(prompt) + img = Image.open(fetch(path)).convert("RGB") + img = TF.resize( + img, min(opt.W, opt.H, *img.size), transforms.InterpolationMode.LANCZOS + ) + for i in clip_list: + if "cloob" not in i: + with torch.cuda.amp.autocast(): + batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device)) + embed = clip_model[i].encode_image(clip_normalize[i](batch)) + target_embeds[i].append(embed) + weights[i].extend([weight]) + else: + batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device)) + embed = clip_model[i].encode_image(clip_normalize[i](batch)) + target_embeds[i].append(embed) + weights[i].extend([weight]) + # if anti_jpg != 0: + # target_embeds["ViT-B-32--openai"].append( + # torch.tensor( + # [ + # 
np.load(f"{model_path}/openimages_512x_png_embed224.npz")["arr_0"] + # - np.load(f"{model_path}/imagenet_512x_jpg_embed224.npz")["arr_0"] + # ], + # device=device, + # ) + # ) + # weights["ViT-B-32--openai"].append(anti_jpg) + + for i in clip_list: + target_embeds[i] = torch.cat(target_embeds[i]) + weights[i] = torch.tensor([weights[i]], device=device) + shape = [4, opt.H // 8, opt.W // 8] + init = None + mask = None + transform = T.GaussianBlur(kernel_size=3, sigma=0.4) + if init_image is not None: + init = Image.open(fetch(init_image)).convert("RGB") + init = TF.to_tensor(init).to(device).unsqueeze(0) + if init_rotate: + init = torch.rot90(init, 1, [3, 2]) + init = resize(init, out_shape=[opt.n_samples, 3, opt.H, opt.W]) + init = init.mul(2).sub(1).half() + init_encoded = ( + model.first_stage_model.encode(init).sample() * init_magnitude + + init_brightness + ) + init_encoded = init_encoded + noise_like(init_encoded.shape, device, False).mul( + init_noise + ) + else: + init = None + init_encoded = None + if init_mask is not None: + mask = Image.open(fetch(init_mask)).convert("RGB") + mask = TF.to_tensor(mask).to(device).unsqueeze(0) + if mask_rotate: + mask = torch.rot90(init, 1, [3, 2]) + mask = resize(mask, out_shape=[opt.n_samples, 1, opt.H // 8, opt.W // 8]) + mask = transform(mask) + print(mask) + + if progress: + display.display(progress) + + if opt.plms: + sampler = PLMSSampler(model) + else: + sampler = DDIMSampler(model) + + os.makedirs(opt.outdir, exist_ok=True) + outpath = opt.outdir + + prompt = opt.prompt + sample_path = os.path.join(outpath, "samples") + os.makedirs(sample_path, exist_ok=True) + base_count = len(os.listdir(sample_path)) + + all_samples = list() + last_step_upscale = False + eta1 = opt.ddim_eta + eta2 = opt.eta_end + with torch.enable_grad(): + with torch.cuda.amp.autocast(): + with model.ema_scope(): + uc = None + if opt.scale != 1.0: + uc = model.get_learned_conditioning(opt.n_samples * opt.uc).cuda() + + for n in 
trange(opt.n_iter, desc="Sampling"): + torch.cuda.empty_cache() + gc.collect() + c = model.get_learned_conditioning(opt.n_samples * prompt).cuda() + if init_encoded is None: + x_T = torch.randn([opt.n_samples, *shape], device=device) + else: + x_T = init_encoded + + for custom_schedule in custom_schedules: + if type(custom_schedule) != type(""): + torch.cuda.empty_cache() + gc.collect() + last_step_upscale = False + samples_ddim, _ = sampler.sample( + S=opt.ddim_steps, + conditioning=c, + batch_size=opt.n_samples, + shape=shape, + custom_schedule=custom_schedule, + verbose=False, + unconditional_guidance_scale=opt.scale, + unconditional_conditioning=uc, + eta=eta1, + eta_end=eta2, + img_callback=None if use_cond_fn else display_handler, + cond_fn=cond_fn if use_cond_fn else None, + temperature=opt.temperature, + x_adjust_fn=cond_clamp, + x_T=x_T, + x0=x_T, + mask=mask, + score_corrector=score_corrector, + corrector_kwargs={}, + ) + x_T = samples_ddim.clamp(-6, 6) + else: + torch.cuda.empty_cache() + gc.collect() + method, scale_factor = custom_schedule.split(":") + if method == "RGB": + scale_factor = float(scale_factor) + temp_file_name = ( + "temp_" + f"{str(round(time.time()))}.png" + ) + temp_file = os.path.join(sample_path, temp_file_name) + im.save(temp_file, format="PNG") + init = Image.open(fetch(temp_file)).convert("RGB") + init = TF.to_tensor(init).to(device).unsqueeze(0) + opt.H, opt.W = ( + opt.H * scale_factor, + opt.W * scale_factor, + ) + init = resize( + init, + out_shape=[opt.n_samples, 3, opt.H, opt.W], + antialiasing=True, + ) + init = init.mul(2).sub(1).half() + x_T = ( + model.first_stage_model.encode(init).sample() + * init_magnitude + ) + x_T += noise_like(x_T.shape, device, False) * init_noise + x_T = x_T.clamp(-6, 6) + if method == "gfpgan": + scale_factor = float(scale_factor) + last_step_upscale = True + temp_file_name = ( + "temp_" + f"{str(round(time.time()))}.png" + ) + temp_file = os.path.join(sample_path, temp_file_name) + 
im.save(temp_file, format="PNG") + GFP_factor = 2 if scale_factor > 1 else 1 + GFP_ver = 1.3 # if GFP_factor == 1 else 1.2 + + torch.cuda.empty_cache() + gc.collect() + + subprocess.call( + [ + "python3", + "inference_gfpgan.py", + "-i", + temp_file, + "-o", + "/tmp/results", + "-v", + str(GFP_ver), + "-s", + str(GFP_factor), + ], + cwd="GFPGAN", + shell=False, + ) + + face_corrected = Image.open( + fetch( + f"/tmp/results/restored_imgs/{temp_file_name}" + ) + ) + with io.BytesIO() as output: + face_corrected.save(output, format="PNG") + if progress: + progress.value = output.getvalue() + init = Image.open( + fetch( + f"/tmp/results/restored_imgs/{temp_file_name}" + ) + ).convert("RGB") + init = TF.to_tensor(init).to(device).unsqueeze(0) + opt.H, opt.W = ( + opt.H * scale_factor, + opt.W * scale_factor, + ) + init = resize( + init, + out_shape=[opt.n_samples, 3, opt.H, opt.W], + antialiasing=True, + ) + init = init.mul(2).sub(1).half() + x_T = ( + model.first_stage_model.encode(init).sample() + * init_magnitude + ) + x_T += noise_like(x_T.shape, device, False) * init_noise + x_T = x_T.clamp(-6, 6) + if method == "purge": + has_purged = True + for i in scale_factor.split(","): + if i in clip_load_list: + arch, pub, m_id = i[1:-1].split(" - ") + print("Purge ", i) + del clip_list[clip_list.index(m_id)] + del clip_model[m_id] + del clip_size[m_id] + del clip_tokenize[m_id] + del clip_normalize[m_id] + + # last_step_uspcale_list.append(last_step_upscale) + scale_factor = 1 + current_time = str(round(time.time())) + if last_step_upscale: + latest_upscale = Image.open( + fetch(f"/tmp/results/restored_imgs/{temp_file_name}") + ).convert("RGB") + latest_upscale.save( + os.path.join(outpath, f"{current_time}.png"), format="PNG" + ) + else: + Image.fromarray(image_grid.astype(np.uint8)).save( + os.path.join(outpath, f"{current_time}.png"), format="PNG" + ) + settings = generate_settings_file( + add_prompts=True, add_dimensions=False + ) + text_file = 
open(f"{outpath}/{current_time}.cfg", "w") + text_file.write(settings) + text_file.close() + x_samples_ddim = model.decode_first_stage(samples_ddim) + x_samples_ddim = torch.clamp( + (x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0 + ) + all_samples.append(x_samples_ddim) + + if len(all_samples) > 1: + # additionally, save as grid + grid = torch.stack(all_samples, 0) + grid = rearrange(grid, "n b c h w -> (n b) c h w") + grid = make_grid(grid, nrow=opt.n_samples) + + # to image + grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy() + Image.fromarray(grid.astype(np.uint8)).save( + os.path.join(outpath, f"grid_{str(round(time.time()))}.png") + ) + + if generate_video: + p.stdin.close() From fb462c3d3b94a94d04b054b84a588cc141b0f206 Mon Sep 17 00:00:00 2001 From: Stephan Auerhahn Date: Sun, 12 Jun 2022 04:27:09 -0700 Subject: [PATCH 13/13] Fix resampling (#6) * Attempted fix for GFPGAN * Fix missing global --- majesty.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/majesty.py b/majesty.py index 6514179..572b279 100644 --- a/majesty.py +++ b/majesty.py @@ -684,7 +684,7 @@ def null_fn(x_in): def display_handler(x, i, cadance=5, decode=True): - global img_tensor, image_grid, p, progress + global progress, image_grid, writer, img_tensor, im, p img_tensor = x if i % cadance == 0: if decode: @@ -1279,7 +1279,7 @@ def do_run(): "-i", temp_file, "-o", - "/tmp/results", + "results", "-v", str(GFP_ver), "-s", @@ -1291,7 +1291,7 @@ def do_run(): face_corrected = Image.open( fetch( - f"/tmp/results/restored_imgs/{temp_file_name}" + f"GFPGAN/results/restored_imgs/{temp_file_name}" ) ) with io.BytesIO() as output: @@ -1300,7 +1300,7 @@ def do_run(): progress.value = output.getvalue() init = Image.open( fetch( - f"/tmp/results/restored_imgs/{temp_file_name}" + f"GFPGAN/results/restored_imgs/{temp_file_name}" ) ).convert("RGB") init = TF.to_tensor(init).to(device).unsqueeze(0) @@ -1337,7 +1337,7 @@ def do_run(): current_time = 
str(round(time.time())) if last_step_upscale: latest_upscale = Image.open( - fetch(f"/tmp/results/restored_imgs/{temp_file_name}") + fetch(f"GFPGAN/results/restored_imgs/{temp_file_name}") ).convert("RGB") latest_upscale.save( os.path.join(outpath, f"{current_time}.png"), format="PNG"