diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d6d95cf --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +models \ No newline at end of file diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 0000000..5d793d4 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,93 @@ +name: Docker + +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +on: + schedule: + - cron: '31 4 * * *' + push: + branches: [ main ] + # Publish semver tags as releases. + tags: [ 'v*.*.*' ] + pull_request: + branches: [ main ] + +env: + # Use docker.io for Docker Hub if empty + REGISTRY: ghcr.io + # github.repository as <account>/<repo> + IMAGE_NAME: ${{ github.repository }} + + +jobs: + build: + + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + # This is used to complete the identity challenge + # with sigstore/fulcio when running outside of PRs. 
+ id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + # Install the cosign tool except on PR + # https://github.com/sigstore/cosign-installer +# - name: Install cosign +# if: github.event_name != 'pull_request' +# uses: sigstore/cosign-installer@d6a3abf1bdea83574e28d40543793018b6035605 +# with: +# cosign-release: 'v1.7.1' + + + # Workaround: https://github.com/docker/build-push-action/issues/461 + - name: Setup Docker buildx + uses: docker/setup-buildx-action@79abd3f86f79a9d68a23c75a09a9a85889262adf + + # Login against a Docker registry except on PR + # https://github.com/docker/login-action + - name: Log into registry ${{ env.REGISTRY }} + if: github.event_name != 'pull_request' + uses: docker/login-action@28218f9b04b4f3f62068d7b6ce6ca5b26e35336c + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + # Extract metadata (tags, labels) for Docker + # https://github.com/docker/metadata-action + - name: Extract Docker metadata + id: meta + uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + + # Build and push Docker image with Buildx (don't push on PR) + # https://github.com/docker/build-push-action + - name: Build and push Docker image + id: build-and-push + uses: docker/build-push-action@ac9327eae2b366085ac7f6a2d02df8aa8ead720a + with: + context: . + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + # Sign the resulting Docker image digest except on PRs. + # This will only write to the public Rekor transparency log when the Docker + # repository is public to avoid leaking data. If you would like to publish + # transparency data even for private images, pass --force to cosign below. 
+ # https://github.com/sigstore/cosign +# - name: Sign the published Docker image +# if: ${{ github.event_name != 'pull_request' }} +# env: +# COSIGN_EXPERIMENTAL: "true" + # This step uses the identity token to provision an ephemeral certificate + # against the sigstore community Fulcio instance. +# run: cosign sign ${{ steps.meta.outputs.tags }}@${{ steps.build-and-push.outputs.digest }} \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d6d95cf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +models \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..71a5a73 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,63 @@
+FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime
+
+# OS packages: git/git-lfs/wget fetch the sources, build-essential compiles
+# native Python extensions. ca-certificates is listed explicitly because it is
+# normally only pulled in as a recommended package and the clones use HTTPS.
+# The apt lists are deleted in the same layer so they never reach the image.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        git \
+        git-lfs \
+        tzdata \
+        wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# WORKDIR creates the directory if it is missing, so no mkdir is needed.
+WORKDIR /src
+
+# NOTE(review): the clones follow moving branches, so rebuilds are not
+# reproducible; consider pinning each repository to a commit.
+RUN git clone https://github.com/multimodalart/latent-diffusion --branch 1.4
+RUN git clone https://github.com/CompVis/taming-transformers
+RUN git clone https://github.com/TencentARC/GFPGAN
+RUN git lfs clone https://github.com/LAION-AI/aesthetic-predictor
+
+# --no-cache-dir keeps pip's download cache out of the image layers.
+RUN pip install --no-cache-dir tensorflow==2.9.1
+RUN pip install --no-cache-dir -e ./taming-transformers
+# The specifiers are quoted because an unquoted ">" is a shell redirection:
+# the version constraints were dropped and the shell created stray files named
+# "=2.0.0" and "=1.0.8" to receive pip's output.
+RUN pip install --no-cache-dir "omegaconf>=2.0.0" "pytorch-lightning>=1.0.8" torch-fidelity einops
+RUN pip install --no-cache-dir transformers
+RUN pip install --no-cache-dir dotmap
+RUN pip install --no-cache-dir resize-right
+RUN pip install --no-cache-dir piq
+RUN pip install --no-cache-dir lpips
+RUN pip install --no-cache-dir basicsr
+RUN pip install --no-cache-dir facexlib
+RUN pip install --no-cache-dir realesrgan
+RUN pip install --no-cache-dir ipywidgets
+
+RUN git clone https://github.com/apolinario/Multi-Modal-Comparators --branch gradient_checkpointing
+RUN pip install --no-cache-dir poetry
+WORKDIR /src/Multi-Modal-Comparators
+# "&&" rather than ";" so a failed "poetry build" stops the image build
+# instead of being masked by the following command.
+RUN poetry build && pip install --no-cache-dir dist/mmc*.whl
+WORKDIR /src
+RUN python Multi-Modal-Comparators/src/mmc/napm_installs/__init__.py
+
+# Mount points for the models directory and root's cache directory.
+VOLUME [ "/src/models" ]
+VOLUME [ "/root/.cache" ]
+
+COPY majesty.py .
+COPY latent.py .
+# NOTE(review): if latent_settings_library is a directory, COPY puts its
+# contents directly into /src, not into /src/latent_settings_library --
+# confirm that is what latent.py expects.
+COPY latent_settings_library .
+ENTRYPOINT ["python", "latent.py"]
diff --git a/README.md b/README.md index ceb5647..72e92cf 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,12 @@ Current implementations: - [V-Majesty Diffusion](#v-majesty-diffusion-v12) -## Latent Majesty Diffusion v1.3 +## Latent Majesty Diffusion v1.4 ##### Formerly known as Latent Princess Generator [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/multimodalart/MajestyDiffusion/blob/main/latent.ipynb) A [Dango233](https://github.com/Dango233) and [apolinario (@multimodalart)](https://github.com/multimodalart) Colab notebook implementing [CompVis](https://github.com/CompVis)' Latent Diffusion, with the following changes: +v1.2 - Added [Dango233](https://github.com/Dango233) CLIP Guidance - Added [Dango233](https://github.com/Dango233) magical **new** step and upscaling scheduling - Added [Dango233](https://github.com/Dango233) cuts, augs and attributes scheduling @@ -31,6 +32,13 @@ A [Dango233](https://github.com/Dango233) and [apolinario (@multimodalart)](http - Added [LAION-AI](https://github.com/LAION-AI/aesthetic-predictor) aesthetic predictor embeddings - Added [Dango233](https://github.com/Dango233) inpainting mode - Added [apolinario (@multimodalart)](https://github.com/multimodalart) savable settings and setting library (including `colab-free-default`, `dango233-princesses`, `the-other-zippy` and `makaitrad` shared settings. Share yours with us too with a pull request! 
+v1.3 + - Better Upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion)) +v1.4 + - Added [Dango233](https://github.com/Dango233) Customised Dynamic Thresholding + - Added [open_clip](https://github.com/mlfoundations/open_clip) ViT-L/14 LAION-400M trained + - Fix CLOOB perceptor from MMC + - Removes latent upscaler (was broken), adds RGB upscaler ## V-Majesty Diffusion v1.2 ##### Formerly known as Princess Generator ver. Victoria diff --git a/latent.ipynb b/latent.ipynb index 8e06f7a..9ebb837 100644 --- a/latent.ipynb +++ b/latent.ipynb @@ -16,7 +16,7 @@ "id": "NUmmV5ZvrPbP" }, "source": [ - "# Latent Majesty Diffusion v1.3\n", + "# Latent Majesty Diffusion v1.4\n", "#### Formerly known as Princess Generator\n", "##### Access our [Majestic Guide](https://multimodal.art/majesty-diffusion) (_under construction_), our [GitHub](https://github.com/multimodalart/majesty-diffusion), join our community on [Discord](https://discord.gg/yNBtQBEDfZ) or reach out via [@multimodalart on Twitter](https://twitter.com/multimodalart))\n", "\\\n", @@ -28,7 +28,9 @@ "#### CLIP Guided Latent Diffusion by [dango233](https://github.com/Dango233/) and [apolinario (@multimodalart)](https://twitter.com/multimodalart). \n", "The LAION-400M-trained model and the modified inference code are from [CompVis Latent Diffusion](https://github.com/CompVis/latent-diffusion). The guided-diffusion method is modified by Dango233 based on [Katherine Crowson](https://twitter.com/RiversHaveWings)'s guided diffusion notebook. multimodalart savable settings, MMC and assembled the Colab. Check the complete list on our GitHub. 
Some functions and methods are from various code masters (nsheppard, DanielRussRuss and others)\n", "\n", - "Changelog: 1.3 - better upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion))" + "Changelog: 1.3 - better upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion))\n", + "\n", + "Changelog: 1.4 - better defaults, added ViT-L/14 LAION-400M trained, fix CLOOB, adds modified dynamic thresholding, removes latent upscaler (was broken), adds RGB upscaler\n" ] }, { @@ -120,10 +122,10 @@ " downgrade_pytorch_result = subprocess.run(['pip', 'install', 'torch==1.10.2', 'torchvision==0.11.3', '-q'], stdout=subprocess.PIPE).stdout.decode('utf-8')\n", " import sys\n", " sys.path.append(\".\")\n", - " !git clone https://github.com/multimodalart/latent-diffusion\n", + " !git clone https://github.com/multimodalart/latent-diffusion --branch 1.4\n", " !git clone https://github.com/CompVis/taming-transformers\n", " !git clone https://github.com/TencentARC/GFPGAN\n", - " !git clone https://github.com/multimodalart/majesty-diffusion\n", + " !git clone https://github.com/NightmareAI/majesty-diffusion --branch 1.4\n", " !git lfs clone https://github.com/LAION-AI/aesthetic-predictor\n", " !pip install -e ./taming-transformers\n", " !pip install omegaconf>=2.0.0 pytorch-lightning>=1.0.8 torch-fidelity einops\n", @@ -174,57 +176,11 @@ "outputs": [], "source": [ "#@title Download models\n", - "import os\n", - "if os.path.isfile(f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\"):\n", - " print(\"Using Latent Diffusion model saved from Google Drive\")\n", - "else: \n", - " !wget -O $model_path/latent_diffusion_txt2img_f8_large.ckpt https://ommer-lab.com/files/latent-diffusion/nitro/txt2img-f8-large/model.ckpt --no-check-certificate\n", - "\n", - "if os.path.isfile(f\"{model_path}/finetuned_state_dict.pt\"):\n", - " print(\"Using Latent Diffusion model saved from Google Drive\")\n", - "else: \n", - " 
!wget -O $model_path/finetuned_state_dict.pt https://huggingface.co/multimodalart/compvis-latent-diffusion-text2img-large/resolve/main/finetuned_state_dict.pt --no-check-certificate\n", - "\n", - "if os.path.isfile(f\"{model_path}/ava_vit_l_14_336_linear.pth\"):\n", - " print(\"Using ViT-L/14@336px aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/ava_vit_l_14_336_linear.pth https://multimodal.art/models/ava_vit_l_14_336_linear.pth\n", - "\n", - "if os.path.isfile(f\"{model_path}/sa_0_4_vit_l_14_linear.pth\"):\n", - " print(\"Using ViT-L/14 aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/sa_0_4_vit_l_14_linear.pth https://multimodal.art/models/sa_0_4_vit_l_14_linear.pth\n", - "\n", - "if os.path.isfile(f\"{model_path}/ava_vit_l_14_linear.pth\"):\n", - " print(\"Using ViT-L/14 aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/ava_vit_l_14_linear.pth https://multimodal.art/models/ava_vit_l_14_linear.pth\n", - "\n", - "if os.path.isfile(f\"{model_path}/ava_vit_b_16_linear.pth\"):\n", - " print(\"Using ViT-B/16 aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/ava_vit_b_16_linear.pth http://batbot.tv/ai/models/v-diffusion/ava_vit_b_16_linear.pth\n", - "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_16_linear.pth\"):\n", - " print(\"Using ViT-B/16 sa aesthetic model already saved\")\n", - "else:\n", - " !wget -O $model_path/sa_0_4_vit_b_32_linear.pth https://multimodal.art/models/sa_0_4_vit_b_16_linear.pth\n", - "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"):\n", - " print(\"Using ViT-B/32 aesthetic model from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/sa_0_4_vit_b_32_linear.pth https://multimodal.art/models/sa_0_4_vit_b_32_linear.pth\n", - "if os.path.isfile(f\"{model_path}/openimages_512x_png_embed224.npz\"):\n", - " print(\"Using openimages png from Google Drive\")\n", - "else:\n", - " !wget -O 
$model_path/openimages_512x_png_embed224.npz https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/openimages_512x_png_embed224.npz\n", - "if os.path.isfile(f\"{model_path}/imagenet_512x_jpg_embed224.npz\"):\n", - " print(\"Using imagenet antijpeg from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/imagenet_512x_jpg_embed224.npz https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/imagenet_512x_jpg_embed224.npz\n", - "if os.path.isfile(f\"{model_path}/GFPGANv1.3.pth\"):\n", - " print(\"Using GFPGAN v1.3 from Google Drive\")\n", - "else:\n", - " !wget -O $model_path/GFPGANv1.3.pth https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth\n", - "!cp $model_path/GFPGANv1.3.pth GFPGAN/experiments/pretrained_models/GFPGANv1.3.pth\n" + "import sys\n", + "sys.path.append('./majesty-diffusion')\n", + "import majesty\n", + "majesty.model_path = model_path\n", + "majesty.download_models()\n" ] }, { @@ -258,6 +214,8 @@ "source": [ "#@title Import stuff\n", "import argparse, os, sys, glob\n", + "sys.path.append('./majesty-diffusion')\n", + "import majesty\n", "import torch\n", "import numpy as np\n", "from omegaconf import OmegaConf\n", @@ -273,6 +231,7 @@ "from ldm.util import instantiate_from_config\n", "from ldm.models.diffusion.ddim import DDIMSampler\n", "from ldm.models.diffusion.plms import PLMSSampler\n", + "from ldm.modules.diffusionmodules.util import noise_like\n", "import tensorflow as tf\n", "from dotmap import DotMap\n", "import ipywidgets as widgets\n", @@ -327,658 +286,19 @@ "outputs": [], "source": [ "#@title Load the model\n", + "majesty.model_path = model_path\n", + "majesty.outputs_path = outputs_path\n", "torch.backends.cudnn.benchmark = True\n", "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", - "def load_model_from_config(config, ckpt, verbose=False, 
latent_diffusion_model=\"original\"):\n", - " print(f\"Loading model from {ckpt}\")\n", - " print(latent_diffusion_model)\n", - " model = instantiate_from_config(config.model)\n", - " sd = torch.load(ckpt, map_location=\"cuda\")[\"state_dict\"]\n", - " m, u = model.load_state_dict(sd, strict = False)\n", - " if(latent_diffusion_model == \"finetuned\"): \n", - " del sd\n", - " sd_finetune = torch.load(f\"{model_path}/finetuned_state_dict.pt\",map_location=\"cuda\")\n", - " m, u = model.model.load_state_dict(sd_finetune, strict = False)\n", - " model.model = model.model.half().eval().to(device)\n", - " del sd_finetune\n", - " # sd = pl_sd[\"state_dict\"]\n", - " \n", - " if len(m) > 0 and verbose:\n", - " print(\"missing keys:\")\n", - " print(m)\n", - " if len(u) > 0 and verbose:\n", - " print(\"unexpected keys:\")\n", - " print(u)\n", - "\n", - " model.requires_grad_(False).half().eval().to('cuda')\n", - " return model\n", + "majesty.device = device\n", "\n", "config = OmegaConf.load(\"./latent-diffusion/configs/latent-diffusion/txt2img-1p4B-eval.yaml\") # TODO: Optionally download from same location as ckpt and chnage this logic\n", - "model = load_model_from_config(config, f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\",False, latent_diffusion_model) # TODO: check path\n", - "model = model.half().eval().to(device)\n", + "model = majesty.load_model_from_config(config, f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\",False, latent_diffusion_model) # TODO: check path\n", + "majesty.model = model.half().eval().to(device)\n", "#if(latent_diffusion_model == \"finetuned\"):\n", - "# model.model = model.model.half().eval().to(device)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "HY_7vvnPThzS" - }, - "outputs": [], - "source": [ - "#@title Load necessary functions\n", - "def set_custom_schedules(schedule):\n", - " custom_schedules = []\n", - " for schedule_item in schedule:\n", - " 
if(isinstance(schedule_item,list)):\n", - " custom_schedules.append(np.arange(*schedule_item))\n", - " else:\n", - " custom_schedules.append(schedule_item)\n", - " \n", - " return custom_schedules\n", - "\n", - "def parse_prompt(prompt):\n", - " if prompt.startswith('http://') or prompt.startswith('https://') or prompt.startswith(\"E:\") or prompt.startswith(\"C:\") or prompt.startswith(\"D:\"):\n", - " vals = prompt.rsplit(':', 2)\n", - " vals = [vals[0] + ':' + vals[1], *vals[2:]]\n", - " else:\n", - " vals = prompt.rsplit(':', 1)\n", - " vals = vals + ['', '1'][len(vals):]\n", - " return vals[0], float(vals[1])\n", - "\n", - "\n", - "class MakeCutouts(nn.Module):\n", - " def __init__(self, cut_size,\n", - " Overview=4, \n", - " WholeCrop = 0, WC_Allowance = 10, WC_Grey_P=0.2,\n", - " InnerCrop = 0, IC_Size_Pow=0.5, IC_Grey_P = 0.2\n", - " ):\n", - " super().__init__()\n", - " self.cut_size = cut_size\n", - " self.Overview = Overview\n", - " self.WholeCrop= WholeCrop\n", - " self.WC_Allowance = WC_Allowance\n", - " self.WC_Grey_P = WC_Grey_P\n", - " self.InnerCrop = InnerCrop\n", - " self.IC_Size_Pow = IC_Size_Pow\n", - " self.IC_Grey_P = IC_Grey_P\n", - " self.augs = T.Compose([\n", - " #T.RandomHorizontalFlip(p=0.5),\n", - " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", - " T.RandomAffine(degrees=0, \n", - " translate=(0.05, 0.05), \n", - " #scale=(0.9,0.95),\n", - " fill=-1, interpolation = T.InterpolationMode.BILINEAR, ),\n", - " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", - " #T.RandomPerspective(p=1, interpolation = T.InterpolationMode.BILINEAR, fill=-1,distortion_scale=0.2),\n", - " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", - " T.RandomGrayscale(p=0.1),\n", - " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", - " T.ColorJitter(brightness=0.05, contrast=0.05, saturation=0.05),\n", - " ])\n", - "\n", - " def forward(self, input):\n", - " gray = transforms.Grayscale(3)\n", - " sideY, sideX = input.shape[2:4]\n", - " 
max_size = min(sideX, sideY)\n", - " min_size = min(sideX, sideY, self.cut_size)\n", - " l_size = max(sideX, sideY)\n", - " output_shape = [input.shape[0],3,self.cut_size,self.cut_size] \n", - " output_shape_2 = [input.shape[0],3,self.cut_size+2,self.cut_size+2]\n", - " pad_input = F.pad(input,((sideY-max_size)//2+round(max_size*0.055),(sideY-max_size)//2+round(max_size*0.055),(sideX-max_size)//2+round(max_size*0.055),(sideX-max_size)//2+round(max_size*0.055)), **padargs)\n", - " cutouts_list = []\n", - " \n", - " if self.Overview>0:\n", - " cutouts = []\n", - " cutout = resize(pad_input, out_shape=output_shape, antialiasing=True)\n", - " output_shape_all = list(output_shape)\n", - " output_shape_all[0]=self.Overview*input.shape[0]\n", - " pad_input = pad_input.repeat(input.shape[0],1,1,1)\n", - " cutout = resize(pad_input, out_shape=output_shape_all)\n", - " if aug: cutout=self.augs(cutout)\n", - " cutouts_list.append(cutout)\n", - " \n", - " if self.InnerCrop >0:\n", - " cutouts=[]\n", - " for i in range(self.InnerCrop):\n", - " size = int(torch.rand([])**self.IC_Size_Pow * (max_size - min_size) + min_size)\n", - " offsetx = torch.randint(0, sideX - size + 1, ())\n", - " offsety = torch.randint(0, sideY - size + 1, ())\n", - " cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size]\n", - " if i <= int(self.IC_Grey_P * self.InnerCrop):\n", - " cutout = gray(cutout)\n", - " cutout = resize(cutout, out_shape=output_shape)\n", - " cutouts.append(cutout)\n", - " if cutout_debug:\n", - " TF.to_pil_image(cutouts[-1].add(1).div(2).clamp(0, 1).squeeze(0)).save(\"content/diff/cutouts/cutout_InnerCrop.jpg\",quality=99)\n", - " cutouts_tensor = torch.cat(cutouts)\n", - " cutouts=[]\n", - " cutouts_list.append(cutouts_tensor)\n", - " cutouts=torch.cat(cutouts_list)\n", - " return cutouts\n", - "\n", - "\n", - "def spherical_dist_loss(x, y):\n", - " x = F.normalize(x, dim=-1)\n", - " y = F.normalize(y, dim=-1)\n", - " return (x - 
y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)\n", - "\n", - "\n", - "def tv_loss(input):\n", - " \"\"\"L2 total variation loss, as in Mahendran et al.\"\"\"\n", - " input = F.pad(input, (0, 1, 0, 1), 'replicate')\n", - " x_diff = input[..., :-1, 1:] - input[..., :-1, :-1]\n", - " y_diff = input[..., 1:, :-1] - input[..., :-1, :-1]\n", - " return (x_diff**2 + y_diff**2).mean([1, 2, 3])\n", - "\n", - "\n", - "def range_loss(input, range_min, range_max):\n", - " return (input - input.clamp(range_min,range_max)).pow(2).mean([1, 2, 3])\n", - "\n", - "def symmetric_loss(x):\n", - " w = x.shape[3]\n", - " diff = (x - torch.flip(x,[3])).square().mean().sqrt()/(x.shape[2]*x.shape[3]/1e4)\n", - " return(diff)\n", - "\n", - "def fetch(url_or_path):\n", - " \"\"\"Fetches a file from an HTTP or HTTPS url, or opens the local file.\"\"\"\n", - " if str(url_or_path).startswith('http://') or str(url_or_path).startswith('https://'):\n", - " r = requests.get(url_or_path)\n", - " r.raise_for_status()\n", - " fd = io.BytesIO()\n", - " fd.write(r.content)\n", - " fd.seek(0)\n", - " return fd\n", - " return open(url_or_path, 'rb')\n", - "\n", - "\n", - "def to_pil_image(x):\n", - " \"\"\"Converts from a tensor to a PIL image.\"\"\"\n", - " if x.ndim == 4:\n", - " assert x.shape[0] == 1\n", - " x = x[0]\n", - " if x.shape[0] == 1:\n", - " x = x[0]\n", - " return TF.to_pil_image((x.clamp(-1, 1) + 1) / 2)\n", - "\n", - "\n", - "normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],\n", - " std=[0.26862954, 0.26130258, 0.27577711])\n", - "\n", - "def centralized_grad(x, use_gc=True, gc_conv_only=False):\n", - " if use_gc:\n", - " if gc_conv_only:\n", - " if len(list(x.size())) > 3:\n", - " x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))\n", - " else:\n", - " if len(list(x.size())) > 1:\n", - " x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))\n", - " return x\n", - "\n", - "def cond_fn(x, t):\n", - " t=1000-t\n", - " 
t=t[0]\n", - " with torch.enable_grad():\n", - " global clamp_start_, clamp_max\n", - " x = x.detach()\n", - " x = x.requires_grad_()\n", - " x_in = model.decode_first_stage(x)\n", - " display_handler(x_in,t,1,False)\n", - " n = x_in.shape[0]\n", - " clip_guidance_scale = clip_guidance_index[t]\n", - " make_cutouts = {}\n", - " #rx_in_grad = torch.zeros_like(x_in)\n", - " for i in clip_list:\n", - " make_cutouts[i] = MakeCutouts(clip_size[i],\n", - " Overview= cut_overview[t], \n", - " InnerCrop = cut_innercut[t], \n", - " IC_Size_Pow=cut_ic_pow, IC_Grey_P = cut_icgray_p[t]\n", - " )\n", - " cutn = cut_overview[t]+cut_innercut[t]\n", - " for j in range(cutn_batches):\n", - " losses=0\n", - " for i in clip_list:\n", - " clip_in = clip_normalize[i](make_cutouts[i](x_in.add(1).div(2)).to(\"cuda\"))\n", - " image_embeds = clip_model[i].encode_image(clip_in).float().unsqueeze(0).expand([target_embeds[i].shape[0],-1,-1])\n", - " target_embeds_temp = target_embeds[i]\n", - " if i == 'ViT-B-32--openai' and experimental_aesthetic_embeddings:\n", - " aesthetic_embedding = torch.from_numpy(np.load(f'aesthetic-predictor/vit_b_32_embeddings/rating{experimental_aesthetic_embeddings_score}.npy')).to(device) \n", - " aesthetic_query = target_embeds_temp + aesthetic_embedding * experimental_aesthetic_embeddings_weight\n", - " target_embeds_temp = (aesthetic_query) / torch.linalg.norm(aesthetic_query)\n", - " if i == 'ViT-L-14--openai' and experimental_aesthetic_embeddings:\n", - " aesthetic_embedding = torch.from_numpy(np.load(f'aesthetic-predictor/vit_l_14_embeddings/rating{experimental_aesthetic_embeddings_score}.npy')).to(device) \n", - " aesthetic_query = target_embeds_temp + aesthetic_embedding * experimental_aesthetic_embeddings_weight\n", - " target_embeds_temp = (aesthetic_query) / torch.linalg.norm(aesthetic_query)\n", - " target_embeds_temp = target_embeds_temp.unsqueeze(1).expand([-1,cutn*n,-1]) \n", - " dists = spherical_dist_loss(image_embeds, target_embeds_temp)\n", - 
" dists = dists.mean(1).mul(weights[i].squeeze()).mean()\n", - " losses+=dists*clip_guidance_scale * (2 if i in [\"ViT-L-14-336--openai\", \"RN50x64--openai\", \"ViT-B-32--laion2b_e16\"] else (.4 if \"cloob\" in i else 1))\n", - " if i == \"ViT-L-14-336--openai\" and aes_scale !=0:\n", - " aes_loss = (aesthetic_model_336(F.normalize(image_embeds, dim=-1))).mean() \n", - " losses -= aes_loss * aes_scale \n", - " if i == \"ViT-L-14--openai\" and aes_scale !=0:\n", - " aes_loss = (aesthetic_model_224(F.normalize(image_embeds, dim=-1))).mean() \n", - " losses -= aes_loss * aes_scale \n", - " if i == \"ViT-B-16--openai\" and aes_scale !=0:\n", - " aes_loss = (aesthetic_model_16(F.normalize(image_embeds, dim=-1))).mean() \n", - " losses -= aes_loss * aes_scale \n", - " if i == \"ViT-B-32--openai\" and aes_scale !=0:\n", - " aes_loss = (aesthetic_model_32(F.normalize(image_embeds, dim=-1))).mean()\n", - " losses -= aes_loss * aes_scale\n", - " #x_in_grad += torch.autograd.grad(losses, x_in)[0] / cutn_batches / len(clip_list)\n", - " #losses += dists\n", - " #losses = losses / len(clip_list) \n", - " #gc.collect()\n", - " \n", - " tv_losses = tv_loss(x).sum() * tv_scales[0] +\\\n", - " tv_loss(F.interpolate(x, scale_factor= 1/2)).sum()* tv_scales[1] + \\\n", - " tv_loss(F.interpolate(x, scale_factor = 1/4)).sum()* tv_scales[2] + \\\n", - " tv_loss(F.interpolate(x, scale_factor = 1/8)).sum()* tv_scales[3] \n", - " range_scale= range_index[t]\n", - " range_losses = range_loss(x_in,RGB_min,RGB_max).sum() * range_scale\n", - " loss = tv_losses + range_losses + losses\n", - " #del losses\n", - " if symmetric_loss_scale != 0: loss += symmetric_loss(x_in) * symmetric_loss_scale\n", - " if init_image is not None and init_scale:\n", - " lpips_loss = (lpips_model(x_in, init) * init_scale).squeeze().mean()\n", - " #print(lpips_loss)\n", - " loss += lpips_loss\n", - " #loss_grad = torch.autograd.grad(loss, x_in, )[0]\n", - " #x_in_grad += loss_grad\n", - " #grad = 
-torch.autograd.grad(x_in, x, x_in_grad)[0]\n", - " loss.backward()\n", - " grad = -x.grad\n", - " grad = torch.nan_to_num(grad, nan=0.0, posinf=0, neginf=0)\n", - " if grad_center: grad = centralized_grad(grad, use_gc=True, gc_conv_only=False)\n", - " mag = grad.square().mean().sqrt()\n", - " if mag==0 or torch.isnan(mag):\n", - " print(\"ERROR\")\n", - " print(t)\n", - " return(grad)\n", - " if t>=0:\n", - " if active_function == \"softsign\":\n", - " grad = F.softsign(grad*grad_scale/mag)\n", - " if active_function == \"tanh\":\n", - " grad = (grad/mag*grad_scale).tanh()\n", - " if active_function==\"clamp\":\n", - " grad = grad.clamp(-mag*grad_scale*2,mag*grad_scale*2)\n", - " if grad.abs().max()>0:\n", - " grad=grad/grad.abs().max()*opt.mag_mul\n", - " magnitude = grad.square().mean().sqrt()\n", - " else:\n", - " return(grad)\n", - " clamp_max = clamp_index[t]\n", - " #print(magnitude, end = \"\\r\")\n", - " grad = grad* magnitude.clamp(max= clamp_max) /magnitude#0.2\n", - " grad = grad.detach()\n", - " return grad\n", - "\n", - "def null_fn(x_in):\n", - " return(torch.zeros_like(x_in))\n", - "\n", - "def display_handler(x,i,cadance = 5, decode = True):\n", - " global progress, image_grid, writer, img_tensor, im\n", - " img_tensor = x\n", - " if i%cadance==0:\n", - " if decode: \n", - " x = model.decode_first_stage(x)\n", - " grid = make_grid(torch.clamp((x+1.0)/2.0, min=0.0, max=1.0),round(x.shape[0]**0.5))\n", - " grid = 255. 
* rearrange(grid, 'c h w -> h w c').detach().cpu().numpy()\n", - " image_grid = grid.copy(order = \"C\") \n", - " with io.BytesIO() as output:\n", - " im = Image.fromarray(grid.astype(np.uint8))\n", - " im.save(output, format = \"PNG\")\n", - " progress.value = output.getvalue()\n", - " if generate_video:\n", - " im.save(p.stdin, 'PNG')\n", - "\n", - "\n", - " \n", - "def cond_clamp(image,t): \n", - " #if t >=0:\n", - " #mag=image.square().mean().sqrt()\n", - " #mag = (mag*cc).clamp(1.6,100)\n", - " image = image.clamp(-cc, cc)\n", - " image = torch.nan_to_num(image, nan=0.0, posinf=cc, neginf=-cc)\n", - " return(image)\n", - "\n", - "def make_schedule(t_start, t_end, step_size=1):\n", - " schedule = []\n", - " par_schedule = []\n", - " t = t_start\n", - " while t > t_end:\n", - " schedule.append(t)\n", - " t -= step_size\n", - " schedule.append(t_end)\n", - " return np.array(schedule)\n", - "\n", - "lpips_model = lpips.LPIPS(net='vgg').to(device)\n", - "\n", - "def list_mul_to_array(list_mul):\n", - " i = 0\n", - " mul_count = 0\n", - " mul_string = ''\n", - " full_list = list_mul\n", - " full_list_len = len(full_list)\n", - " for item in full_list:\n", - " if(i == 0):\n", - " last_item = item\n", - " if(item == last_item):\n", - " mul_count+=1\n", - " if(item != last_item or full_list_len == i+1):\n", - " mul_string = mul_string + f' [{last_item}]*{mul_count} +'\n", - " mul_count=1\n", - " last_item = item\n", - " i+=1\n", - " return(mul_string[1:-2])\n", - "\n", - "def generate_settings_file(add_prompts=False, add_dimensions=False):\n", - " \n", - " if(add_prompts):\n", - " prompts = f'''\n", - " clip_prompts = {clip_prompts}\n", - " latent_prompts = {latent_prompts}\n", - " latent_negatives = {latent_negatives}\n", - " image_prompts = {image_prompts}\n", - " '''\n", - " else:\n", - " prompts = ''\n", - "\n", - " if(add_dimensions):\n", - " dimensions = f'''width = {width}\n", - " height = {height}\n", - " '''\n", - " else:\n", - " dimensions = ''\n", - " 
settings = f'''\n", - " #This settings file can be loaded back to Latent Majesty Diffusion. If you like your setting consider sharing it to the settings library at https://github.com/multimodalart/MajestyDiffusion\n", - " [clip_list]\n", - " perceptors = {clip_load_list}\n", - " \n", - " [basic_settings]\n", - " #Perceptor things\n", - " {prompts}\n", - " {dimensions}\n", - " latent_diffusion_guidance_scale = {latent_diffusion_guidance_scale}\n", - " clip_guidance_scale = {clip_guidance_scale}\n", - " aesthetic_loss_scale = {aesthetic_loss_scale}\n", - " augment_cuts={augment_cuts}\n", - "\n", - " #Init image settings\n", - " starting_timestep = {starting_timestep}\n", - " init_scale = {init_scale} \n", - " init_brightness = {init_brightness}\n", - " init_noise = {init_noise}\n", - "\n", - " [advanced_settings]\n", - " #Add CLIP Guidance and all the flavors or just run normal Latent Diffusion\n", - " use_cond_fn = {use_cond_fn}\n", - "\n", - " #Custom schedules for cuts. Check out the schedules documentation here\n", - " custom_schedule_setting = {custom_schedule_setting}\n", - "\n", - " #Cut settings\n", - " clamp_index = {list_mul_to_array(clamp_index)}\n", - " cut_overview = {list_mul_to_array(cut_overview)}\n", - " cut_innercut = {list_mul_to_array(cut_innercut)}\n", - " cut_ic_pow = {cut_ic_pow}\n", - " cut_icgray_p = {list_mul_to_array(cut_icgray_p)}\n", - " cutn_batches = {cutn_batches}\n", - " range_index = {list_mul_to_array(range_index)}\n", - " active_function = \"{active_function}\"\n", - " tv_scales = {list_mul_to_array(tv_scales)}\n", - " latent_tv_loss = {latent_tv_loss}\n", - "\n", - " #If you uncomment this line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale will be used\n", - " clip_guidance_schedule = {list_mul_to_array(clip_guidance_index)}\n", - " \n", - " #Apply symmetric loss (force simmetry to your results)\n", - " symmetric_loss_scale = {symmetric_loss_scale} \n", - "\n", - " #Latent Diffusion Advanced Settings\n", - " #Use when latent upscale to correct satuation problem\n", - " scale_div = {scale_div}\n", - " #Magnify grad before clamping by how many times\n", - " opt_mag_mul = {opt_mag_mul}\n", - " opt_ddim_eta = {opt_ddim_eta}\n", - " opt_eta_end = {opt_eta_end}\n", - " opt_temperature = {opt_temperature}\n", - "\n", - " #Grad advanced settings\n", - " grad_center = {grad_center}\n", - " #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", - " grad_scale={grad_scale} \n", - "\n", - " #Init image advanced settings\n", - " init_rotate={init_rotate}\n", - " mask_rotate={mask_rotate}\n", - " init_magnitude = {init_magnitude}\n", - "\n", - " #More settings\n", - " RGB_min = {RGB_min}\n", - " RGB_max = {RGB_max}\n", - " #How to pad the image with cut_overview\n", - " padargs = {padargs} \n", - " flip_aug={flip_aug}\n", - " cc = {cc}\n", - " #Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", - " experimental_aesthetic_embeddings = {experimental_aesthetic_embeddings}\n", - " #How much you want this to influence your result\n", - " experimental_aesthetic_embeddings_weight = {experimental_aesthetic_embeddings_weight}\n", - " #9 are good aesthetic embeddings, 0 are bad ones\n", - " experimental_aesthetic_embeddings_score = {experimental_aesthetic_embeddings_score}\n", - " '''\n", - " return(settings)\n", - "\n", - "#Alstro's aesthetic model\n", - "aesthetic_model_336 = torch.nn.Linear(768,1).cuda()\n", - "aesthetic_model_336.load_state_dict(torch.load(f\"{model_path}/ava_vit_l_14_336_linear.pth\"))\n", - "\n", - "aesthetic_model_224 = torch.nn.Linear(768,1).cuda()\n", - 
"aesthetic_model_224.load_state_dict(torch.load(f\"{model_path}/ava_vit_l_14_linear.pth\"))\n", - "\n", - "aesthetic_model_16 = torch.nn.Linear(512,1).cuda()\n", - "aesthetic_model_16.load_state_dict(torch.load(f\"{model_path}/ava_vit_b_16_linear.pth\"))\n", - "\n", - "aesthetic_model_32 = torch.nn.Linear(512,1).cuda()\n", - "aesthetic_model_32.load_state_dict(torch.load(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"))\n", - "\n", - "from ldm.modules.diffusionmodules.util import noise_like\n", - "def do_run():\n", - " # with torch.cuda.amp.autocast():\n", - " global progress,target_embeds, weights, zero_embed, init, scale_factor\n", - " scale_factor = 1\n", - " make_cutouts = {}\n", - " for i in clip_list:\n", - " make_cutouts[i] = MakeCutouts(clip_size[i],Overview=1)\n", - " target_embeds, weights ,zero_embed = {}, {}, {}\n", - " for i in clip_list:\n", - " target_embeds[i] = []\n", - " weights[i]=[]\n", - "\n", - " for prompt in prompts:\n", - " txt, weight = parse_prompt(prompt)\n", - " for i in clip_list:\n", - " if \"cloob\" not in i:\n", - " with torch.cuda.amp.autocast():\n", - " embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device))\n", - " target_embeds[i].append(embeds)\n", - " weights[i].append(weight)\n", - " else:\n", - " embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device))\n", - " target_embeds[i].append(embeds)\n", - " weights[i].append(weight)\n", - "\n", - " for prompt in image_prompts:\n", - " print(f\"processing{prompt}\",end=\"\\r\")\n", - " path, weight = parse_prompt(prompt)\n", - " img = Image.open(fetch(path)).convert('RGB')\n", - " img = TF.resize(img, min(opt.W, opt.H, *img.size), transforms.InterpolationMode.LANCZOS)\n", - " for i in clip_list:\n", - " if \"cloob\" not in i:\n", - " with torch.cuda.amp.autocast():\n", - " batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device))\n", - " embed = clip_model[i].encode_image(clip_normalize[i](batch))\n", - " target_embeds[i].append(embed)\n", - " 
weights[i].extend([weight])\n", - " else:\n", - " batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device))\n", - " embed = clip_model[i].encode_image(clip_normalize[i](batch))\n", - " target_embeds[i].append(embed)\n", - " weights[i].extend([weight])\n", - " if anti_jpg != 0:\n", - " target_embeds[\"ViT-B-32--openai\"].append(torch.tensor([np.load(f\"{model_path}/openimages_512x_png_embed224.npz\")['arr_0']-np.load(f\"{model_path}/imagenet_512x_jpg_embed224.npz\")['arr_0']], device = device))\n", - " weights[\"ViT-B-32--openai\"].append(anti_jpg)\n", - "\n", - " for i in clip_list:\n", - " target_embeds[i] = torch.cat(target_embeds[i])\n", - " weights[i] = torch.tensor([weights[i]], device=device)\n", - " shape = [4, opt.H//8, opt.W//8]\n", - " init = None\n", - " mask = None\n", - " transform = T.GaussianBlur(kernel_size=3, sigma=0.4)\n", - " if init_image is not None:\n", - " init = Image.open(fetch(init_image)).convert('RGB')\n", - " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", - " if init_rotate: init = torch.rot90(init, 1, [3,2]) \n", - " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W])\n", - " init = init.mul(2).sub(1).half()\n", - " init_encoded = model.first_stage_model.encode(init).sample()* init_magnitude + init_brightness\n", - " init_encoded = init_encoded + noise_like(init_encoded.shape,device,False).mul(init_noise)\n", - " else:\n", - " init = None\n", - " init_encoded = None\n", - " if init_mask is not None:\n", - " mask = Image.open(fetch(init_mask)).convert('RGB')\n", - " mask = TF.to_tensor(mask).to(device).unsqueeze(0)\n", - " if mask_rotate: mask = torch.rot90(init, 1, [3,2]) \n", - " mask = resize(mask,out_shape = [opt.n_samples,1,opt.H//8, opt.W//8])\n", - " mask = transform(mask)\n", - " print(mask)\n", - "\n", - "\n", - " progress = widgets.Image(layout = widgets.Layout(max_width = \"400px\",max_height = \"512px\"))\n", - " display.display(progress)\n", - "\n", - " if opt.plms:\n", - " sampler = 
PLMSSampler(model)\n", - " else:\n", - " sampler = DDIMSampler(model)\n", - "\n", - " os.makedirs(opt.outdir, exist_ok=True)\n", - " outpath = opt.outdir\n", - "\n", - " prompt = opt.prompt\n", - " sample_path = os.path.join(outpath, \"samples\")\n", - " os.makedirs(sample_path, exist_ok=True)\n", - " base_count = len(os.listdir(sample_path))\n", - "\n", - " all_samples=list()\n", - " last_step_upscale = False\n", - " with torch.enable_grad():\n", - " with torch.cuda.amp.autocast():\n", - " with model.ema_scope():\n", - " uc = None\n", - " if opt.scale != 1.0:\n", - " uc = model.get_learned_conditioning(opt.n_samples * opt.uc).cuda()\n", - " \n", - " for n in trange(opt.n_iter, desc=\"Sampling\"):\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " c = model.get_learned_conditioning(opt.n_samples * prompt).cuda()\n", - " if init_encoded is None:\n", - " x_T = torch.randn([opt.n_samples,*shape], device=device)\n", - " else:\n", - " x_T = init_encoded\n", - " last_step_uspcale_list = []\n", - " \n", - " for custom_schedule in custom_schedules:\n", - " if type(custom_schedule) != type(\"\"):\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " last_step_upscale = False\n", - " samples_ddim, _ = sampler.sample(S=opt.ddim_steps,\n", - " conditioning=c,\n", - " batch_size=opt.n_samples,\n", - " shape=shape,\n", - " custom_schedule = custom_schedule,\n", - " verbose=False,\n", - " unconditional_guidance_scale=opt.scale,\n", - " unconditional_conditioning=uc,\n", - " eta=opt.ddim_eta,\n", - " eta_end = opt.eta_end,\n", - " img_callback=None if use_cond_fn else display_handler,\n", - " cond_fn=cond_fn, #if use_cond_fn else None,\n", - " temperature = opt.temperature,\n", - " x_adjust_fn=cond_clamp,\n", - " x_T = x_T,\n", - " x0=x_T,\n", - " mask=mask\n", - " )\n", - " x_T = samples_ddim.clamp(-6,6)\n", - " else:\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " method, scale_factor = custom_schedule.split(\":\")\n", - " scale_factor = 
float(scale_factor)\n", - " #clamp_index = np.array(clamp_index) * scale_factor\n", - " if method == \"latent\":\n", - " x_T = resize(samples_ddim, scale_factors=scale_factor, antialiasing=True)*scale_div\n", - " x_T += noise_like(x_T.shape,device,False)*init_noise\n", - " if method == \"gfpgan\":\n", - " last_step_upscale = True\n", - " temp_file_name = \"temp_\"+f\"{str(round(time.time()))}.png\"\n", - " temp_file = os.path.join(sample_path, temp_file_name)\n", - " im.save(temp_file, format = \"PNG\")\n", - " GFP_factor = 2 if scale_factor > 1 else 1\n", - " GFP_ver = 1.3 #if GFP_factor == 1 else 1.2\n", - " %cd GFPGAN\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " !python inference_gfpgan.py -i $temp_file -o results -v $GFP_ver -s $GFP_factor\n", - " %cd ..\n", - " face_corrected = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\"))\n", - " with io.BytesIO() as output:\n", - " face_corrected.save(output,format=\"PNG\")\n", - " progress.value = output.getvalue()\n", - " init = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\")).convert('RGB')\n", - " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", - " opt.H, opt.W = opt.H*scale_factor, opt.W*scale_factor\n", - " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W], antialiasing=True)\n", - " init = init.mul(2).sub(1).half()\n", - " x_T = (model.first_stage_model.encode(init).sample()*init_magnitude)\n", - " x_T += noise_like(x_T.shape,device,False)*init_noise\n", - " x_T = x_T.clamp(-6,6)\n", - "\n", - " #last_step_uspcale_list.append(last_step_upscale)\n", - " scale_factor = 1\n", - " current_time = str(round(time.time()))\n", - " if(last_step_upscale):\n", - " latest_upscale = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\")).convert('RGB')\n", - " latest_upscale.save(os.path.join(outpath, f'{current_time}.png'), format = \"PNG\")\n", - " else:\n", - " Image.fromarray(image_grid.astype(np.uint8)).save(os.path.join(outpath, 
f'{current_time}.png'), format = \"PNG\")\n", - " settings = generate_settings_file(add_prompts=True, add_dimensions=False)\n", - " text_file = open(f\"{outpath}/{current_time}.cfg\", \"w\")\n", - " text_file.write(settings)\n", - " text_file.close()\n", - " x_samples_ddim = model.decode_first_stage(samples_ddim)\n", - " x_samples_ddim = torch.clamp((x_samples_ddim+1.0)/2.0, min=0.0, max=1.0)\n", - " all_samples.append(x_samples_ddim)\n", - "\n", - "\n", - " if(len(all_samples) > 1):\n", - " # additionally, save as grid\n", - " grid = torch.stack(all_samples, 0)\n", - " grid = rearrange(grid, 'n b c h w -> (n b) c h w')\n", - " grid = make_grid(grid, nrow=opt.n_samples)\n", - "\n", - " # to image\n", - " grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()\n", - " Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid_{str(round(time.time()))}.png'))" + "# model.model = model.model.half().eval().to(device)\n", + "majesty.load_lpips_model()\n", + "majesty.load_aesthetic_model()" ] }, { @@ -1017,7 +337,7 @@ "#@markdown #### Open AI CLIP models\n", "ViT_B32 = False #@param {type:\"boolean\"}\n", "ViT_B16 = True #@param {type:\"boolean\"}\n", - "ViT_L14 = False #@param {type:\"boolean\"}\n", + "ViT_L14 = True #@param {type:\"boolean\"}\n", "ViT_L14_336px = False #@param {type:\"boolean\"}\n", "#RN101 = False #@param {type:\"boolean\"}\n", "#RN50 = False #@param {type:\"boolean\"}\n", @@ -1028,6 +348,7 @@ "#@markdown #### OpenCLIP models\n", "ViT_B16_plus = False #@param {type: \"boolean\"}\n", "ViT_B32_laion2b = True #@param {type: \"boolean\"}\n", + "ViT_L14_laion = False #@param {type:\"boolean\"}\n", "\n", "#@markdown #### Multilangual CLIP models \n", "clip_farsi = False #@param {type: \"boolean\"}\n", @@ -1037,8 +358,8 @@ "cloob_ViT_B16 = False #@param {type: \"boolean\"}\n", "\n", "# @markdown Load even more CLIP and CLIP-like models (from [Multi-Modal-Comparators](https://github.com/dmarx/Multi-Modal-Comparators))\n", - "model1 = \"\" 
# @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", - "model2 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - 
mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "model1 = \"\" # @param [\"[clip - mlfoundations - RN50--openai]\",\"[clip - mlfoundations - RN101--openai]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - 
RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "model2 = \"\" # @param [\"[clip - mlfoundations - RN50--openai]\",\"[clip - mlfoundations - RN101--openai]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - 
mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", "model3 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - 
mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", "\n", "if ViT_B32: \n", @@ -1053,6 +374,8 @@ " clip_load_list.append(\"[clip - mlfoundations - RN50x64--openai]\")\n", "if RN50x16: \n", " clip_load_list.append(\"[clip - mlfoundations - RN50x16--openai]\")\n", + "if ViT_L14_laion: \n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-L-14--laion400m_e32]\")\n", "if ViT_L14_336px:\n", " clip_load_list.append(\"[clip - mlfoundations - ViT-L-14-336--openai]\")\n", "if ViT_B16_plus:\n", @@ -1074,66 +397,7 @@ " clip_load_list.append(model3)\n", "\n", "\n", - "i = 0\n", - "from mmc.multimmc import MultiMMC\n", - "from mmc.modalities import TEXT, IMAGE\n", - "temp_perceptor = MultiMMC(TEXT, IMAGE)\n", - "\n", - "def get_mmc_models(clip_load_list):\n", - " mmc_models = []\n", - " for model_key in 
clip_load_list:\n", - " if not model_key:\n", - " continue\n", - " arch, pub, m_id = model_key[1:-1].split(' - ')\n", - " mmc_models.append({\n", - " 'architecture':arch,\n", - " 'publisher':pub,\n", - " 'id':m_id,\n", - " })\n", - " return mmc_models\n", - "mmc_models = get_mmc_models(clip_load_list)\n", - "\n", - "import mmc\n", - "from mmc.registry import REGISTRY\n", - "import mmc.loaders # force trigger model registrations\n", - "from mmc.mock.openai import MockOpenaiClip\n", - "\n", - "normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],\n", - " std=[0.26862954, 0.26130258, 0.27577711])\n", - "\n", - "\n", - "def load_clip_models(mmc_models):\n", - " clip_model, clip_size, clip_tokenize, clip_normalize= {},{},{},{}\n", - " clip_list = []\n", - " for item in mmc_models:\n", - " print(\"Loaded \", item[\"id\"])\n", - " clip_list.append(item[\"id\"])\n", - " model_loaders = REGISTRY.find(**item)\n", - " for model_loader in model_loaders:\n", - " clip_model_loaded = model_loader.load()\n", - " clip_model[item[\"id\"]] = MockOpenaiClip(clip_model_loaded)\n", - " clip_size[item[\"id\"]] = clip_model[item[\"id\"]].visual.input_resolution\n", - " clip_tokenize[item[\"id\"]] = clip_model[item[\"id\"]].preprocess_text()\n", - " if(item[\"architecture\"] == 'cloob'):\n", - " clip_normalize[item[\"id\"]] = clip_model[item[\"id\"]].normalize\n", - " else:\n", - " clip_normalize[item[\"id\"]] = normalize\n", - " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", - "\n", - "\n", - "def full_clip_load(clip_load_list):\n", - " torch.cuda.empty_cache()\n", - " gc.collect()\n", - " try:\n", - " del clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", - " except:\n", - " pass\n", - " mmc_models = get_mmc_models(clip_load_list)\n", - " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = load_clip_models(mmc_models)\n", - " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", - "\n", - 
"clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", - "\n", + "majesty.clip_load_list = clip_load_list\n", "torch.cuda.empty_cache()\n", "gc.collect()" ] @@ -1159,64 +423,78 @@ "opt = DotMap()\n", "\n", "#Change it to false to not use CLIP Guidance at all \n", - "use_cond_fn = True\n", + "majesty.use_cond_fn = True\n", "\n", "#Custom cut schedules and super-resolution. Check out the guide on how to use it a https://multimodal.art/majestydiffusion\n", - "custom_schedule_setting = [\n", - " [200,1000,8],\n", - " [50,200,5],\n", - " #\"gfpgan:1.5\",\n", - " #[50,200,5],\n", + "majesty.custom_schedule_setting = [\n", + " [50,1000,8],\n", + " \"gfpgan:1.5\",\n", + " [5,200,5],\n", "]\n", " \n", "#Cut settings\n", - "clamp_index = [1]*1000 \n", - "cut_overview = [8]*500 + [4]*500\n", - "cut_innercut = [0]*500 + [4]*500\n", - "cut_ic_pow = .1\n", - "cut_icgray_p = [.1]*300+[0]*1000\n", - "cutn_batches = 1\n", - "range_index = [0]*300 + [0]*1000 \n", - "active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", - "tv_scales = [1000]*1+[600]*3\n", - "latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", + "majesty.clamp_index = [2,1.4] #linear variation of the index for clamping the gradient \n", + "majesty.cut_overview = [8]*500 + [4]*500\n", + "majesty.cut_innercut = [0]*500 + [4]*500\n", + "majesty.cut_ic_pow = .2\n", + "majesty.cut_icgray_p = [.1]*300+[0]*1000\n", + "majesty.cutn_batches = 1\n", + "majesty.cut_blur_n = [0]*400 + [0]*600\n", + "majesty.cut_blur_kernel = 3\n", + "majesty.range_index = [0]*1000\n", + "majesty.active_function = \"softsign\" # function to manipulate the gradient - help things to stablize\n", + "majesty.ths_method = \"softsign\"\n", + "majesty.tv_scales = [600]*1+[50]*1 +[0]*2\n", + "majesty.latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation 
quality\n", "\n", "#If you uncomment next line you can schedule the CLIP guidance across the steps. Otherwise the clip_guidance_scale basic setting will be used\n", "#clip_guidance_schedule = [10000]*300 + [500]*700\n", "\n", - "symmetric_loss_scale = 0 #Apply symmetric loss\n", + "majesty.symmetric_loss_scale = 0 #Apply symmetric loss\n", "\n", "#Latent Diffusion Advanced Settings\n", - "scale_div = 0.5 # Use when latent upscale to correct satuation problem\n", - "opt_mag_mul = 10 #Magnify grad before clamping\n", + "majesty.scale_div = 1 # Use when latent upscale to correct satuation problem\n", + "majesty.opt_mag_mul = 15 #Magnify grad before clamping\n", "#PLMS Currently not working, working on a fix\n", "#opt.plms = False #Won;=t work with clip guidance\n", - "opt_ddim_eta, opt_eta_end = [1.4,1] # linear variation of eta\n", - "opt_temperature = .975 \n", + "majesty.opt_ddim_eta, majesty.opt_eta_end = [1.5,1.2] # linear variation of eta\n", + "majesty.opt_temperature = .95\n", "\n", "#Grad advanced settings\n", - "grad_center = False\n", - "grad_scale= 0.5 #5 Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", - "anti_jpg = 0 #not working\n", + "majesty.grad_center = False\n", + "majesty.grad_scale= 0.75 #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", + "\n", + "#Restraints the model from explodign despite larger clamp\n", + "majesty.score_modifier = True\n", + "majesty.threshold_percentile = .9\n", + "majesty.threshold = 1.2\n", + "majesty.var_index = [0]*1000\n", + "\n", "\n", "#Init image advanced settings\n", - "init_rotate, mask_rotate=[False, False]\n", - "init_magnitude = 0.15\n", + "majesty.init_rotate, majesty.mask_rotate=[False, False]\n", + "majesty.init_magnitude = 0.15\n", "\n", "#More settings\n", - "RGB_min, RGB_max = [-0.95,0.95]\n", - "padargs = {\"mode\":\"constant\", \"value\": -1} #How to pad the image with 
cut_overview\n", - "flip_aug=False\n", - "cc = 60\n", - "cutout_debug = False\n", - "opt.outdir = outputs_path\n", + "majesty.RGB_min, majesty.RGB_max = [-1,1]\n", + "majesty.padargs = {\"mode\":\"constant\", \"value\": -1} #How to pad the image with cut_overview\n", + "majesty.flip_aug=False\n", + "majesty.cutout_debug = False\n", + "majesty.opt.outdir = outputs_path\n", "\n", "#Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", - "experimental_aesthetic_embeddings = False\n", + "majesty.experimental_aesthetic_embeddings = True\n", "#How much you want this to influence your result\n", - "experimental_aesthetic_embeddings_weight = 0.5\n", + "majesty.experimental_aesthetic_embeddings_weight = 0.3\n", "#9 are good aesthetic embeddings, 0 are bad ones\n", - "experimental_aesthetic_embeddings_score = 9" + "majesty.experimental_aesthetic_embeddings_score = 8\n", + "\n", + "# For fun dont change except if you really know what your are doing\n", + "majesty.grad_blur = False\n", + "majesty.compress_steps = 0\n", + "majesty.compress_factor = 0.1\n", + "majesty.punish_steps = 0\n", + "majesty.punish_factor = 0.8" ] }, { @@ -1246,15 +524,15 @@ "source": [ "#Amp up your prompt game with prompt engineering, check out this guide: https://matthewmcateer.me/blog/clip-prompt-engineering/\n", "#Prompt for CLIP Guidance\n", - "clip_prompts = [\"portrait of a princess in sanctuary, hyperrealistic painting trending on artstation\"]\n", + "clip_prompts =[\"portrait of a Majestic Princess, trending on artstation\"] \n", "\n", "#Prompt for Latent Diffusion\n", - "latent_prompts = [\"portrait of a princess in sanctuary, hyperrealistic painting trending on artstation\"]\n", + "latent_prompts = [\"portrait of a Majestic Princess, trending on artstation\"] \n", "\n", "#Negative prompts for Latent Diffusion\n", - "latent_negatives = [\"low quality image\"]\n", + "latent_negatives = [\"\"]\n", "\n", - "image_prompts = []" + "majesty.image_prompts = []" ] }, { @@ 
-1279,113 +557,58 @@ "warnings.filterwarnings('ignore')\n", "#@markdown ### Basic settings \n", "#@markdown We're still figuring out default settings. Experiment and share your settings with us\n", - "width = 256#@param{type: 'integer'}\n", - "height = 256#@param{type: 'integer'}\n", - "latent_diffusion_guidance_scale = 2 #@param {type:\"number\"}\n", - "clip_guidance_scale = 5000 #@param{type: 'integer'}\n", - "how_many_batches = 1 #@param{type: 'integer'}\n", - "aesthetic_loss_scale = 200 #@param{type: 'integer'}\n", - "augment_cuts=True #@param{type:'boolean'}\n", + "majesty.width = 256#@param{type: 'integer'}\n", + "majesty.height = 256#@param{type: 'integer'}\n", + "majesty.latent_diffusion_guidance_scale = 15 #@param {type:\"number\"}\n", + "majesty.clip_guidance_scale = 5000#@param{type: 'integer'}\n", + "majesty.how_many_batches = 1 #@param{type: 'integer'}\n", + "majesty.aesthetic_loss_scale = 400 #@param{type: 'integer'}\n", + "majesty.augment_cuts=True #@param{type:'boolean'}\n", "\n", "#@markdown\n", "\n", "#@markdown ### Init image settings\n", "#@markdown `init_image` requires the path of an image to use as init to the model\n", - "init_image = None #@param{type: 'string'}\n", - "if(init_image == '' or init_image == 'None'):\n", - " init_image = None\n", + "majesty.init_image = None #@param{type: 'string'}\n", + "if(majesty.init_image == '' or majesty.init_image == 'None'):\n", + " majesty.init_image = None\n", "#@markdown `starting_timestep`: How much noise do you want to add to your init image for it to then be difused by the model\n", - "starting_timestep = 0.9 #@param{type: 'number'}\n", + "majesty.starting_timestep = 0.9 #@param{type: 'number'}\n", "#@markdown `init_mask` is a mask same width and height as the original image with the color black indicating where to inpaint\n", - "init_mask = None #@param{type: 'string'}\n", + "majesty.init_mask = None #@param{type: 'string'}\n", "#@markdown `init_scale` controls how much the init image should 
influence the final result. Experiment with values around `1000`\n", - "init_scale = 1000 #@param{type: 'integer'}\n", - "init_brightness = 0.0 #@param{type: 'number'}\n", + "majesty.init_scale = 1000 #@param{type: 'integer'}\n", + "majesty.init_brightness = 0.0 #@param{type: 'number'}\n", "#@markdown How much extra noise to add to the init image, independently from skipping timesteps (use it also if you are upscaling)\n", - "init_noise = 0.6 #@param{type: 'number'}\n", + "majesty.init_noise = 0.57 #@param{type: 'number'}\n", "\n", "#@markdown\n", "\n", "#@markdown ### Custom saved settings\n", "#@markdown If you choose custom saved settings, the settings set by the preset overrule some of your choices. You can still modify the settings not in the preset. Check what each preset modifies here\n", - "custom_settings = 'path/to/settings.cfg' #@param{type:'string'}\n", + "majesty.custom_settings = 'path/to/settings.cfg' #@param{type:'string'}\n", "settings_library = 'None (use settings defined above)' #@param [\"None (use settings defined above)\", \"default (optimized for colab free)\", \"dango233_princesses\", \"the_other_zippy_defaults\", \"makeitrad_defaults\"]\n", "if(settings_library != 'None (use settings defined above)'):\n", " if(settings_library == 'default (optimized for colab free)'):\n", " custom_settings = f'majesty-diffusion/latent_settings_library/default.cfg'\n", " else:\n", " custom_settings = f'majesty-diffusion/latent_settings_library/{settings_library}.cfg'\n", - "\n", - "global_var_scope = globals()\n", - "if(custom_settings is not None and custom_settings != '' and custom_settings != 'path/to/settings.cfg'):\n", - " print('Loaded ', custom_settings)\n", - " try:\n", - " from configparser import ConfigParser\n", - " except ImportError:\n", - " from ConfigParser import ConfigParser\n", - " import configparser\n", - " \n", - " config = ConfigParser()\n", - " config.read(custom_settings)\n", - " #custom_settings_stream = fetch(custom_settings)\n", - 
" #Load CLIP models from config\n", - " if(config.has_section('clip_list')):\n", - " clip_incoming_list = config.items('clip_list')\n", - " clip_incoming_models = clip_incoming_list[0]\n", - " incoming_perceptors = eval(clip_incoming_models[1])\n", - " if((len(incoming_perceptors) != len(clip_load_list)) or not all(elem in incoming_perceptors for elem in clip_load_list)):\n", - " clip_load_list = incoming_perceptors\n", - " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", - "\n", - " #Load settings from config and replace variables\n", - " if(config.has_section('basic_settings')):\n", - " basic_settings = config.items('basic_settings')\n", - " for basic_setting in basic_settings:\n", - " global_var_scope[basic_setting[0]] = eval(basic_setting[1])\n", - " \n", - " if(config.has_section('advanced_settings')):\n", - " advanced_settings = config.items('advanced_settings')\n", - " for advanced_setting in advanced_settings:\n", - " global_var_scope[advanced_setting[0]] = eval(advanced_setting[1])\n", - "\n", - "if(((init_image is not None) and (init_image != 'None') and (init_image != '')) and starting_timestep != 1 and custom_schedule_setting[0][1] == 1000):\n", - " custom_schedule_setting[0] = [custom_schedule_setting[0][0], int(custom_schedule_setting[0][1]*starting_timestep), custom_schedule_setting[0][2]]\n", - "\n", - "prompts = clip_prompts\n", - "opt.prompt = latent_prompts\n", - "opt.uc = latent_negatives\n", - "custom_schedules = set_custom_schedules(custom_schedule_setting)\n", - "aes_scale = aesthetic_loss_scale\n", - "try: \n", - " clip_guidance_schedule\n", - " clip_guidance_index = clip_guidance_schedule\n", - "except:\n", - " clip_guidance_index = [clip_guidance_scale]*1000\n", - "\n", - "opt.W = (width//64)*64;\n", - "opt.H = (height//64)*64;\n", - "if opt.W != width or opt.H != height:\n", - " print(f'Changing output size to {opt.W}x{opt.H}. 
Dimensions must by multiples of 64.')\n", - "\n", - "opt.mag_mul = opt_mag_mul \n", - "opt.ddim_eta = opt_ddim_eta\n", - "opt.eta_end = opt_eta_end\n", - "opt.temperature = opt_temperature\n", - "opt.n_iter = how_many_batches\n", - "opt.n_samples = 1\n", - "#opt.W, opt.H = [width,height]\n", - "opt.scale = latent_diffusion_guidance_scale\n", - "aug = augment_cuts\n", + "majesty.load_custom_settings()\n", + "majesty.full_clip_load()\n", + "majesty.config_init_image()\n", + "\n", + "majesty.prompts = clip_prompts\n", + "majesty.opt.prompt = latent_prompts\n", + "majesty.opt.uc = latent_negatives\n", + "majesty.set_custom_schedules()\n", + "majesty.config_clip_guidance()\n", + "majesty.config_output_size()\n", + "majesty.config_options()\n", "\n", "torch.cuda.empty_cache()\n", "gc.collect()\n", - "generate_video = False\n", - "if generate_video: \n", - " fps = 24\n", - " p = Popen(['ffmpeg', '-y', '-f', 'image2pipe', '-vcodec', 'png', '-r', str(fps), '-i', '-', '-vcodec', 'libx264', '-r', str(fps), '-pix_fmt', 'yuv420p', '-crf', '17', '-preset', 'veryslow', 'video.mp4'], stdin=PIPE)\n", - "do_run()\n", - "if generate_video: \n", - " p.stdin.close()" + "majesty.generate_video = False\n", + "majesty.do_run()\n" ] }, { @@ -1411,7 +634,7 @@ "#@markdown If you would like to save your current settings, uncheck `skip_saving` and run this cell. You will get a `custom_settings.cfg` file you can reuse and share. 
If you like your results, send us a pull request to add your settings to the selectable library\n", "skip_saving = True #@param{type:'boolean'}\n", "if(not skip_saving):\n", - " data = generate_settings_file(add_prompts=False, add_dimensions=True)\n", + " data = majesty.generate_settings_file(add_prompts=False, add_dimensions=True)\n", " text_file = open(\"custom_settings.cfg\", \"w\")\n", " text_file.write(data)\n", " text_file.close()\n", @@ -1443,7 +666,7 @@ "xEVSOJ4f0B21" ], "machine_shape": "hm", - "name": "Latent Majesty Diffusion v1.3", + "name": "Latent Majesty Diffusion v1.4", "private_outputs": true, "provenance": [] }, diff --git a/latent.py b/latent.py new file mode 100644 index 0000000..e053fd2 --- /dev/null +++ b/latent.py @@ -0,0 +1,318 @@ +import argparse, sys +import torch +from omegaconf import OmegaConf +from subprocess import Popen, PIPE +import gc + +import torch +import json +import majesty as majesty + + +def main(argv): + + custom_settings = None + + parser = argparse.ArgumentParser( + description="Generate images from text with majesty" + ) + parser.add_argument( + "-p", + "--clip_prompts", + type=str, + help="CLIP prompts", + default=[ + "portrait of a princess in sanctuary, hyperrealistic painting trending on artstation" + ], + dest="clip_prompts", + ) + parser.add_argument( + "--latent_prompts", + type=str, + help="Latent prompts", + default=None, + dest="latent_prompts", + ) + parser.add_argument( + "--latent_negatives", + type=str, + help="Negative prompts", + default=["low quality image"], + dest="latent_negatives", + ) + parser.add_argument( + "--image_prompts", + type=str, + help="Image prompts", + default=[], + dest="image_prompts", + ) + parser.add_argument( + "-m", + "--model_path", + type=str, + help="Model path", + default="models", + dest="model_path", + ) + parser.add_argument( + "--model_source", + type=str, + help="Source URL prefix for a local HTTP server with model downloads to use instead of authoritative URLs (useful in 
ephemeral stups)", + default=None, + dest="model_source", + ) + parser.add_argument( + "-o", + "--outputs_path", + type=str, + help="Outputs path", + default="outputs", + dest="outputs_path", + ) + parser.add_argument( + "-c", + "--custom_settings", + type=str, + help="Custom settings file", + default=None, + dest="custom_settings", + ) + parser.add_argument( + "-W", "--width", type=int, help="Output width", default=256, dest="width" + ) + parser.add_argument( + "-H", "--height", type=int, help="Output height", default=256, dest="height" + ) + parser.add_argument( + "-ls", + "--latent_scale", + type=float, + help="Latent diffusion guidance scale", + default=2, + dest="latent_diffusion_guidance_scale", + ) + parser.add_argument( + "-cs", + "--clip_scale", + type=int, + help="CLIP guidance scale", + default=5000, + dest="clip_guidance_scale", + ) + parser.add_argument( + "-b", + "--batches", + type=int, + help="Number of batches", + default=1, + dest="how_many_batches", + ) + parser.add_argument( + "--aesthetic_loss_scale", + type=int, + help="Aesthetic loss scale", + default=200, + dest="aesthetic_loss_scale", + ) + parser.add_argument( + "--disable_augment_cuts", + help="Disable Augment cuts", + dest="augment_cuts", + action="store_false", + ) + parser.add_argument( + "-ns", + "--n_samples", + type=int, + help="Number of samples", + default=1, + dest="n_samples", + ) + parser.add_argument( + "--init_image", + type=str, + help="Initial image", + default=None, + dest="init_image", + ) + parser.add_argument( + "--starting_timestep", + type=float, + help="Starting timestep", + default=0.9, + dest="starting_timestep", + ) + parser.add_argument( + "--init_mask", + type=str, + help="A mask same width and height as the original image with the color black indicating where to inpaint", + default=None, + dest="init_mask", + ) + parser.add_argument( + "--init_scale", + type=int, + help="Controls how much the init image should influence the final result. 
Experiment with values around 1000", + default=1000, + dest="init_scale", + ) + parser.add_argument( + "--init_brightness", + type=float, + help="Init image brightness", + default=0.0, + dest="init_brightness", + ) + parser.add_argument( + "--init_noise", + type=float, + help="How much extra noise to add to the init image, independently from skipping timesteps (use it also if you are upscaling)", + default=0.6, + dest="init_noise", + ) + parser.add_argument( + "--enable_aesthetic_embeddings", + help="Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14", + dest="experimental_aesthetic_embeddings", + action="store_true", + ) + parser.add_argument( + "--aesthetic_embeddings_weight", + help="How much you want experimental aesthetic embeddings to influence your result", + type=float, + default=0.5, + dest="experimental_aesthetic_embeddings_weight", + ) + parser.add_argument( + "--aesthetic_embeddings_score", + help="9 are good aesthetic embeddings, 0 are bad ones", + type=int, + default=9, + dest="experimental_aesthetic_embeddings_score", + ) + + args = parser.parse_args() + majesty.use_args(args) + + majesty.download_models() + + torch.backends.cudnn.benchmark = True + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + majesty.device = device + + latent_diffusion_model = "finetuned" + config = OmegaConf.load( + "./latent-diffusion/configs/latent-diffusion/txt2img-1p4B-eval.yaml" + ) # TODO: Optionally download from same location as ckpt and chnage this logic + model = majesty.load_model_from_config( + config, + f"{majesty.model_path}/latent_diffusion_txt2img_f8_large.ckpt", + False, + latent_diffusion_model, + ) # TODO: check path + majesty.model = model.half().eval().to(device) + # if(latent_diffusion_model == "finetuned"): + # model.model = model.model.half().eval().to(device) + + majesty.load_lpips_model() + # Alstro's aesthetic model + majesty.load_aesthetic_model() + + clip_load_list = [] + # @markdown 
#### Open AI CLIP models + ViT_B32 = False # @param {type:"boolean"} + ViT_B16 = True # @param {type:"boolean"} + ViT_L14 = False # @param {type:"boolean"} + ViT_L14_336px = False # @param {type:"boolean"} + # RN101 = False #@param {type:"boolean"} + # RN50 = False #@param {type:"boolean"} + RN50x4 = False # @param {type:"boolean"} + RN50x16 = False # @param {type:"boolean"} + RN50x64 = False # @param {type:"boolean"} + + # @markdown #### OpenCLIP models + ViT_B16_plus = False # @param {type: "boolean"} + ViT_B32_laion2b = True # @param {type: "boolean"} + + # @markdown #### Multilangual CLIP models + clip_farsi = False # @param {type: "boolean"} + clip_korean = False # @param {type: "boolean"} + + # @markdown #### CLOOB models + cloob_ViT_B16 = False # @param {type: "boolean"} + + # @markdown Load even more CLIP and CLIP-like models (from [Multi-Modal-Comparators](https://github.com/dmarx/Multi-Modal-Comparators)) + model1 = "" # @param ["[clip - openai - RN50]","[clip - openai - RN101]","[clip - mlfoundations - RN50--yfcc15m]","[clip - mlfoundations - RN50--cc12m]","[clip - mlfoundations - RN50-quickgelu--yfcc15m]","[clip - mlfoundations - RN50-quickgelu--cc12m]","[clip - mlfoundations - RN101--yfcc15m]","[clip - mlfoundations - RN101-quickgelu--yfcc15m]","[clip - mlfoundations - ViT-B-32--laion400m_e31]","[clip - mlfoundations - ViT-B-32--laion400m_e32]","[clip - mlfoundations - ViT-B-32--laion400m_avg]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]","[clip - mlfoundations - ViT-B-16--laion400m_e31]","[clip - mlfoundations - ViT-B-16--laion400m_e32]","[clip - sbert - ViT-B-32-multilingual-v1]","[clip - facebookresearch - clip_small_25ep]","[simclr - facebookresearch - simclr_small_25ep]","[slip - facebookresearch - slip_small_25ep]","[slip - facebookresearch - slip_small_50ep]","[slip - facebookresearch - 
slip_small_100ep]","[clip - facebookresearch - clip_base_25ep]","[simclr - facebookresearch - simclr_base_25ep]","[slip - facebookresearch - slip_base_25ep]","[slip - facebookresearch - slip_base_50ep]","[slip - facebookresearch - slip_base_100ep]","[clip - facebookresearch - clip_large_25ep]","[simclr - facebookresearch - simclr_large_25ep]","[slip - facebookresearch - slip_large_25ep]","[slip - facebookresearch - slip_large_50ep]","[slip - facebookresearch - slip_large_100ep]","[clip - facebookresearch - clip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc12m_35ep]","[clip - facebookresearch - clip_base_cc12m_35ep]"] {allow-input: true} + model2 = "" # @param ["[clip - openai - RN50]","[clip - openai - RN101]","[clip - mlfoundations - RN50--yfcc15m]","[clip - mlfoundations - RN50--cc12m]","[clip - mlfoundations - RN50-quickgelu--yfcc15m]","[clip - mlfoundations - RN50-quickgelu--cc12m]","[clip - mlfoundations - RN101--yfcc15m]","[clip - mlfoundations - RN101-quickgelu--yfcc15m]","[clip - mlfoundations - ViT-B-32--laion400m_e31]","[clip - mlfoundations - ViT-B-32--laion400m_e32]","[clip - mlfoundations - ViT-B-32--laion400m_avg]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]","[clip - mlfoundations - ViT-B-16--laion400m_e31]","[clip - mlfoundations - ViT-B-16--laion400m_e32]","[clip - sbert - ViT-B-32-multilingual-v1]","[clip - facebookresearch - clip_small_25ep]","[simclr - facebookresearch - simclr_small_25ep]","[slip - facebookresearch - slip_small_25ep]","[slip - facebookresearch - slip_small_50ep]","[slip - facebookresearch - slip_small_100ep]","[clip - facebookresearch - clip_base_25ep]","[simclr - facebookresearch - simclr_base_25ep]","[slip - facebookresearch - slip_base_25ep]","[slip - facebookresearch - slip_base_50ep]","[slip - facebookresearch - 
slip_base_100ep]","[clip - facebookresearch - clip_large_25ep]","[simclr - facebookresearch - simclr_large_25ep]","[slip - facebookresearch - slip_large_25ep]","[slip - facebookresearch - slip_large_50ep]","[slip - facebookresearch - slip_large_100ep]","[clip - facebookresearch - clip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc12m_35ep]","[clip - facebookresearch - clip_base_cc12m_35ep]"] {allow-input: true} + model3 = "" # @param ["[clip - openai - RN50]","[clip - openai - RN101]","[clip - mlfoundations - RN50--yfcc15m]","[clip - mlfoundations - RN50--cc12m]","[clip - mlfoundations - RN50-quickgelu--yfcc15m]","[clip - mlfoundations - RN50-quickgelu--cc12m]","[clip - mlfoundations - RN101--yfcc15m]","[clip - mlfoundations - RN101-quickgelu--yfcc15m]","[clip - mlfoundations - ViT-B-32--laion400m_e31]","[clip - mlfoundations - ViT-B-32--laion400m_e32]","[clip - mlfoundations - ViT-B-32--laion400m_avg]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]","[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]","[clip - mlfoundations - ViT-B-16--laion400m_e31]","[clip - mlfoundations - ViT-B-16--laion400m_e32]","[clip - sbert - ViT-B-32-multilingual-v1]","[clip - facebookresearch - clip_small_25ep]","[simclr - facebookresearch - simclr_small_25ep]","[slip - facebookresearch - slip_small_25ep]","[slip - facebookresearch - slip_small_50ep]","[slip - facebookresearch - slip_small_100ep]","[clip - facebookresearch - clip_base_25ep]","[simclr - facebookresearch - simclr_base_25ep]","[slip - facebookresearch - slip_base_25ep]","[slip - facebookresearch - slip_base_50ep]","[slip - facebookresearch - slip_base_100ep]","[clip - facebookresearch - clip_large_25ep]","[simclr - facebookresearch - simclr_large_25ep]","[slip - facebookresearch - slip_large_25ep]","[slip - facebookresearch - slip_large_50ep]","[slip - facebookresearch - 
slip_large_100ep]","[clip - facebookresearch - clip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc3m_40ep]","[slip - facebookresearch - slip_base_cc12m_35ep]","[clip - facebookresearch - clip_base_cc12m_35ep]"] {allow-input: true} + + if ViT_B32: + clip_load_list.append("[clip - mlfoundations - ViT-B-32--openai]") + if ViT_B16: + clip_load_list.append("[clip - mlfoundations - ViT-B-16--openai]") + if ViT_L14: + clip_load_list.append("[clip - mlfoundations - ViT-L-14--openai]") + if RN50x4: + clip_load_list.append("[clip - mlfoundations - RN50x4--openai]") + if RN50x64: + clip_load_list.append("[clip - mlfoundations - RN50x64--openai]") + if RN50x16: + clip_load_list.append("[clip - mlfoundations - RN50x16--openai]") + if ViT_L14_336px: + clip_load_list.append("[clip - mlfoundations - ViT-L-14-336--openai]") + if ViT_B16_plus: + clip_load_list.append( + "[clip - mlfoundations - ViT-B-16-plus-240--laion400m_e32]" + ) + if ViT_B32_laion2b: + clip_load_list.append("[clip - mlfoundations - ViT-B-32--laion2b_e16]") + if clip_farsi: + clip_load_list.append("[clip - sajjjadayobi - clipfa]") + if clip_korean: + clip_load_list.append("[clip - navervision - kelip_ViT-B/32]") + if cloob_ViT_B16: + clip_load_list.append( + "[cloob - crowsonkb - cloob_laion_400m_vit_b_16_32_epochs]" + ) + + if model1: + clip_load_list.append(model1) + if model2: + clip_load_list.append(model2) + if model3: + clip_load_list.append(model3) + + torch.cuda.empty_cache() + gc.collect() + + majesty.opt.outdir = majesty.outputs_path + + majesty.clip_load_list = clip_load_list + + majesty.load_custom_settings() + + majesty.full_clip_load() + + majesty.config_init_image() + + majesty.prompts = majesty.clip_prompts + if majesty.latent_prompts == [] or majesty.latent_prompts == None: + majesty.opt.prompt = majesty.prompts + else: + majesty.opt.prompt = majesty.latent_prompts + majesty.opt.uc = majesty.latent_negatives + majesty.set_custom_schedules() + + majesty.config_clip_guidance() + 
majesty.config_output_size() + majesty.config_options() + + torch.cuda.empty_cache() + gc.collect() + + majesty.do_run() + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/majesty.py b/majesty.py new file mode 100644 index 0000000..572b279 --- /dev/null +++ b/majesty.py @@ -0,0 +1,1374 @@ +import argparse, os, sys, glob +import shutil +import torch +import numpy as np +from omegaconf import OmegaConf +from PIL import Image +from tqdm.auto import tqdm, trange + +tqdm_auto_model = __import__("tqdm.auto", fromlist="") +sys.modules["tqdm"] = tqdm_auto_model +from einops import rearrange +from torchvision.utils import make_grid +import transformers +import gc + +sys.path.append("./latent-diffusion") +from ldm.util import instantiate_from_config +from ldm.models.diffusion.ddim import DDIMSampler +from ldm.models.diffusion.plms import PLMSSampler +from ldm.modules.diffusionmodules.util import noise_like +import tensorflow as tf +from dotmap import DotMap +import ipywidgets as widgets +from math import pi + +from resize_right import resize + +import subprocess +from subprocess import Popen, PIPE + +from dataclasses import dataclass +from functools import partial +import gc +import io +import math +import sys +import random +from piq import brisque +from itertools import product +from IPython import display +import lpips +from PIL import Image, ImageOps +import requests +import torch +from torch import nn +from torch.nn import functional as F +from torchvision import models +from torchvision import transforms +from torchvision import transforms as T +from torchvision.transforms import functional as TF +from numpy import nan +from threading import Thread +import time +import json +import warnings + +import mmc +from mmc.registry import REGISTRY +import mmc.loaders # force trigger model registrations +from mmc.mock.openai import MockOpenaiClip + +model_path = "models" +outputs_path = "results" +device = None +opt = DotMap() + +# Change it to false to not use 
CLIP Guidance at all +use_cond_fn = True + +# Custom cut schedules and super-resolution. Check out the guide on how to use it at https://multimodal.art/majestydiffusion +custom_schedule_setting = [ + [50, 1000, 8], + "gfpgan:1.5", + [5, 200, 5], + # "gfpgan:1.5", + # [50,200,5], +] + +# Cut settings +clamp_index = [2, 1.4] # linear variation of the index for clamping the gradient +cut_overview = [8] * 500 + [4] * 500 +cut_innercut = [0] * 500 + [4] * 500 +cut_ic_pow = 0.2 +cut_icgray_p = [0.1] * 300 + [0] * 1000 +cutn_batches = 1 +cut_blur_n = [0] * 400 + [0] * 600 +cut_blur_kernel = 3 +range_index = [0] * 1000 +active_function = ( + "softsign" # function to manipulate the gradient - helps things stabilize +) +ths_method = "softsign" +tv_scales = [600] * 1 + [50] * 1 + [0] * 2 +latent_tv_loss = True # Applies the TV Loss in the Latent space instead of pixel, improves generation quality + +# If you uncomment the next line you can schedule the CLIP guidance across the steps. Otherwise the clip_guidance_scale basic setting will be used +# clip_guidance_schedule = [10000]*300 + [500]*700 + +symmetric_loss_scale = 0 # Apply symmetric loss + +# Latent Diffusion Advanced Settings +scale_div = 1 # Use when latent upscale to correct saturation problem +opt_mag_mul = 15 # Magnify grad before clamping +# PLMS Currently not working, working on a fix +# opt.plms = False #Won't work with clip guidance +opt_ddim_eta, opt_eta_end = [1.5, 1.2] # linear variation of eta +opt_temperature = 0.95 + +# Grad advanced settings +grad_center = False +grad_scale = 0.75 # Lower values result in more coherent and detailed results, higher values make it focus on the more dominant concept + +# Restrains the model from exploding despite larger clamp +score_modifier = True +threshold_percentile = 0.9 +threshold = 1.2 +var_index = [0] * 1000 + +# Init image advanced settings +init_rotate, mask_rotate = [False, False] +init_magnitude = 0.15 + +# More settings +RGB_min, RGB_max = [-1, 1] +padargs = {"mode": 
"constant", "value": -1} # How to pad the image with cut_overview +flip_aug = False +cutout_debug = False + +# Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14 +experimental_aesthetic_embeddings = True +# How much you want this to influence your result +experimental_aesthetic_embeddings_weight = 0.3 +# 9 are good aesthetic embeddings, 0 are bad ones +experimental_aesthetic_embeddings_score = 8 + +# For fun; don't change unless you really know what you are doing +grad_blur = False +compress_steps = 0 +compress_factor = 0.1 +punish_steps = 0 +punish_factor = 0.8 + +# Amp up your prompt game with prompt engineering, check out this guide: https://matthewmcateer.me/blog/clip-prompt-engineering/ +# Prompt for CLIP Guidance +clip_prompts = ["portrait of a Majestic Princess, trending on artstation"] + +# Prompt for Latent Diffusion +latent_prompts = ["portrait of a Majestic Princess, trending on artstation"] + +# Negative prompts for Latent Diffusion +latent_negatives = [""] + +image_prompts = [] + +width = 256 +height = 256 +latent_diffusion_guidance_scale = 15 +clip_guidance_scale = 5000 +how_many_batches = 1 +aesthetic_loss_scale = 400 +augment_cuts = True +n_samples = 1 + +init_image = None +starting_timestep = 0.9 +init_mask = None +init_scale = 1000 +init_brightness = 0.0 +init_noise = 0.57 + +normalize = transforms.Normalize( + mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711] +) + +# Globals +custom_settings = None +generate_video = False +model = {} +aes_scale = None +aug = None + +clip_model, clip_size, clip_tokenize, clip_normalize = {}, {}, {}, {} +clip_list, clip_load_list, clip_guidance_index = [], [], [] + + +aesthetic_model_336, aesthetic_model_224, aesthetic_model_16, aesthetic_model_32 = ( + {}, + {}, + {}, + {}, +) +custom_schedules = [] + +progress = None +image_grid, writer, img_tensor, im = {}, {}, {}, {} +target_embeds, weights, zero_embed, init = {}, {}, {}, {} +make_cutouts = {} 
+scale_factor = 1 +clamp_start_, clamp_max = None, None +clip_guidance_schedule = None +prompts = [] +mmc_models = [] +last_step_uspcale_list = [] + +has_purged = False + +# Used to override download locations, allows rehosting models in a bucket for ephemeral servers to download +model_source = None + + +def download_models(): + # download models as needed + models = [ + [ + "latent_diffusion_txt2img_f8_large.ckpt", + "https://ommer-lab.com/files/latent-diffusion/nitro/txt2img-f8-large/model.ckpt", + ], + [ + "finetuned_state_dict.pt", + "https://huggingface.co/multimodalart/compvis-latent-diffusion-text2img-large/resolve/main/finetuned_state_dict.pt", + ], + [ + "ava_vit_l_14_336_linear.pth", + "https://multimodal.art/models/ava_vit_l_14_336_linear.pth", + ], + [ + "sa_0_4_vit_l_14_linear.pth", + "https://multimodal.art/models/sa_0_4_vit_l_14_linear.pth", + ], + [ + "ava_vit_l_14_linear.pth", + "https://multimodal.art/models/ava_vit_l_14_linear.pth", + ], + [ + "ava_vit_b_16_linear.pth", + "http://batbot.tv/ai/models/v-diffusion/ava_vit_b_16_linear.pth", + ], + [ + "sa_0_4_vit_b_16_linear.pth", + "https://multimodal.art/models/sa_0_4_vit_b_16_linear.pth", + ], + [ + "sa_0_4_vit_b_32_linear.pth", + "https://multimodal.art/models/sa_0_4_vit_b_32_linear.pth", + ], + [ + "openimages_512x_png_embed224.npz", + "https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/openimages_512x_png_embed224.npz", + ], + [ + "imagenet_512x_jpg_embed224.npz", + "https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/imagenet_512x_jpg_embed224.npz", + ], + [ + "GFPGANv1.3.pth", + "https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth", + ], + ] + + if not os.path.exists(model_path): + os.makedirs(model_path) + + for item in models: + model_file = f"{model_path}/{item[0]}" + if not os.path.exists(model_file): + if model_source: + url = f"{model_source}/{item[0]}" + else: + 
url = item[1] + print(f"Downloading {url}") + subprocess.call( + ["wget", "-nv", "-O", model_file, url, "--no-check-certificate"], + shell=False, + ) + if not os.path.exists("GFPGAN/experiments/pretrained_models/GFPGANv1.3.pth"): + shutil.copyfile( + f"{model_path}/GFPGANv1.3.pth", + "GFPGAN/experiments/pretrained_models/GFPGANv1.3.pth", + ) + + +def load_model_from_config( + config, ckpt, verbose=False, latent_diffusion_model="original" +): + print(f"Loading model from {ckpt}") + print(latent_diffusion_model) + model = instantiate_from_config(config.model) + sd = torch.load(ckpt, map_location="cuda")["state_dict"] + m, u = model.load_state_dict(sd, strict=False) + if latent_diffusion_model == "finetuned": + del sd + sd_finetune = torch.load( + f"{model_path}/finetuned_state_dict.pt", map_location="cuda" + ) + m, u = model.model.load_state_dict(sd_finetune, strict=False) + model.model = model.model.half().eval().to(device) + del sd_finetune + # sd = pl_sd["state_dict"] + + if len(m) > 0 and verbose: + print("missing keys:") + print(m) + if len(u) > 0 and verbose: + print("unexpected keys:") + print(u) + + model.requires_grad_(False).half().eval().to("cuda") + return model + + +def get_mmc_models(): + global mmc_models + mmc_models = [] + for model_key in clip_load_list: + if not model_key: + continue + arch, pub, m_id = model_key[1:-1].split(" - ") + mmc_models.append( + { + "architecture": arch, + "publisher": pub, + "id": m_id, + } + ) + + +def set_custom_schedules(): + global custom_schedules + custom_schedules = [] + for schedule_item in custom_schedule_setting: + if isinstance(schedule_item, list): + custom_schedules.append(np.arange(*schedule_item)) + else: + custom_schedules.append(schedule_item) + + +def parse_prompt(prompt): + if ( + prompt.startswith("http://") + or prompt.startswith("https://") + or prompt.startswith("E:") + or prompt.startswith("C:") + or prompt.startswith("D:") + ): + vals = prompt.rsplit(":", 2) + vals = [vals[0] + ":" + vals[1], 
*vals[2:]] + else: + vals = prompt.rsplit(":", 1) + vals = vals + ["", "1"][len(vals) :] + return vals[0], float(vals[1]) + + +class MakeCutouts(nn.Module): + def __init__( + self, + cut_size, + Overview=4, + WholeCrop=0, + WC_Allowance=10, + WC_Grey_P=0.2, + InnerCrop=0, + IC_Size_Pow=0.5, + IC_Grey_P=0.2, + cut_blur_n=0, + ): + super().__init__() + self.cut_size = cut_size + self.Overview = Overview + self.WholeCrop = WholeCrop + self.WC_Allowance = WC_Allowance + self.WC_Grey_P = WC_Grey_P + self.InnerCrop = InnerCrop + self.IC_Size_Pow = IC_Size_Pow + self.IC_Grey_P = IC_Grey_P + self.cut_blur_n = cut_blur_n + self.augs = T.Compose( + [ + # T.RandomHorizontalFlip(p=0.5), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomAffine( + degrees=0, + translate=(0.05, 0.05), + # scale=(0.9,0.95), + fill=-1, + interpolation=T.InterpolationMode.BILINEAR, + ), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + # T.RandomPerspective(p=1, interpolation = T.InterpolationMode.BILINEAR, fill=-1,distortion_scale=0.2), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.RandomGrayscale(p=0.1), + T.Lambda(lambda x: x + torch.randn_like(x) * 0.01), + T.ColorJitter(brightness=0.05, contrast=0.05, saturation=0.05), + ] + ) + + def forward(self, input): + gray = transforms.Grayscale(3) + sideY, sideX = input.shape[2:4] + max_size = min(sideX, sideY) + min_size = min(sideX, sideY, self.cut_size) + l_size = max(sideX, sideY) + output_shape = [input.shape[0], 3, self.cut_size, self.cut_size] + output_shape_2 = [input.shape[0], 3, self.cut_size + 2, self.cut_size + 2] + pad_input = F.pad( + input, + ( + (sideY - max_size) // 2 + round(max_size * 0.055), + (sideY - max_size) // 2 + round(max_size * 0.055), + (sideX - max_size) // 2 + round(max_size * 0.055), + (sideX - max_size) // 2 + round(max_size * 0.055), + ), + **padargs, + ) + cutouts_list = [] + + if self.Overview > 0: + cutouts = [] + cutout = resize(pad_input, out_shape=output_shape, antialiasing=True) + 
output_shape_all = list(output_shape) + output_shape_all[0] = self.Overview * input.shape[0] + pad_input = pad_input.repeat(input.shape[0], 1, 1, 1) + cutout = resize(pad_input, out_shape=output_shape_all) + if aug: + cutout = self.augs(cutout) + if self.cut_blur_n > 0: + cutout[0 : self.cut_blur_n, :, :, :] = TF.gaussian_blur( + cutout[0 : self.cut_blur_n, :, :, :], cut_blur_kernel + ) + cutouts_list.append(cutout) + + if self.InnerCrop > 0: + cutouts = [] + for i in range(self.InnerCrop): + size = int( + torch.rand([]) ** self.IC_Size_Pow * (max_size - min_size) + + min_size + ) + offsetx = torch.randint(0, sideX - size + 1, ()) + offsety = torch.randint(0, sideY - size + 1, ()) + cutout = input[:, :, offsety : offsety + size, offsetx : offsetx + size] + if i <= int(self.IC_Grey_P * self.InnerCrop): + cutout = gray(cutout) + cutout = resize(cutout, out_shape=output_shape) + cutouts.append(cutout) + if cutout_debug: + TF.to_pil_image(cutouts[-1].add(1).div(2).clamp(0, 1).squeeze(0)).save( + "content/diff/cutouts/cutout_InnerCrop.jpg", quality=99 + ) + cutouts_tensor = torch.cat(cutouts) + cutouts = [] + cutouts_list.append(cutouts_tensor) + cutouts = torch.cat(cutouts_list) + return cutouts + + +def spherical_dist_loss(x, y): + x = F.normalize(x, dim=-1) + y = F.normalize(y, dim=-1) + return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2) + + +def tv_loss(input): + """L2 total variation loss, as in Mahendran et al.""" + input = F.pad(input, (0, 1, 0, 1), "replicate") + x_diff = input[..., :-1, 1:] - input[..., :-1, :-1] + y_diff = input[..., 1:, :-1] - input[..., :-1, :-1] + return (x_diff**2 + y_diff**2).mean([1, 2, 3]) + + +def range_loss(input, range_min, range_max): + return (input - input.clamp(range_min, range_max)).pow(2).mean([1, 2, 3]) + + +def symmetric_loss(x): + w = x.shape[3] + diff = (x - torch.flip(x, [3])).square().mean().sqrt() / ( + x.shape[2] * x.shape[3] / 1e4 + ) + return diff + + +def fetch(url_or_path): + """Fetches a file from an HTTP or 
HTTPS url, or opens the local file.""" + if str(url_or_path).startswith("http://") or str(url_or_path).startswith( + "https://" + ): + r = requests.get(url_or_path) + r.raise_for_status() + fd = io.BytesIO() + fd.write(r.content) + fd.seek(0) + return fd + return open(url_or_path, "rb") + + +def to_pil_image(x): + """Converts from a tensor to a PIL image.""" + if x.ndim == 4: + assert x.shape[0] == 1 + x = x[0] + if x.shape[0] == 1: + x = x[0] + return TF.to_pil_image((x.clamp(-1, 1) + 1) / 2) + + +def centralized_grad(x, use_gc=True, gc_conv_only=False): + if use_gc: + if gc_conv_only: + if len(list(x.size())) > 3: + x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True)) + else: + if len(list(x.size())) > 1: + x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True)) + return x + + +def cond_fn(x, t): + global cur_step + cur_step += 1 + t = 1000 - t + t = t[0] + with torch.enable_grad(): + x = x.detach() + x = x.requires_grad_() + x_in = model.decode_first_stage(x) + display_handler(x_in, t, 1, False) + n = x_in.shape[0] + clip_guidance_scale = clip_guidance_index[t] + make_cutouts = {} + # rx_in_grad = torch.zeros_like(x_in) + for i in clip_list: + make_cutouts[i] = MakeCutouts( + clip_size[i], + Overview=cut_overview[t], + InnerCrop=cut_innercut[t], + IC_Size_Pow=cut_ic_pow, + IC_Grey_P=cut_icgray_p[t], + cut_blur_n=cut_blur_n[t], + ) + cutn = cut_overview[t] + cut_innercut[t] + for j in range(cutn_batches): + losses = 0 + for i in clip_list: + clip_in = clip_normalize[i]( + make_cutouts[i](x_in.add(1).div(2)).to("cuda") + ) + image_embeds = ( + clip_model[i] + .encode_image(clip_in) + .float() + .unsqueeze(0) + .expand([target_embeds[i].shape[0], -1, -1]) + ) + target_embeds_temp = target_embeds[i] + if i == "ViT-B-32--openai" and experimental_aesthetic_embeddings: + aesthetic_embedding = torch.from_numpy( + np.load( + f"aesthetic-predictor/vit_b_32_embeddings/rating{experimental_aesthetic_embeddings_score}.npy" + ) + ).to(device) 
+ aesthetic_query = ( + target_embeds_temp + + aesthetic_embedding * experimental_aesthetic_embeddings_weight + ) + target_embeds_temp = (aesthetic_query) / torch.linalg.norm( + aesthetic_query + ) + if i == "ViT-L-14--openai" and experimental_aesthetic_embeddings: + aesthetic_embedding = torch.from_numpy( + np.load( + f"aesthetic-predictor/vit_l_14_embeddings/rating{experimental_aesthetic_embeddings_score}.npy" + ) + ).to(device) + aesthetic_query = ( + target_embeds_temp + + aesthetic_embedding * experimental_aesthetic_embeddings_weight + ) + target_embeds_temp = (aesthetic_query) / torch.linalg.norm( + aesthetic_query + ) + target_embeds_temp = target_embeds_temp.unsqueeze(1).expand( + [-1, cutn * n, -1] + ) + dists = spherical_dist_loss(image_embeds, target_embeds_temp) + dists = dists.mean(1).mul(weights[i].squeeze()).mean() + losses += ( + dists + * clip_guidance_scale + * ( + 2 + if i + in [ + "ViT-L-14-336--openai", + "RN50x64--openai", + "ViT-B-32--laion2b_e16", + ] + else (0.4 if "cloob" in i else 1) + ) + ) + if i == "ViT-L-14-336--openai" and aes_scale != 0: + aes_loss = ( + aesthetic_model_336(F.normalize(image_embeds, dim=-1)) + ).mean() + losses -= aes_loss * aes_scale + if i == "ViT-L-14--openai" and aes_scale != 0: + aes_loss = ( + aesthetic_model_224(F.normalize(image_embeds, dim=-1)) + ).mean() + losses -= aes_loss * aes_scale + if i == "ViT-B-16--openai" and aes_scale != 0: + aes_loss = ( + aesthetic_model_16(F.normalize(image_embeds, dim=-1)) + ).mean() + losses -= aes_loss * aes_scale + if i == "ViT-B-32--openai" and aes_scale != 0: + aes_loss = ( + aesthetic_model_32(F.normalize(image_embeds, dim=-1)) + ).mean() + losses -= aes_loss * aes_scale + # x_in_grad += torch.autograd.grad(losses, x_in)[0] / cutn_batches / len(clip_list) + # losses += dists + # losses = losses / len(clip_list) + # gc.collect() + + tv_losses = ( + tv_loss(x).sum() * tv_scales[0] + + tv_loss(F.interpolate(x, scale_factor=1 / 2)).sum() * tv_scales[1] + + 
tv_loss(F.interpolate(x, scale_factor=1 / 4)).sum() * tv_scales[2] + + tv_loss(F.interpolate(x, scale_factor=1 / 8)).sum() * tv_scales[3] + ) + range_scale = range_index[t] + range_losses = range_loss(x_in, RGB_min, RGB_max).sum() * range_scale + var_scale = var_index[t] + loss = tv_losses + range_losses + losses + # del losses + if symmetric_loss_scale != 0: + loss += symmetric_loss(x_in) * symmetric_loss_scale + if init_image is not None and init_scale: + lpips_loss = (lpips_model(x_in, init) * init_scale).squeeze().mean() + # print(lpips_loss) + loss += lpips_loss + # loss_grad = torch.autograd.grad(loss, x_in, )[0] + # x_in_grad += loss_grad + # grad = -torch.autograd.grad(x_in, x, x_in_grad)[0] + loss.backward() + grad = -x.grad + grad = torch.nan_to_num(grad, nan=0.0, posinf=0, neginf=0) + if grad_center: + grad = centralized_grad(grad, use_gc=True, gc_conv_only=False) + mag = grad.square().mean().sqrt() + if mag == 0 or torch.isnan(mag): + print("ERROR") + print(t) + return grad + if t >= 0: + if active_function == "softsign": + grad = F.softsign(grad * grad_scale / mag) + if active_function == "tanh": + grad = (grad / mag * grad_scale).tanh() + if active_function == "clamp": + grad = grad.clamp(-mag * grad_scale * 2, mag * grad_scale * 2) + if grad.abs().max() > 0: + grad = grad / grad.abs().max() * opt.mag_mul + magnitude = grad.square().mean().sqrt() + else: + return grad + clamp_max = clamp_index_variation[t] + # print(magnitude, end = "\r") + grad = grad * magnitude.clamp(max=clamp_max) / magnitude # 0.2 + grad = grad.detach() + grad = grad_fn(grad, t) + x = x.detach() + x = x.requires_grad_() + var = x.var() + var_losses = (var.pow(2).clamp(min=1) - 1) * var_scale + var_losses.backward() + grad -= x.grad + print(grad.abs().mean(), x.grad.abs().mean(), end="\r") + return grad + + +def null_fn(x_in): + return torch.zeros_like(x_in) + + +def display_handler(x, i, cadance=5, decode=True): + global progress, image_grid, writer, img_tensor, im, p + 
def grad_fn(x, t):
    """Optionally Gaussian-blur ``x`` when ``t`` is at most 500.

    Blurring happens only if the module-level ``grad_blur`` is truthy. The
    kernel size is ``2 * k - 1`` with ``k = max(grad_blur - t / 150, 1)``
    truncated to an int, so it is always odd; sigma is fixed at 1.5.

    Returns:
        The blurred tensor, or ``x`` unchanged when no blur applies.
    """
    if t <= 500 and grad_blur:
        x = TF.gaussian_blur(x, 2 * round(int(max(grad_blur - t / 150, 1))) - 1, 1.5)
    return x


def cond_clamp(image, t):
    """Soft-limit and/or compress ``image`` at the timestep ``1000 - t[0]``.

    Reads the module-level settings ``punish_steps``, ``compress_steps``,
    ``threshold_percentile``, ``threshold``, ``punish_factor`` and
    ``compress_factor``.

    Args:
        image: Tensor to adjust.
        t: Indexable whose first element is subtracted from 1000
            (presumably a batch of sampler timesteps - TODO confirm).

    Returns:
        The adjusted tensor, or ``image`` unchanged when neither step
        threshold is reached.
    """
    t = 1000 - t[0]
    if t <= max(punish_steps, compress_steps):
        # Per-sample quantile of absolute values, reshaped to broadcast
        # against ``image``.
        s = torch.quantile(
            rearrange(image, "b ... -> b (...)").abs(), threshold_percentile, dim=-1
        )
        s = s.view(-1, *((1,) * (image.ndim - 1)))
        ths = s.clamp(min=threshold)
        # Portions of the image above +ths and below -ths respectively.
        im_max = image.clamp(min=ths) - image.clamp(min=ths, max=ths)
        im_min = image.clamp(max=-ths, min=-ths) - image.clamp(max=-ths)
    if t <= punish_steps:
        # Keep the in-range part and scale the overshoot by punish_factor.
        image = (
            image.clamp(min=-ths, max=ths) + (im_max - im_min) * punish_factor
        )  # ((im_max-im_min)*punish_factor).tanh()/punish_factor
    if t <= compress_steps:
        image = image / (ths / threshold) ** compress_factor
        image += noise_like(image.shape, device, False) * (
            (ths / threshold) ** compress_factor - 1
        )
    return image


def make_schedule(t_start, t_end, step_size=1):
    """Build a descending schedule from ``t_start`` down to ``t_end``.

    Values ``t_start, t_start - step_size, ...`` are emitted while they are
    strictly greater than ``t_end``; ``t_end`` itself is always appended last.

    Args:
        t_start: First value of the schedule.
        t_end: Final value, always included.
        step_size: Positive decrement between consecutive values.

    Returns:
        ``numpy.ndarray`` holding the schedule.

    Raises:
        ValueError: If ``step_size`` is not positive while ``t_start`` is
            greater than ``t_end`` (the loop could never terminate).
    """
    if step_size <= 0 and t_start > t_end:
        raise ValueError("step_size must be positive when t_start > t_end")
    schedule = []
    t = t_start
    while t > t_end:
        schedule.append(t)
        t -= step_size
    schedule.append(t_end)
    return np.array(schedule)


def list_mul_to_array(list_mul):
    """Run-length encode a sequence as Python list-arithmetic source text.

    ``[12, 12, 4]`` becomes ``"[12]*2 + [4]*1"``, which evaluates back to the
    original list. Every run is emitted, including a final run of length one
    (the previous implementation silently dropped it).

    Args:
        list_mul: Iterable of values; consecutive equal values form a run.

    Returns:
        The encoded string, or ``""`` for an empty input.
    """
    # Local import: the file's import block is outside this section.
    from itertools import groupby

    runs = (
        f"[{value}]*{sum(1 for _ in group)}" for value, group in groupby(list_mul)
    )
    return " + ".join(runs)
Otherwise the clip_guidance_scale will be used + clip_guidance_schedule = {list_mul_to_array(clip_guidance_index)} + + #Apply symmetric loss (force simmetry to your results) + symmetric_loss_scale = {symmetric_loss_scale} + + #Latent Diffusion Advanced Settings + #Use when latent upscale to correct satuation problem + scale_div = {scale_div} + #Magnify grad before clamping by how many times + opt_mag_mul = {opt_mag_mul} + opt_ddim_eta = {opt_ddim_eta} + opt_eta_end = {opt_eta_end} + opt_temperature = {opt_temperature} + + #Grad advanced settings + grad_center = {grad_center} + #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept + grad_scale={grad_scale} + score_modifier = {score_modifier} + threshold_percentile = {threshold_percentile} + threshold = {threshold} + var_index = {list_mul_to_array(var_index)} + + #Init image advanced settings + init_rotate={init_rotate} + mask_rotate={mask_rotate} + init_magnitude = {init_magnitude} + + #More settings + RGB_min = {RGB_min} + RGB_max = {RGB_max} + #How to pad the image with cut_overview + padargs = {padargs} + flip_aug={flip_aug} + + #Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14 + experimental_aesthetic_embeddings = {experimental_aesthetic_embeddings} + #How much you want this to influence your result + experimental_aesthetic_embeddings_weight = {experimental_aesthetic_embeddings_weight} + #9 are good aesthetic embeddings, 0 are bad ones + experimental_aesthetic_embeddings_score = {experimental_aesthetic_embeddings_score} + + # For fun dont change except if you really know what your are doing + grad_blur = {grad_blur} + compress_steps = {compress_steps} + compress_factor = {compress_factor} + punish_steps = {punish_steps} + punish_factor = {punish_factor} + """ + return settings + + +def load_clip_models(): + global clip_model, clip_size, clip_tokenize, clip_normalize, clip_list + for item in mmc_models: + print("Loaded ", 
def full_clip_load():
    """Free cached CUDA memory, run the garbage collector, then reload CLIP.

    Calls ``get_mmc_models()`` followed by ``load_clip_models()``.
    """
    torch.cuda.empty_cache()
    gc.collect()
    get_mmc_models()
    load_clip_models()


# Alstro's aesthetic model
def load_aesthetic_model():
    """Create the four linear aesthetic heads and load their weights.

    Each head is a ``torch.nn.Linear(in_features, 1)`` moved to CUDA and
    published as a module global, with weights read from ``model_path``.
    """
    head_specs = (
        ("aesthetic_model_336", 768, f"{model_path}/ava_vit_l_14_336_linear.pth"),
        ("aesthetic_model_224", 768, f"{model_path}/ava_vit_l_14_linear.pth"),
        ("aesthetic_model_16", 512, f"{model_path}/ava_vit_b_16_linear.pth"),
        ("aesthetic_model_32", 512, f"{model_path}/sa_0_4_vit_b_32_linear.pth"),
    )
    module_scope = globals()
    for global_name, in_features, weights_file in head_specs:
        # Publish the head first, then load its weights.
        module_scope[global_name] = torch.nn.Linear(in_features, 1).cuda()
        module_scope[global_name].load_state_dict(torch.load(weights_file))


def load_lpips_model():
    """Instantiate the VGG-based LPIPS model on ``device`` as a module global."""
    global lpips_model
    lpips_model = lpips.LPIPS(net="vgg").to(device)


def config_init_image():
    """Scale the first custom schedule's second entry by ``starting_timestep``.

    The first entry of ``custom_schedule_setting`` is rewritten only when all
    of the following hold:
    - an init image is configured (not ``None``, ``"None"`` or ``""``);
    - ``starting_timestep`` is not 1;
    - the second element of that first entry is exactly 1000.

    The second element becomes ``int(1000 * starting_timestep)``; the other
    two elements are kept.
    """
    global custom_schedule_setting
    has_init_image = (
        init_image is not None and init_image != "None" and init_image != ""
    )
    if not has_init_image or starting_timestep == 1:
        return
    first_stage = custom_schedule_setting[0]
    if first_stage[1] == 1000:
        custom_schedule_setting[0] = [
            first_stage[0],
            int(first_stage[1] * starting_timestep),
            first_stage[2],
        ]
def config_output_size():
    """Round the requested ``width``/``height`` down to multiples of 64.

    The rounded values are stored on ``opt.W`` and ``opt.H``; a notice is
    printed when either differs from the requested size.
    """
    global opt
    snapped_w = (width // 64) * 64
    snapped_h = (height // 64) * 64
    opt.W, opt.H = snapped_w, snapped_h
    if (snapped_w, snapped_h) != (width, height):
        print(
            f"Changing output size to {opt.W}x{opt.H}. Dimensions must by multiples of 64."
        )


def config_options():
    """Copy user-facing settings into the globals and ``opt`` used for sampling.

    Sets ``aes_scale``, ``aug``, ``clamp_index_variation`` and
    ``score_corrector``, and fills several attributes on ``opt``. A
    two-element ``clamp_index`` is expanded to 1000 linearly spaced values;
    any other length is used as given.
    """
    global aes_scale, opt, aug, clamp_index_variation, score_corrector
    aes_scale = aesthetic_loss_scale
    opt_fields = (
        ("mag_mul", opt_mag_mul),
        ("ddim_eta", opt_ddim_eta),
        ("eta_end", opt_eta_end),
        ("temperature", opt_temperature),
        ("n_iter", how_many_batches),
        ("n_samples", n_samples),
        ("scale", latent_diffusion_guidance_scale),
    )
    for field_name, field_value in opt_fields:
        setattr(opt, field_name, field_value)
    aug = augment_cuts
    clamp_index_variation = (
        np.linspace(clamp_index[0], clamp_index[1], 1000)
        if len(clamp_index) == 2
        else clamp_index
    )
    score_corrector = DotMap()
    score_corrector.modify_score = modify_score
def use_args(args: argparse.Namespace):
    """Publish every attribute of a parsed argument namespace as a module global.

    Args:
        args: Namespace whose attributes are copied, name for name, into this
            module's globals, overwriting any existing global of that name.

    Side effects:
        Installs an "ignore" filter for all Python warnings.
    """
    warnings.filterwarnings("ignore")
    globals().update(vars(args))


def load_custom_settings():
    """Load the settings file named by ``custom_settings`` into module globals.

    Nothing is loaded when ``custom_settings`` is ``None``, ``""`` or the
    placeholder ``"path/to/settings.cfg"``. Otherwise the INI file is read and:

    * ``[clip_list]``: the value of its first option replaces
      ``clip_load_list`` when the two differ in length, or when some current
      entry is missing from the incoming list.
    * ``[basic_settings]`` and ``[advanced_settings]``: each option is
      evaluated and assigned to the module global of the same name.

    Option names are kept case-sensitive. ``ConfigParser`` lower-cases them by
    default, which previously stored mixed-case settings such as ``RGB_min``
    under ``rgb_min``, so they were never applied.

    Side effects:
        Installs an "ignore" filter for all Python warnings and prints whether
        the file was loaded.
    """
    from configparser import ConfigParser

    global_var_scope = globals()
    global clip_load_list
    warnings.filterwarnings("ignore")
    if (
        custom_settings is None
        or custom_settings == ""
        or custom_settings == "path/to/settings.cfg"
    ):
        return

    config = ConfigParser()
    # Preserve the case of option names so they match the global variable names.
    config.optionxform = str
    # ConfigParser.read() skips unreadable files silently and returns only the
    # names it parsed, so an empty result means nothing was loaded.
    if not config.read(custom_settings):
        print("Could not read settings file ", custom_settings)
        return
    print("Loaded ", custom_settings)

    # SECURITY NOTE(review): values are passed to eval(), so a settings file
    # can execute arbitrary code. Only load files from sources you trust.
    # eval is kept because saved settings contain expressions such as
    # "[2]*3 + [1]*2", which ast.literal_eval cannot evaluate.

    # Load CLIP models from config
    if config.has_section("clip_list"):
        clip_items = config.items("clip_list")
        if clip_items:
            incoming_perceptors = eval(clip_items[0][1])
            if (len(incoming_perceptors) != len(clip_load_list)) or not all(
                elem in incoming_perceptors for elem in clip_load_list
            ):
                clip_load_list = incoming_perceptors

    # Load settings from config and replace variables
    for section in ("basic_settings", "advanced_settings"):
        if config.has_section(section):
            for option_name, raw_value in config.items(section):
                global_var_scope[option_name] = eval(raw_value)
full_clip_load() + has_purged = False + global opt, model, p, base_count, make_cutouts, progress, target_embeds, weights, zero_embed, init, scale_factor, cur_step + if generate_video: + fps = 24 + p = Popen( + [ + "ffmpeg", + "-y", + "-f", + "image2pipe", + "-vcodec", + "png", + "-r", + str(fps), + "-i", + "-", + "-vcodec", + "libx264", + "-r", + str(fps), + "-pix_fmt", + "yuv420p", + "-crf", + "17", + "-preset", + "veryslow", + "video.mp4", + ], + stdin=PIPE, + ) + # with torch.cuda.amp.autocast(): + cur_step = 0 + scale_factor = 1 + make_cutouts = {} + for i in clip_list: + make_cutouts[i] = MakeCutouts(clip_size[i], Overview=1) + for i in clip_list: + target_embeds[i] = [] + weights[i] = [] + + for prompt in prompts: + txt, weight = parse_prompt(prompt) + for i in clip_list: + if "cloob" not in i: + with torch.cuda.amp.autocast(): + embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device)) + target_embeds[i].append(embeds) + weights[i].append(weight) + else: + embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device)) + target_embeds[i].append(embeds) + weights[i].append(weight) + + for prompt in image_prompts: + print(f"processing{prompt}", end="\r") + path, weight = parse_prompt(prompt) + img = Image.open(fetch(path)).convert("RGB") + img = TF.resize( + img, min(opt.W, opt.H, *img.size), transforms.InterpolationMode.LANCZOS + ) + for i in clip_list: + if "cloob" not in i: + with torch.cuda.amp.autocast(): + batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device)) + embed = clip_model[i].encode_image(clip_normalize[i](batch)) + target_embeds[i].append(embed) + weights[i].extend([weight]) + else: + batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device)) + embed = clip_model[i].encode_image(clip_normalize[i](batch)) + target_embeds[i].append(embed) + weights[i].extend([weight]) + # if anti_jpg != 0: + # target_embeds["ViT-B-32--openai"].append( + # torch.tensor( + # [ + # 
np.load(f"{model_path}/openimages_512x_png_embed224.npz")["arr_0"] + # - np.load(f"{model_path}/imagenet_512x_jpg_embed224.npz")["arr_0"] + # ], + # device=device, + # ) + # ) + # weights["ViT-B-32--openai"].append(anti_jpg) + + for i in clip_list: + target_embeds[i] = torch.cat(target_embeds[i]) + weights[i] = torch.tensor([weights[i]], device=device) + shape = [4, opt.H // 8, opt.W // 8] + init = None + mask = None + transform = T.GaussianBlur(kernel_size=3, sigma=0.4) + if init_image is not None: + init = Image.open(fetch(init_image)).convert("RGB") + init = TF.to_tensor(init).to(device).unsqueeze(0) + if init_rotate: + init = torch.rot90(init, 1, [3, 2]) + init = resize(init, out_shape=[opt.n_samples, 3, opt.H, opt.W]) + init = init.mul(2).sub(1).half() + init_encoded = ( + model.first_stage_model.encode(init).sample() * init_magnitude + + init_brightness + ) + init_encoded = init_encoded + noise_like(init_encoded.shape, device, False).mul( + init_noise + ) + else: + init = None + init_encoded = None + if init_mask is not None: + mask = Image.open(fetch(init_mask)).convert("RGB") + mask = TF.to_tensor(mask).to(device).unsqueeze(0) + if mask_rotate: + mask = torch.rot90(init, 1, [3, 2]) + mask = resize(mask, out_shape=[opt.n_samples, 1, opt.H // 8, opt.W // 8]) + mask = transform(mask) + print(mask) + + if progress: + display.display(progress) + + if opt.plms: + sampler = PLMSSampler(model) + else: + sampler = DDIMSampler(model) + + os.makedirs(opt.outdir, exist_ok=True) + outpath = opt.outdir + + prompt = opt.prompt + sample_path = os.path.join(outpath, "samples") + os.makedirs(sample_path, exist_ok=True) + base_count = len(os.listdir(sample_path)) + + all_samples = list() + last_step_upscale = False + eta1 = opt.ddim_eta + eta2 = opt.eta_end + with torch.enable_grad(): + with torch.cuda.amp.autocast(): + with model.ema_scope(): + uc = None + if opt.scale != 1.0: + uc = model.get_learned_conditioning(opt.n_samples * opt.uc).cuda() + + for n in 
trange(opt.n_iter, desc="Sampling"): + torch.cuda.empty_cache() + gc.collect() + c = model.get_learned_conditioning(opt.n_samples * prompt).cuda() + if init_encoded is None: + x_T = torch.randn([opt.n_samples, *shape], device=device) + else: + x_T = init_encoded + + for custom_schedule in custom_schedules: + if type(custom_schedule) != type(""): + torch.cuda.empty_cache() + gc.collect() + last_step_upscale = False + samples_ddim, _ = sampler.sample( + S=opt.ddim_steps, + conditioning=c, + batch_size=opt.n_samples, + shape=shape, + custom_schedule=custom_schedule, + verbose=False, + unconditional_guidance_scale=opt.scale, + unconditional_conditioning=uc, + eta=eta1, + eta_end=eta2, + img_callback=None if use_cond_fn else display_handler, + cond_fn=cond_fn if use_cond_fn else None, + temperature=opt.temperature, + x_adjust_fn=cond_clamp, + x_T=x_T, + x0=x_T, + mask=mask, + score_corrector=score_corrector, + corrector_kwargs={}, + ) + x_T = samples_ddim.clamp(-6, 6) + else: + torch.cuda.empty_cache() + gc.collect() + method, scale_factor = custom_schedule.split(":") + if method == "RGB": + scale_factor = float(scale_factor) + temp_file_name = ( + "temp_" + f"{str(round(time.time()))}.png" + ) + temp_file = os.path.join(sample_path, temp_file_name) + im.save(temp_file, format="PNG") + init = Image.open(fetch(temp_file)).convert("RGB") + init = TF.to_tensor(init).to(device).unsqueeze(0) + opt.H, opt.W = ( + opt.H * scale_factor, + opt.W * scale_factor, + ) + init = resize( + init, + out_shape=[opt.n_samples, 3, opt.H, opt.W], + antialiasing=True, + ) + init = init.mul(2).sub(1).half() + x_T = ( + model.first_stage_model.encode(init).sample() + * init_magnitude + ) + x_T += noise_like(x_T.shape, device, False) * init_noise + x_T = x_T.clamp(-6, 6) + if method == "gfpgan": + scale_factor = float(scale_factor) + last_step_upscale = True + temp_file_name = ( + "temp_" + f"{str(round(time.time()))}.png" + ) + temp_file = os.path.join(sample_path, temp_file_name) + 
im.save(temp_file, format="PNG") + GFP_factor = 2 if scale_factor > 1 else 1 + GFP_ver = 1.3 # if GFP_factor == 1 else 1.2 + + torch.cuda.empty_cache() + gc.collect() + + subprocess.call( + [ + "python3", + "inference_gfpgan.py", + "-i", + temp_file, + "-o", + "results", + "-v", + str(GFP_ver), + "-s", + str(GFP_factor), + ], + cwd="GFPGAN", + shell=False, + ) + + face_corrected = Image.open( + fetch( + f"GFPGAN/results/restored_imgs/{temp_file_name}" + ) + ) + with io.BytesIO() as output: + face_corrected.save(output, format="PNG") + if progress: + progress.value = output.getvalue() + init = Image.open( + fetch( + f"GFPGAN/results/restored_imgs/{temp_file_name}" + ) + ).convert("RGB") + init = TF.to_tensor(init).to(device).unsqueeze(0) + opt.H, opt.W = ( + opt.H * scale_factor, + opt.W * scale_factor, + ) + init = resize( + init, + out_shape=[opt.n_samples, 3, opt.H, opt.W], + antialiasing=True, + ) + init = init.mul(2).sub(1).half() + x_T = ( + model.first_stage_model.encode(init).sample() + * init_magnitude + ) + x_T += noise_like(x_T.shape, device, False) * init_noise + x_T = x_T.clamp(-6, 6) + if method == "purge": + has_purged = True + for i in scale_factor.split(","): + if i in clip_load_list: + arch, pub, m_id = i[1:-1].split(" - ") + print("Purge ", i) + del clip_list[clip_list.index(m_id)] + del clip_model[m_id] + del clip_size[m_id] + del clip_tokenize[m_id] + del clip_normalize[m_id] + + # last_step_uspcale_list.append(last_step_upscale) + scale_factor = 1 + current_time = str(round(time.time())) + if last_step_upscale: + latest_upscale = Image.open( + fetch(f"GFPGAN/results/restored_imgs/{temp_file_name}") + ).convert("RGB") + latest_upscale.save( + os.path.join(outpath, f"{current_time}.png"), format="PNG" + ) + else: + Image.fromarray(image_grid.astype(np.uint8)).save( + os.path.join(outpath, f"{current_time}.png"), format="PNG" + ) + settings = generate_settings_file( + add_prompts=True, add_dimensions=False + ) + text_file = 
open(f"{outpath}/{current_time}.cfg", "w") + text_file.write(settings) + text_file.close() + x_samples_ddim = model.decode_first_stage(samples_ddim) + x_samples_ddim = torch.clamp( + (x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0 + ) + all_samples.append(x_samples_ddim) + + if len(all_samples) > 1: + # additionally, save as grid + grid = torch.stack(all_samples, 0) + grid = rearrange(grid, "n b c h w -> (n b) c h w") + grid = make_grid(grid, nrow=opt.n_samples) + + # to image + grid = 255.0 * rearrange(grid, "c h w -> h w c").cpu().numpy() + Image.fromarray(grid.astype(np.uint8)).save( + os.path.join(outpath, f"grid_{str(round(time.time()))}.png") + ) + + if generate_video: + p.stdin.close() diff --git a/previous_versions/latent_v1.2.ipynb b/previous_versions/latent_v1.2.ipynb new file mode 100644 index 0000000..8e06f7a --- /dev/null +++ b/previous_versions/latent_v1.2.ipynb @@ -0,0 +1,1460 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NUmmV5ZvrPbP" + }, + "source": [ + "# Latent Majesty Diffusion v1.3\n", + "#### Formerly known as Princess Generator\n", + "##### Access our [Majestic Guide](https://multimodal.art/majesty-diffusion) (_under construction_), our [GitHub](https://github.com/multimodalart/majesty-diffusion), join our community on [Discord](https://discord.gg/yNBtQBEDfZ) or reach out via [@multimodalart on Twitter](https://twitter.com/multimodalart))\n", + "\\\n", + " \n", + "---\n", + "\\\n", + "\n", + "\n", + "#### CLIP Guided Latent Diffusion by [dango233](https://github.com/Dango233/) and [apolinario (@multimodalart)](https://twitter.com/multimodalart). \n", + "The LAION-400M-trained model and the modified inference code are from [CompVis Latent Diffusion](https://github.com/CompVis/latent-diffusion). 
The guided-diffusion method is modified by Dango233 based on [Katherine Crowson](https://twitter.com/RiversHaveWings)'s guided diffusion notebook. multimodalart savable settings, MMC and assembled the Colab. Check the complete list on our GitHub. Some functions and methods are from various code masters (nsheppard, DanielRussRuss and others)\n", + "\n", + "Changelog: 1.3 - better upscaler (learn how to use it on our [Majestic Guide](https://multimodal.art/majesty-diffusion))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uWLsDt7wkZfU" + }, + "source": [ + "## Save model and outputs on Google Drive? " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "aJF6wP2zkWE_" + }, + "outputs": [], + "source": [ + "#@markdown Enable saving outputs to Google Drive to save your creations at AI/models\n", + "save_outputs_to_google_drive = True #@param {type:\"boolean\"}\n", + "#@markdown Enable saving models to Google Drive to avoid downloading the model every Colab instance\n", + "save_models_to_google_drive = True #@param {type:\"boolean\"}\n", + "\n", + "if save_outputs_to_google_drive or save_models_to_google_drive:\n", + " from google.colab import drive\n", + " try:\n", + " drive.mount('/content/gdrive')\n", + " except:\n", + " save_outputs_to_google_drive = False\n", + " save_models_to_google_drive = False\n", + "\n", + "model_path = \"/content/gdrive/MyDrive/AI/models\" if save_models_to_google_drive else \"/content/\"\n", + "outputs_path = \"/content/gdrive/MyDrive/AI/latent_majesty_diffusion\" if save_outputs_to_google_drive else \"/content/outputs\"\n", + "!mkdir -p $model_path\n", + "!mkdir -p $outputs_path\n", + "print(f\"Model will be stored at {model_path}\")\n", + "print(f\"Outputs will be saved to {outputs_path}\")\n", + "\n", + "#If you want to run it locally change it to true\n", + "is_local = False\n", + "skip_installs = False\n", + "if(is_local):\n", + " model_path = 
\"/choose/your/local/model/path\"\n", + " outputs_path = \"/choose/your/local/outputs/path\"\n", + " skip_installs = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "5Fxt-5TaYBs2" + }, + "outputs": [], + "source": [ + "#@title Model settings\n", + "#@markdown The `original` model is the model trained by CompVis in the LAION-400M dataset\n", + "#@markdown
The `finetuned` model is a finetune of the `original` model by Jack000 that generates less watermarks, but is a bit worse in text synthesis. Colab Free does not have enough run for the finetuned (for now)\n", + "latent_diffusion_model = 'original' #@param [\"original\", \"finetuned\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xEVSOJ4f0B21" + }, + "source": [ + "# Setup stuff" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "NHgUAp48qwoG" + }, + "outputs": [], + "source": [ + "#@title Installation\n", + "if(not skip_installs):\n", + " import subprocess\n", + " nvidiasmi_output = subprocess.run(['nvidia-smi'], stdout=subprocess.PIPE).stdout.decode('utf-8')\n", + " cards_requiring_downgrade = [\"Tesla T4\", \"V100\"]\n", + " if any(cardstr in nvidiasmi_output for cardstr in cards_requiring_downgrade):\n", + " downgrade_pytorch_result = subprocess.run(['pip', 'install', 'torch==1.10.2', 'torchvision==0.11.3', '-q'], stdout=subprocess.PIPE).stdout.decode('utf-8')\n", + " import sys\n", + " sys.path.append(\".\")\n", + " !git clone https://github.com/multimodalart/latent-diffusion\n", + " !git clone https://github.com/CompVis/taming-transformers\n", + " !git clone https://github.com/TencentARC/GFPGAN\n", + " !git clone https://github.com/multimodalart/majesty-diffusion\n", + " !git lfs clone https://github.com/LAION-AI/aesthetic-predictor\n", + " !pip install -e ./taming-transformers\n", + " !pip install omegaconf>=2.0.0 pytorch-lightning>=1.0.8 torch-fidelity einops\n", + " !pip install transformers\n", + " !pip install dotmap\n", + " !pip install resize-right\n", + " !pip install piq\n", + " !pip install lpips\n", + " !pip install basicsr\n", + " !pip install facexlib\n", + " !pip install realesrgan\n", + "\n", + " sys.path.append('./taming-transformers')\n", + " from taming.models import vqgan\n", + " from subprocess import Popen, PIPE\n", + " try:\n", + " import mmc\n", + " 
except:\n", + " # install mmc\n", + " !git clone https://github.com/apolinario/Multi-Modal-Comparators --branch gradient_checkpointing\n", + " !pip install poetry\n", + " !cd Multi-Modal-Comparators; poetry build\n", + " !cd Multi-Modal-Comparators; pip install dist/mmc*.whl\n", + " \n", + " # optional final step:\n", + " #poe napm_installs\n", + " !python Multi-Modal-Comparators/src/mmc/napm_installs/__init__.py\n", + " # suppress mmc warmup outputs\n", + " import mmc.loaders" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fNqCqQDoyZmq" + }, + "source": [ + "Now, download the checkpoint (~5.7 GB). This will usually take 3-6 minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "cNHvQBhzyXCI" + }, + "outputs": [], + "source": [ + "#@title Download models\n", + "import os\n", + "if os.path.isfile(f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\"):\n", + " print(\"Using Latent Diffusion model saved from Google Drive\")\n", + "else: \n", + " !wget -O $model_path/latent_diffusion_txt2img_f8_large.ckpt https://ommer-lab.com/files/latent-diffusion/nitro/txt2img-f8-large/model.ckpt --no-check-certificate\n", + "\n", + "if os.path.isfile(f\"{model_path}/finetuned_state_dict.pt\"):\n", + " print(\"Using Latent Diffusion model saved from Google Drive\")\n", + "else: \n", + " !wget -O $model_path/finetuned_state_dict.pt https://huggingface.co/multimodalart/compvis-latent-diffusion-text2img-large/resolve/main/finetuned_state_dict.pt --no-check-certificate\n", + "\n", + "if os.path.isfile(f\"{model_path}/ava_vit_l_14_336_linear.pth\"):\n", + " print(\"Using ViT-L/14@336px aesthetic model from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/ava_vit_l_14_336_linear.pth https://multimodal.art/models/ava_vit_l_14_336_linear.pth\n", + "\n", + "if os.path.isfile(f\"{model_path}/sa_0_4_vit_l_14_linear.pth\"):\n", + " print(\"Using ViT-L/14 aesthetic model from Google Drive\")\n", + 
"else:\n", + " !wget -O $model_path/sa_0_4_vit_l_14_linear.pth https://multimodal.art/models/sa_0_4_vit_l_14_linear.pth\n", + "\n", + "if os.path.isfile(f\"{model_path}/ava_vit_l_14_linear.pth\"):\n", + " print(\"Using ViT-L/14 aesthetic model from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/ava_vit_l_14_linear.pth https://multimodal.art/models/ava_vit_l_14_linear.pth\n", + "\n", + "if os.path.isfile(f\"{model_path}/ava_vit_b_16_linear.pth\"):\n", + " print(\"Using ViT-B/16 aesthetic model from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/ava_vit_b_16_linear.pth http://batbot.tv/ai/models/v-diffusion/ava_vit_b_16_linear.pth\n", + "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_16_linear.pth\"):\n", + " print(\"Using ViT-B/16 sa aesthetic model already saved\")\n", + "else:\n", + " !wget -O $model_path/sa_0_4_vit_b_32_linear.pth https://multimodal.art/models/sa_0_4_vit_b_16_linear.pth\n", + "if os.path.isfile(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"):\n", + " print(\"Using ViT-B/32 aesthetic model from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/sa_0_4_vit_b_32_linear.pth https://multimodal.art/models/sa_0_4_vit_b_32_linear.pth\n", + "if os.path.isfile(f\"{model_path}/openimages_512x_png_embed224.npz\"):\n", + " print(\"Using openimages png from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/openimages_512x_png_embed224.npz https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/openimages_512x_png_embed224.npz\n", + "if os.path.isfile(f\"{model_path}/imagenet_512x_jpg_embed224.npz\"):\n", + " print(\"Using imagenet antijpeg from Google Drive\")\n", + "else:\n", + " !wget -O $model_path/imagenet_512x_jpg_embed224.npz https://github.com/nshepperd/jax-guided-diffusion/raw/8437b4d390fcc6b57b89cedcbaf1629993c09d03/data/imagenet_512x_jpg_embed224.npz\n", + "if os.path.isfile(f\"{model_path}/GFPGANv1.3.pth\"):\n", + " print(\"Using GFPGAN v1.3 from Google Drive\")\n", + 
"else:\n", + " !wget -O $model_path/GFPGANv1.3.pth https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth\n", + "!cp $model_path/GFPGANv1.3.pth GFPGAN/experiments/pretrained_models/GFPGANv1.3.pth\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ThxmCePqt1mt" + }, + "source": [ + "Let's also check what type of GPU we've got." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jbL2zJ7Pt7Jl" + }, + "outputs": [], + "source": [ + "!nvidia-smi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "BPnyd-XUKbfE" + }, + "outputs": [], + "source": [ + "#@title Import stuff\n", + "import argparse, os, sys, glob\n", + "import torch\n", + "import numpy as np\n", + "from omegaconf import OmegaConf\n", + "from PIL import Image\n", + "from tqdm.auto import tqdm, trange\n", + "tqdm_auto_model = __import__(\"tqdm.auto\", fromlist=[None]) \n", + "sys.modules['tqdm'] = tqdm_auto_model\n", + "from einops import rearrange\n", + "from torchvision.utils import make_grid\n", + "import transformers\n", + "import gc\n", + "sys.path.append('./latent-diffusion')\n", + "from ldm.util import instantiate_from_config\n", + "from ldm.models.diffusion.ddim import DDIMSampler\n", + "from ldm.models.diffusion.plms import PLMSSampler\n", + "import tensorflow as tf\n", + "from dotmap import DotMap\n", + "import ipywidgets as widgets\n", + "from math import pi\n", + "\n", + "from subprocess import Popen, PIPE\n", + "\n", + "from dataclasses import dataclass\n", + "from functools import partial\n", + "import gc\n", + "import io\n", + "import math\n", + "import sys\n", + "import random\n", + "from piq import brisque\n", + "from itertools import product\n", + "from IPython import display\n", + "import lpips\n", + "from PIL import Image, ImageOps\n", + "import requests\n", + "import torch\n", + "from torch import nn\n", + "from torch.nn import functional as F\n", + "from 
torchvision import models\n", + "from torchvision import transforms\n", + "from torchvision import transforms as T\n", + "from torchvision.transforms import functional as TF\n", + "from numpy import nan\n", + "from threading import Thread\n", + "import time\n", + "\n", + "#sys.path.append('../CLIP')\n", + "#Resizeright for better gradient when resizing\n", + "#sys.path.append('../ResizeRight/')\n", + "#sys.path.append('../cloob-training/')\n", + "\n", + "from resize_right import resize\n", + "\n", + "import clip\n", + "#from cloob_training import model_pt, pretrained\n", + "\n", + "#pretrained.list_configs()\n", + "from torch.utils.tensorboard import SummaryWriter\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "twG4nxYCrI8F" + }, + "outputs": [], + "source": [ + "#@title Load the model\n", + "torch.backends.cudnn.benchmark = True\n", + "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n", + "def load_model_from_config(config, ckpt, verbose=False, latent_diffusion_model=\"original\"):\n", + " print(f\"Loading model from {ckpt}\")\n", + " print(latent_diffusion_model)\n", + " model = instantiate_from_config(config.model)\n", + " sd = torch.load(ckpt, map_location=\"cuda\")[\"state_dict\"]\n", + " m, u = model.load_state_dict(sd, strict = False)\n", + " if(latent_diffusion_model == \"finetuned\"): \n", + " del sd\n", + " sd_finetune = torch.load(f\"{model_path}/finetuned_state_dict.pt\",map_location=\"cuda\")\n", + " m, u = model.model.load_state_dict(sd_finetune, strict = False)\n", + " model.model = model.model.half().eval().to(device)\n", + " del sd_finetune\n", + " # sd = pl_sd[\"state_dict\"]\n", + " \n", + " if len(m) > 0 and verbose:\n", + " print(\"missing keys:\")\n", + " print(m)\n", + " if len(u) > 0 and verbose:\n", + " print(\"unexpected keys:\")\n", + " print(u)\n", + "\n", + " model.requires_grad_(False).half().eval().to('cuda')\n", + " return model\n", + "\n", + "config = 
OmegaConf.load(\"./latent-diffusion/configs/latent-diffusion/txt2img-1p4B-eval.yaml\") # TODO: Optionally download from same location as ckpt and chnage this logic\n", + "model = load_model_from_config(config, f\"{model_path}/latent_diffusion_txt2img_f8_large.ckpt\",False, latent_diffusion_model) # TODO: check path\n", + "model = model.half().eval().to(device)\n", + "#if(latent_diffusion_model == \"finetuned\"):\n", + "# model.model = model.model.half().eval().to(device)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "HY_7vvnPThzS" + }, + "outputs": [], + "source": [ + "#@title Load necessary functions\n", + "def set_custom_schedules(schedule):\n", + " custom_schedules = []\n", + " for schedule_item in schedule:\n", + " if(isinstance(schedule_item,list)):\n", + " custom_schedules.append(np.arange(*schedule_item))\n", + " else:\n", + " custom_schedules.append(schedule_item)\n", + " \n", + " return custom_schedules\n", + "\n", + "def parse_prompt(prompt):\n", + " if prompt.startswith('http://') or prompt.startswith('https://') or prompt.startswith(\"E:\") or prompt.startswith(\"C:\") or prompt.startswith(\"D:\"):\n", + " vals = prompt.rsplit(':', 2)\n", + " vals = [vals[0] + ':' + vals[1], *vals[2:]]\n", + " else:\n", + " vals = prompt.rsplit(':', 1)\n", + " vals = vals + ['', '1'][len(vals):]\n", + " return vals[0], float(vals[1])\n", + "\n", + "\n", + "class MakeCutouts(nn.Module):\n", + " def __init__(self, cut_size,\n", + " Overview=4, \n", + " WholeCrop = 0, WC_Allowance = 10, WC_Grey_P=0.2,\n", + " InnerCrop = 0, IC_Size_Pow=0.5, IC_Grey_P = 0.2\n", + " ):\n", + " super().__init__()\n", + " self.cut_size = cut_size\n", + " self.Overview = Overview\n", + " self.WholeCrop= WholeCrop\n", + " self.WC_Allowance = WC_Allowance\n", + " self.WC_Grey_P = WC_Grey_P\n", + " self.InnerCrop = InnerCrop\n", + " self.IC_Size_Pow = IC_Size_Pow\n", + " self.IC_Grey_P = IC_Grey_P\n", + " self.augs = T.Compose([\n", + 
" #T.RandomHorizontalFlip(p=0.5),\n", + " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", + " T.RandomAffine(degrees=0, \n", + " translate=(0.05, 0.05), \n", + " #scale=(0.9,0.95),\n", + " fill=-1, interpolation = T.InterpolationMode.BILINEAR, ),\n", + " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", + " #T.RandomPerspective(p=1, interpolation = T.InterpolationMode.BILINEAR, fill=-1,distortion_scale=0.2),\n", + " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", + " T.RandomGrayscale(p=0.1),\n", + " T.Lambda(lambda x: x + torch.randn_like(x) * 0.01),\n", + " T.ColorJitter(brightness=0.05, contrast=0.05, saturation=0.05),\n", + " ])\n", + "\n", + " def forward(self, input):\n", + " gray = transforms.Grayscale(3)\n", + " sideY, sideX = input.shape[2:4]\n", + " max_size = min(sideX, sideY)\n", + " min_size = min(sideX, sideY, self.cut_size)\n", + " l_size = max(sideX, sideY)\n", + " output_shape = [input.shape[0],3,self.cut_size,self.cut_size] \n", + " output_shape_2 = [input.shape[0],3,self.cut_size+2,self.cut_size+2]\n", + " pad_input = F.pad(input,((sideY-max_size)//2+round(max_size*0.055),(sideY-max_size)//2+round(max_size*0.055),(sideX-max_size)//2+round(max_size*0.055),(sideX-max_size)//2+round(max_size*0.055)), **padargs)\n", + " cutouts_list = []\n", + " \n", + " if self.Overview>0:\n", + " cutouts = []\n", + " cutout = resize(pad_input, out_shape=output_shape, antialiasing=True)\n", + " output_shape_all = list(output_shape)\n", + " output_shape_all[0]=self.Overview*input.shape[0]\n", + " pad_input = pad_input.repeat(input.shape[0],1,1,1)\n", + " cutout = resize(pad_input, out_shape=output_shape_all)\n", + " if aug: cutout=self.augs(cutout)\n", + " cutouts_list.append(cutout)\n", + " \n", + " if self.InnerCrop >0:\n", + " cutouts=[]\n", + " for i in range(self.InnerCrop):\n", + " size = int(torch.rand([])**self.IC_Size_Pow * (max_size - min_size) + min_size)\n", + " offsetx = torch.randint(0, sideX - size + 1, ())\n", + " offsety = 
torch.randint(0, sideY - size + 1, ())\n", + " cutout = input[:, :, offsety:offsety + size, offsetx:offsetx + size]\n", + " if i <= int(self.IC_Grey_P * self.InnerCrop):\n", + " cutout = gray(cutout)\n", + " cutout = resize(cutout, out_shape=output_shape)\n", + " cutouts.append(cutout)\n", + " if cutout_debug:\n", + " TF.to_pil_image(cutouts[-1].add(1).div(2).clamp(0, 1).squeeze(0)).save(\"content/diff/cutouts/cutout_InnerCrop.jpg\",quality=99)\n", + " cutouts_tensor = torch.cat(cutouts)\n", + " cutouts=[]\n", + " cutouts_list.append(cutouts_tensor)\n", + " cutouts=torch.cat(cutouts_list)\n", + " return cutouts\n", + "\n", + "\n", + "def spherical_dist_loss(x, y):\n", + " x = F.normalize(x, dim=-1)\n", + " y = F.normalize(y, dim=-1)\n", + " return (x - y).norm(dim=-1).div(2).arcsin().pow(2).mul(2)\n", + "\n", + "\n", + "def tv_loss(input):\n", + " \"\"\"L2 total variation loss, as in Mahendran et al.\"\"\"\n", + " input = F.pad(input, (0, 1, 0, 1), 'replicate')\n", + " x_diff = input[..., :-1, 1:] - input[..., :-1, :-1]\n", + " y_diff = input[..., 1:, :-1] - input[..., :-1, :-1]\n", + " return (x_diff**2 + y_diff**2).mean([1, 2, 3])\n", + "\n", + "\n", + "def range_loss(input, range_min, range_max):\n", + " return (input - input.clamp(range_min,range_max)).pow(2).mean([1, 2, 3])\n", + "\n", + "def symmetric_loss(x):\n", + " w = x.shape[3]\n", + " diff = (x - torch.flip(x,[3])).square().mean().sqrt()/(x.shape[2]*x.shape[3]/1e4)\n", + " return(diff)\n", + "\n", + "def fetch(url_or_path):\n", + " \"\"\"Fetches a file from an HTTP or HTTPS url, or opens the local file.\"\"\"\n", + " if str(url_or_path).startswith('http://') or str(url_or_path).startswith('https://'):\n", + " r = requests.get(url_or_path)\n", + " r.raise_for_status()\n", + " fd = io.BytesIO()\n", + " fd.write(r.content)\n", + " fd.seek(0)\n", + " return fd\n", + " return open(url_or_path, 'rb')\n", + "\n", + "\n", + "def to_pil_image(x):\n", + " \"\"\"Converts from a tensor to a PIL image.\"\"\"\n", + " 
if x.ndim == 4:\n", + " assert x.shape[0] == 1\n", + " x = x[0]\n", + " if x.shape[0] == 1:\n", + " x = x[0]\n", + " return TF.to_pil_image((x.clamp(-1, 1) + 1) / 2)\n", + "\n", + "\n", + "normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],\n", + " std=[0.26862954, 0.26130258, 0.27577711])\n", + "\n", + "def centralized_grad(x, use_gc=True, gc_conv_only=False):\n", + " if use_gc:\n", + " if gc_conv_only:\n", + " if len(list(x.size())) > 3:\n", + " x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))\n", + " else:\n", + " if len(list(x.size())) > 1:\n", + " x.add_(-x.mean(dim=tuple(range(1, len(list(x.size())))), keepdim=True))\n", + " return x\n", + "\n", + "def cond_fn(x, t):\n", + " t=1000-t\n", + " t=t[0]\n", + " with torch.enable_grad():\n", + " global clamp_start_, clamp_max\n", + " x = x.detach()\n", + " x = x.requires_grad_()\n", + " x_in = model.decode_first_stage(x)\n", + " display_handler(x_in,t,1,False)\n", + " n = x_in.shape[0]\n", + " clip_guidance_scale = clip_guidance_index[t]\n", + " make_cutouts = {}\n", + " #rx_in_grad = torch.zeros_like(x_in)\n", + " for i in clip_list:\n", + " make_cutouts[i] = MakeCutouts(clip_size[i],\n", + " Overview= cut_overview[t], \n", + " InnerCrop = cut_innercut[t], \n", + " IC_Size_Pow=cut_ic_pow, IC_Grey_P = cut_icgray_p[t]\n", + " )\n", + " cutn = cut_overview[t]+cut_innercut[t]\n", + " for j in range(cutn_batches):\n", + " losses=0\n", + " for i in clip_list:\n", + " clip_in = clip_normalize[i](make_cutouts[i](x_in.add(1).div(2)).to(\"cuda\"))\n", + " image_embeds = clip_model[i].encode_image(clip_in).float().unsqueeze(0).expand([target_embeds[i].shape[0],-1,-1])\n", + " target_embeds_temp = target_embeds[i]\n", + " if i == 'ViT-B-32--openai' and experimental_aesthetic_embeddings:\n", + " aesthetic_embedding = torch.from_numpy(np.load(f'aesthetic-predictor/vit_b_32_embeddings/rating{experimental_aesthetic_embeddings_score}.npy')).to(device) \n", + " aesthetic_query = 
target_embeds_temp + aesthetic_embedding * experimental_aesthetic_embeddings_weight\n", + " target_embeds_temp = (aesthetic_query) / torch.linalg.norm(aesthetic_query)\n", + " if i == 'ViT-L-14--openai' and experimental_aesthetic_embeddings:\n", + " aesthetic_embedding = torch.from_numpy(np.load(f'aesthetic-predictor/vit_l_14_embeddings/rating{experimental_aesthetic_embeddings_score}.npy')).to(device) \n", + " aesthetic_query = target_embeds_temp + aesthetic_embedding * experimental_aesthetic_embeddings_weight\n", + " target_embeds_temp = (aesthetic_query) / torch.linalg.norm(aesthetic_query)\n", + " target_embeds_temp = target_embeds_temp.unsqueeze(1).expand([-1,cutn*n,-1]) \n", + " dists = spherical_dist_loss(image_embeds, target_embeds_temp)\n", + " dists = dists.mean(1).mul(weights[i].squeeze()).mean()\n", + " losses+=dists*clip_guidance_scale * (2 if i in [\"ViT-L-14-336--openai\", \"RN50x64--openai\", \"ViT-B-32--laion2b_e16\"] else (.4 if \"cloob\" in i else 1))\n", + " if i == \"ViT-L-14-336--openai\" and aes_scale !=0:\n", + " aes_loss = (aesthetic_model_336(F.normalize(image_embeds, dim=-1))).mean() \n", + " losses -= aes_loss * aes_scale \n", + " if i == \"ViT-L-14--openai\" and aes_scale !=0:\n", + " aes_loss = (aesthetic_model_224(F.normalize(image_embeds, dim=-1))).mean() \n", + " losses -= aes_loss * aes_scale \n", + " if i == \"ViT-B-16--openai\" and aes_scale !=0:\n", + " aes_loss = (aesthetic_model_16(F.normalize(image_embeds, dim=-1))).mean() \n", + " losses -= aes_loss * aes_scale \n", + " if i == \"ViT-B-32--openai\" and aes_scale !=0:\n", + " aes_loss = (aesthetic_model_32(F.normalize(image_embeds, dim=-1))).mean()\n", + " losses -= aes_loss * aes_scale\n", + " #x_in_grad += torch.autograd.grad(losses, x_in)[0] / cutn_batches / len(clip_list)\n", + " #losses += dists\n", + " #losses = losses / len(clip_list) \n", + " #gc.collect()\n", + " \n", + " tv_losses = tv_loss(x).sum() * tv_scales[0] +\\\n", + " tv_loss(F.interpolate(x, scale_factor= 
1/2)).sum()* tv_scales[1] + \\\n", + " tv_loss(F.interpolate(x, scale_factor = 1/4)).sum()* tv_scales[2] + \\\n", + " tv_loss(F.interpolate(x, scale_factor = 1/8)).sum()* tv_scales[3] \n", + " range_scale= range_index[t]\n", + " range_losses = range_loss(x_in,RGB_min,RGB_max).sum() * range_scale\n", + " loss = tv_losses + range_losses + losses\n", + " #del losses\n", + " if symmetric_loss_scale != 0: loss += symmetric_loss(x_in) * symmetric_loss_scale\n", + " if init_image is not None and init_scale:\n", + " lpips_loss = (lpips_model(x_in, init) * init_scale).squeeze().mean()\n", + " #print(lpips_loss)\n", + " loss += lpips_loss\n", + " #loss_grad = torch.autograd.grad(loss, x_in, )[0]\n", + " #x_in_grad += loss_grad\n", + " #grad = -torch.autograd.grad(x_in, x, x_in_grad)[0]\n", + " loss.backward()\n", + " grad = -x.grad\n", + " grad = torch.nan_to_num(grad, nan=0.0, posinf=0, neginf=0)\n", + " if grad_center: grad = centralized_grad(grad, use_gc=True, gc_conv_only=False)\n", + " mag = grad.square().mean().sqrt()\n", + " if mag==0 or torch.isnan(mag):\n", + " print(\"ERROR\")\n", + " print(t)\n", + " return(grad)\n", + " if t>=0:\n", + " if active_function == \"softsign\":\n", + " grad = F.softsign(grad*grad_scale/mag)\n", + " if active_function == \"tanh\":\n", + " grad = (grad/mag*grad_scale).tanh()\n", + " if active_function==\"clamp\":\n", + " grad = grad.clamp(-mag*grad_scale*2,mag*grad_scale*2)\n", + " if grad.abs().max()>0:\n", + " grad=grad/grad.abs().max()*opt.mag_mul\n", + " magnitude = grad.square().mean().sqrt()\n", + " else:\n", + " return(grad)\n", + " clamp_max = clamp_index[t]\n", + " #print(magnitude, end = \"\\r\")\n", + " grad = grad* magnitude.clamp(max= clamp_max) /magnitude#0.2\n", + " grad = grad.detach()\n", + " return grad\n", + "\n", + "def null_fn(x_in):\n", + " return(torch.zeros_like(x_in))\n", + "\n", + "def display_handler(x,i,cadance = 5, decode = True):\n", + " global progress, image_grid, writer, img_tensor, im\n", + " img_tensor 
= x\n", + " if i%cadance==0:\n", + " if decode: \n", + " x = model.decode_first_stage(x)\n", + " grid = make_grid(torch.clamp((x+1.0)/2.0, min=0.0, max=1.0),round(x.shape[0]**0.5))\n", + " grid = 255. * rearrange(grid, 'c h w -> h w c').detach().cpu().numpy()\n", + " image_grid = grid.copy(order = \"C\") \n", + " with io.BytesIO() as output:\n", + " im = Image.fromarray(grid.astype(np.uint8))\n", + " im.save(output, format = \"PNG\")\n", + " progress.value = output.getvalue()\n", + " if generate_video:\n", + " im.save(p.stdin, 'PNG')\n", + "\n", + "\n", + " \n", + "def cond_clamp(image,t): \n", + " #if t >=0:\n", + " #mag=image.square().mean().sqrt()\n", + " #mag = (mag*cc).clamp(1.6,100)\n", + " image = image.clamp(-cc, cc)\n", + " image = torch.nan_to_num(image, nan=0.0, posinf=cc, neginf=-cc)\n", + " return(image)\n", + "\n", + "def make_schedule(t_start, t_end, step_size=1):\n", + " schedule = []\n", + " par_schedule = []\n", + " t = t_start\n", + " while t > t_end:\n", + " schedule.append(t)\n", + " t -= step_size\n", + " schedule.append(t_end)\n", + " return np.array(schedule)\n", + "\n", + "lpips_model = lpips.LPIPS(net='vgg').to(device)\n", + "\n", + "def list_mul_to_array(list_mul):\n", + " i = 0\n", + " mul_count = 0\n", + " mul_string = ''\n", + " full_list = list_mul\n", + " full_list_len = len(full_list)\n", + " for item in full_list:\n", + " if(i == 0):\n", + " last_item = item\n", + " if(item == last_item):\n", + " mul_count+=1\n", + " if(item != last_item or full_list_len == i+1):\n", + " mul_string = mul_string + f' [{last_item}]*{mul_count} +'\n", + " mul_count=1\n", + " last_item = item\n", + " i+=1\n", + " return(mul_string[1:-2])\n", + "\n", + "def generate_settings_file(add_prompts=False, add_dimensions=False):\n", + " \n", + " if(add_prompts):\n", + " prompts = f'''\n", + " clip_prompts = {clip_prompts}\n", + " latent_prompts = {latent_prompts}\n", + " latent_negatives = {latent_negatives}\n", + " image_prompts = {image_prompts}\n", + " 
'''\n", + " else:\n", + " prompts = ''\n", + "\n", + " if(add_dimensions):\n", + " dimensions = f'''width = {width}\n", + " height = {height}\n", + " '''\n", + " else:\n", + " dimensions = ''\n", + " settings = f'''\n", + " #This settings file can be loaded back to Latent Majesty Diffusion. If you like your setting consider sharing it to the settings library at https://github.com/multimodalart/MajestyDiffusion\n", + " [clip_list]\n", + " perceptors = {clip_load_list}\n", + " \n", + " [basic_settings]\n", + " #Perceptor things\n", + " {prompts}\n", + " {dimensions}\n", + " latent_diffusion_guidance_scale = {latent_diffusion_guidance_scale}\n", + " clip_guidance_scale = {clip_guidance_scale}\n", + " aesthetic_loss_scale = {aesthetic_loss_scale}\n", + " augment_cuts={augment_cuts}\n", + "\n", + " #Init image settings\n", + " starting_timestep = {starting_timestep}\n", + " init_scale = {init_scale} \n", + " init_brightness = {init_brightness}\n", + " init_noise = {init_noise}\n", + "\n", + " [advanced_settings]\n", + " #Add CLIP Guidance and all the flavors or just run normal Latent Diffusion\n", + " use_cond_fn = {use_cond_fn}\n", + "\n", + " #Custom schedules for cuts. Check out the schedules documentation here\n", + " custom_schedule_setting = {custom_schedule_setting}\n", + "\n", + " #Cut settings\n", + " clamp_index = {list_mul_to_array(clamp_index)}\n", + " cut_overview = {list_mul_to_array(cut_overview)}\n", + " cut_innercut = {list_mul_to_array(cut_innercut)}\n", + " cut_ic_pow = {cut_ic_pow}\n", + " cut_icgray_p = {list_mul_to_array(cut_icgray_p)}\n", + " cutn_batches = {cutn_batches}\n", + " range_index = {list_mul_to_array(range_index)}\n", + " active_function = \"{active_function}\"\n", + " tv_scales = {list_mul_to_array(tv_scales)}\n", + " latent_tv_loss = {latent_tv_loss}\n", + "\n", + " #If you uncomment this line you can schedule the CLIP guidance across the steps. 
Otherwise the clip_guidance_scale will be used\n", + " clip_guidance_schedule = {list_mul_to_array(clip_guidance_index)}\n", + " \n", + " #Apply symmetric loss (force simmetry to your results)\n", + " symmetric_loss_scale = {symmetric_loss_scale} \n", + "\n", + " #Latent Diffusion Advanced Settings\n", + " #Use when latent upscale to correct satuation problem\n", + " scale_div = {scale_div}\n", + " #Magnify grad before clamping by how many times\n", + " opt_mag_mul = {opt_mag_mul}\n", + " opt_ddim_eta = {opt_ddim_eta}\n", + " opt_eta_end = {opt_eta_end}\n", + " opt_temperature = {opt_temperature}\n", + "\n", + " #Grad advanced settings\n", + " grad_center = {grad_center}\n", + " #Lower value result in more coherent and detailed result, higher value makes it focus on more dominent concept\n", + " grad_scale={grad_scale} \n", + "\n", + " #Init image advanced settings\n", + " init_rotate={init_rotate}\n", + " mask_rotate={mask_rotate}\n", + " init_magnitude = {init_magnitude}\n", + "\n", + " #More settings\n", + " RGB_min = {RGB_min}\n", + " RGB_max = {RGB_max}\n", + " #How to pad the image with cut_overview\n", + " padargs = {padargs} \n", + " flip_aug={flip_aug}\n", + " cc = {cc}\n", + " #Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", + " experimental_aesthetic_embeddings = {experimental_aesthetic_embeddings}\n", + " #How much you want this to influence your result\n", + " experimental_aesthetic_embeddings_weight = {experimental_aesthetic_embeddings_weight}\n", + " #9 are good aesthetic embeddings, 0 are bad ones\n", + " experimental_aesthetic_embeddings_score = {experimental_aesthetic_embeddings_score}\n", + " '''\n", + " return(settings)\n", + "\n", + "#Alstro's aesthetic model\n", + "aesthetic_model_336 = torch.nn.Linear(768,1).cuda()\n", + "aesthetic_model_336.load_state_dict(torch.load(f\"{model_path}/ava_vit_l_14_336_linear.pth\"))\n", + "\n", + "aesthetic_model_224 = torch.nn.Linear(768,1).cuda()\n", + 
"aesthetic_model_224.load_state_dict(torch.load(f\"{model_path}/ava_vit_l_14_linear.pth\"))\n", + "\n", + "aesthetic_model_16 = torch.nn.Linear(512,1).cuda()\n", + "aesthetic_model_16.load_state_dict(torch.load(f\"{model_path}/ava_vit_b_16_linear.pth\"))\n", + "\n", + "aesthetic_model_32 = torch.nn.Linear(512,1).cuda()\n", + "aesthetic_model_32.load_state_dict(torch.load(f\"{model_path}/sa_0_4_vit_b_32_linear.pth\"))\n", + "\n", + "from ldm.modules.diffusionmodules.util import noise_like\n", + "def do_run():\n", + " # with torch.cuda.amp.autocast():\n", + " global progress,target_embeds, weights, zero_embed, init, scale_factor\n", + " scale_factor = 1\n", + " make_cutouts = {}\n", + " for i in clip_list:\n", + " make_cutouts[i] = MakeCutouts(clip_size[i],Overview=1)\n", + " target_embeds, weights ,zero_embed = {}, {}, {}\n", + " for i in clip_list:\n", + " target_embeds[i] = []\n", + " weights[i]=[]\n", + "\n", + " for prompt in prompts:\n", + " txt, weight = parse_prompt(prompt)\n", + " for i in clip_list:\n", + " if \"cloob\" not in i:\n", + " with torch.cuda.amp.autocast():\n", + " embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device))\n", + " target_embeds[i].append(embeds)\n", + " weights[i].append(weight)\n", + " else:\n", + " embeds = clip_model[i].encode_text(clip_tokenize[i](txt).to(device))\n", + " target_embeds[i].append(embeds)\n", + " weights[i].append(weight)\n", + "\n", + " for prompt in image_prompts:\n", + " print(f\"processing{prompt}\",end=\"\\r\")\n", + " path, weight = parse_prompt(prompt)\n", + " img = Image.open(fetch(path)).convert('RGB')\n", + " img = TF.resize(img, min(opt.W, opt.H, *img.size), transforms.InterpolationMode.LANCZOS)\n", + " for i in clip_list:\n", + " if \"cloob\" not in i:\n", + " with torch.cuda.amp.autocast():\n", + " batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device))\n", + " embed = clip_model[i].encode_image(clip_normalize[i](batch))\n", + " target_embeds[i].append(embed)\n", + " 
weights[i].extend([weight])\n", + " else:\n", + " batch = make_cutouts[i](TF.to_tensor(img).unsqueeze(0).to(device))\n", + " embed = clip_model[i].encode_image(clip_normalize[i](batch))\n", + " target_embeds[i].append(embed)\n", + " weights[i].extend([weight])\n", + " if anti_jpg != 0:\n", + " target_embeds[\"ViT-B-32--openai\"].append(torch.tensor([np.load(f\"{model_path}/openimages_512x_png_embed224.npz\")['arr_0']-np.load(f\"{model_path}/imagenet_512x_jpg_embed224.npz\")['arr_0']], device = device))\n", + " weights[\"ViT-B-32--openai\"].append(anti_jpg)\n", + "\n", + " for i in clip_list:\n", + " target_embeds[i] = torch.cat(target_embeds[i])\n", + " weights[i] = torch.tensor([weights[i]], device=device)\n", + " shape = [4, opt.H//8, opt.W//8]\n", + " init = None\n", + " mask = None\n", + " transform = T.GaussianBlur(kernel_size=3, sigma=0.4)\n", + " if init_image is not None:\n", + " init = Image.open(fetch(init_image)).convert('RGB')\n", + " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", + " if init_rotate: init = torch.rot90(init, 1, [3,2]) \n", + " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W])\n", + " init = init.mul(2).sub(1).half()\n", + " init_encoded = model.first_stage_model.encode(init).sample()* init_magnitude + init_brightness\n", + " init_encoded = init_encoded + noise_like(init_encoded.shape,device,False).mul(init_noise)\n", + " else:\n", + " init = None\n", + " init_encoded = None\n", + " if init_mask is not None:\n", + " mask = Image.open(fetch(init_mask)).convert('RGB')\n", + " mask = TF.to_tensor(mask).to(device).unsqueeze(0)\n", + " if mask_rotate: mask = torch.rot90(init, 1, [3,2]) \n", + " mask = resize(mask,out_shape = [opt.n_samples,1,opt.H//8, opt.W//8])\n", + " mask = transform(mask)\n", + " print(mask)\n", + "\n", + "\n", + " progress = widgets.Image(layout = widgets.Layout(max_width = \"400px\",max_height = \"512px\"))\n", + " display.display(progress)\n", + "\n", + " if opt.plms:\n", + " sampler = 
PLMSSampler(model)\n", + " else:\n", + " sampler = DDIMSampler(model)\n", + "\n", + " os.makedirs(opt.outdir, exist_ok=True)\n", + " outpath = opt.outdir\n", + "\n", + " prompt = opt.prompt\n", + " sample_path = os.path.join(outpath, \"samples\")\n", + " os.makedirs(sample_path, exist_ok=True)\n", + " base_count = len(os.listdir(sample_path))\n", + "\n", + " all_samples=list()\n", + " last_step_upscale = False\n", + " with torch.enable_grad():\n", + " with torch.cuda.amp.autocast():\n", + " with model.ema_scope():\n", + " uc = None\n", + " if opt.scale != 1.0:\n", + " uc = model.get_learned_conditioning(opt.n_samples * opt.uc).cuda()\n", + " \n", + " for n in trange(opt.n_iter, desc=\"Sampling\"):\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " c = model.get_learned_conditioning(opt.n_samples * prompt).cuda()\n", + " if init_encoded is None:\n", + " x_T = torch.randn([opt.n_samples,*shape], device=device)\n", + " else:\n", + " x_T = init_encoded\n", + " last_step_uspcale_list = []\n", + " \n", + " for custom_schedule in custom_schedules:\n", + " if type(custom_schedule) != type(\"\"):\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " last_step_upscale = False\n", + " samples_ddim, _ = sampler.sample(S=opt.ddim_steps,\n", + " conditioning=c,\n", + " batch_size=opt.n_samples,\n", + " shape=shape,\n", + " custom_schedule = custom_schedule,\n", + " verbose=False,\n", + " unconditional_guidance_scale=opt.scale,\n", + " unconditional_conditioning=uc,\n", + " eta=opt.ddim_eta,\n", + " eta_end = opt.eta_end,\n", + " img_callback=None if use_cond_fn else display_handler,\n", + " cond_fn=cond_fn, #if use_cond_fn else None,\n", + " temperature = opt.temperature,\n", + " x_adjust_fn=cond_clamp,\n", + " x_T = x_T,\n", + " x0=x_T,\n", + " mask=mask\n", + " )\n", + " x_T = samples_ddim.clamp(-6,6)\n", + " else:\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " method, scale_factor = custom_schedule.split(\":\")\n", + " scale_factor = 
float(scale_factor)\n", + " #clamp_index = np.array(clamp_index) * scale_factor\n", + " if method == \"latent\":\n", + " x_T = resize(samples_ddim, scale_factors=scale_factor, antialiasing=True)*scale_div\n", + " x_T += noise_like(x_T.shape,device,False)*init_noise\n", + " if method == \"gfpgan\":\n", + " last_step_upscale = True\n", + " temp_file_name = \"temp_\"+f\"{str(round(time.time()))}.png\"\n", + " temp_file = os.path.join(sample_path, temp_file_name)\n", + " im.save(temp_file, format = \"PNG\")\n", + " GFP_factor = 2 if scale_factor > 1 else 1\n", + " GFP_ver = 1.3 #if GFP_factor == 1 else 1.2\n", + " %cd GFPGAN\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " !python inference_gfpgan.py -i $temp_file -o results -v $GFP_ver -s $GFP_factor\n", + " %cd ..\n", + " face_corrected = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\"))\n", + " with io.BytesIO() as output:\n", + " face_corrected.save(output,format=\"PNG\")\n", + " progress.value = output.getvalue()\n", + " init = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\")).convert('RGB')\n", + " init = TF.to_tensor(init).to(device).unsqueeze(0)\n", + " opt.H, opt.W = opt.H*scale_factor, opt.W*scale_factor\n", + " init = resize(init,out_shape = [opt.n_samples,3,opt.H, opt.W], antialiasing=True)\n", + " init = init.mul(2).sub(1).half()\n", + " x_T = (model.first_stage_model.encode(init).sample()*init_magnitude)\n", + " x_T += noise_like(x_T.shape,device,False)*init_noise\n", + " x_T = x_T.clamp(-6,6)\n", + "\n", + " #last_step_uspcale_list.append(last_step_upscale)\n", + " scale_factor = 1\n", + " current_time = str(round(time.time()))\n", + " if(last_step_upscale):\n", + " latest_upscale = Image.open(fetch(f\"GFPGAN/results/restored_imgs/{temp_file_name}\")).convert('RGB')\n", + " latest_upscale.save(os.path.join(outpath, f'{current_time}.png'), format = \"PNG\")\n", + " else:\n", + " Image.fromarray(image_grid.astype(np.uint8)).save(os.path.join(outpath, 
f'{current_time}.png'), format = \"PNG\")\n", + " settings = generate_settings_file(add_prompts=True, add_dimensions=False)\n", + " text_file = open(f\"{outpath}/{current_time}.cfg\", \"w\")\n", + " text_file.write(settings)\n", + " text_file.close()\n", + " x_samples_ddim = model.decode_first_stage(samples_ddim)\n", + " x_samples_ddim = torch.clamp((x_samples_ddim+1.0)/2.0, min=0.0, max=1.0)\n", + " all_samples.append(x_samples_ddim)\n", + "\n", + "\n", + " if(len(all_samples) > 1):\n", + " # additionally, save as grid\n", + " grid = torch.stack(all_samples, 0)\n", + " grid = rearrange(grid, 'n b c h w -> (n b) c h w')\n", + " grid = make_grid(grid, nrow=opt.n_samples)\n", + "\n", + " # to image\n", + " grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()\n", + " Image.fromarray(grid.astype(np.uint8)).save(os.path.join(outpath, f'grid_{str(round(time.time()))}.png'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ILHGCEla2Rrm" + }, + "source": [ + "# Run!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VpR9JhyCu5iq" + }, + "source": [ + "#### Perceptors (Choose your CLIP and CLIP-like models) \n", + "Be careful: if you don't pay for Colab Pro, selecting more CLIPs might make you run out of memory. 
If you do have Pro, try adding ViT-L14 to your mix." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "8K7l_E2JvLWC" + }, + "outputs": [], + "source": [ + "#@title Choose your perceptor models\n", + "\n", + "# suppress mmc warmup outputs\n", + "import mmc.loaders\n", + "clip_load_list = []\n", + "#@markdown #### OpenAI CLIP models\n", + "ViT_B32 = False #@param {type:\"boolean\"}\n", + "ViT_B16 = True #@param {type:\"boolean\"}\n", + "ViT_L14 = False #@param {type:\"boolean\"}\n", + "ViT_L14_336px = False #@param {type:\"boolean\"}\n", + "#RN101 = False #@param {type:\"boolean\"}\n", + "#RN50 = False #@param {type:\"boolean\"}\n", + "RN50x4 = False #@param {type:\"boolean\"}\n", + "RN50x16 = False #@param {type:\"boolean\"}\n", + "RN50x64 = False #@param {type:\"boolean\"}\n", + "\n", + "#@markdown #### OpenCLIP models\n", + "ViT_B16_plus = False #@param {type: \"boolean\"}\n", + "ViT_B32_laion2b = True #@param {type: \"boolean\"}\n", + "\n", + "#@markdown #### Multilingual CLIP models \n", + "clip_farsi = False #@param {type: \"boolean\"}\n", + "clip_korean = False #@param {type: \"boolean\"}\n", + "\n", + "#@markdown #### CLOOB models\n", + "cloob_ViT_B16 = False #@param {type: \"boolean\"}\n", + "\n", + "# @markdown Load even more CLIP and CLIP-like models (from [Multi-Modal-Comparators](https://github.com/dmarx/Multi-Modal-Comparators))\n", + "model1 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - 
ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "model2 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - 
ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "model3 = \"\" # @param [\"[clip - openai - RN50]\",\"[clip - openai - RN101]\",\"[clip - mlfoundations - RN50--yfcc15m]\",\"[clip - mlfoundations - RN50--cc12m]\",\"[clip - mlfoundations - RN50-quickgelu--yfcc15m]\",\"[clip - mlfoundations - RN50-quickgelu--cc12m]\",\"[clip - mlfoundations - RN101--yfcc15m]\",\"[clip - mlfoundations - RN101-quickgelu--yfcc15m]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_e32]\",\"[clip - mlfoundations - ViT-B-32-quickgelu--laion400m_avg]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e31]\",\"[clip - mlfoundations - ViT-B-16--laion400m_e32]\",\"[clip - sbert - 
ViT-B-32-multilingual-v1]\",\"[clip - facebookresearch - clip_small_25ep]\",\"[simclr - facebookresearch - simclr_small_25ep]\",\"[slip - facebookresearch - slip_small_25ep]\",\"[slip - facebookresearch - slip_small_50ep]\",\"[slip - facebookresearch - slip_small_100ep]\",\"[clip - facebookresearch - clip_base_25ep]\",\"[simclr - facebookresearch - simclr_base_25ep]\",\"[slip - facebookresearch - slip_base_25ep]\",\"[slip - facebookresearch - slip_base_50ep]\",\"[slip - facebookresearch - slip_base_100ep]\",\"[clip - facebookresearch - clip_large_25ep]\",\"[simclr - facebookresearch - simclr_large_25ep]\",\"[slip - facebookresearch - slip_large_25ep]\",\"[slip - facebookresearch - slip_large_50ep]\",\"[slip - facebookresearch - slip_large_100ep]\",\"[clip - facebookresearch - clip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc3m_40ep]\",\"[slip - facebookresearch - slip_base_cc12m_35ep]\",\"[clip - facebookresearch - clip_base_cc12m_35ep]\"] {allow-input: true}\n", + "\n", + "if ViT_B32: \n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-B-32--openai]\")\n", + "if ViT_B16: \n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-B-16--openai]\")\n", + "if ViT_L14: \n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-L-14--openai]\")\n", + "if RN50x4: \n", + " clip_load_list.append(\"[clip - mlfoundations - RN50x4--openai]\")\n", + "if RN50x64: \n", + " clip_load_list.append(\"[clip - mlfoundations - RN50x64--openai]\")\n", + "if RN50x16: \n", + " clip_load_list.append(\"[clip - mlfoundations - RN50x16--openai]\")\n", + "if ViT_L14_336px:\n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-L-14-336--openai]\")\n", + "if ViT_B16_plus:\n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-B-16-plus-240--laion400m_e32]\")\n", + "if ViT_B32_laion2b:\n", + " clip_load_list.append(\"[clip - mlfoundations - ViT-B-32--laion2b_e16]\")\n", + "if clip_farsi:\n", + " clip_load_list.append(\"[clip - sajjjadayobi - clipfa]\")\n", + 
"if clip_korean:\n", + " clip_load_list.append(\"[clip - navervision - kelip_ViT-B/32]\")\n", + "if cloob_ViT_B16:\n", + " clip_load_list.append(\"[cloob - crowsonkb - cloob_laion_400m_vit_b_16_32_epochs]\")\n", + "\n", + "if model1:\n", + " clip_load_list.append(model1)\n", + "if model2:\n", + " clip_load_list.append(model2)\n", + "if model3:\n", + " clip_load_list.append(model3)\n", + "\n", + "\n", + "i = 0\n", + "from mmc.multimmc import MultiMMC\n", + "from mmc.modalities import TEXT, IMAGE\n", + "temp_perceptor = MultiMMC(TEXT, IMAGE)\n", + "\n", + "def get_mmc_models(clip_load_list):\n", + " mmc_models = []\n", + " for model_key in clip_load_list:\n", + " if not model_key:\n", + " continue\n", + " arch, pub, m_id = model_key[1:-1].split(' - ')\n", + " mmc_models.append({\n", + " 'architecture':arch,\n", + " 'publisher':pub,\n", + " 'id':m_id,\n", + " })\n", + " return mmc_models\n", + "mmc_models = get_mmc_models(clip_load_list)\n", + "\n", + "import mmc\n", + "from mmc.registry import REGISTRY\n", + "import mmc.loaders # force trigger model registrations\n", + "from mmc.mock.openai import MockOpenaiClip\n", + "\n", + "normalize = transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],\n", + " std=[0.26862954, 0.26130258, 0.27577711])\n", + "\n", + "\n", + "def load_clip_models(mmc_models):\n", + " clip_model, clip_size, clip_tokenize, clip_normalize= {},{},{},{}\n", + " clip_list = []\n", + " for item in mmc_models:\n", + " print(\"Loaded \", item[\"id\"])\n", + " clip_list.append(item[\"id\"])\n", + " model_loaders = REGISTRY.find(**item)\n", + " for model_loader in model_loaders:\n", + " clip_model_loaded = model_loader.load()\n", + " clip_model[item[\"id\"]] = MockOpenaiClip(clip_model_loaded)\n", + " clip_size[item[\"id\"]] = clip_model[item[\"id\"]].visual.input_resolution\n", + " clip_tokenize[item[\"id\"]] = clip_model[item[\"id\"]].preprocess_text()\n", + " if(item[\"architecture\"] == 'cloob'):\n", + " clip_normalize[item[\"id\"]] = 
clip_model[item[\"id\"]].normalize\n", + " else:\n", + " clip_normalize[item[\"id\"]] = normalize\n", + " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", + "\n", + "\n", + "def full_clip_load(clip_load_list):\n", + " torch.cuda.empty_cache()\n", + " gc.collect()\n", + " try:\n", + " del clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", + " except:\n", + " pass\n", + " mmc_models = get_mmc_models(clip_load_list)\n", + " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = load_clip_models(mmc_models)\n", + " return clip_model, clip_size, clip_tokenize, clip_normalize, clip_list\n", + "\n", + "clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", + "\n", + "torch.cuda.empty_cache()\n", + "gc.collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N_Di3xFSXGWe" + }, + "source": [ + "#### Advanced settings for the generation\n", + "##### Access [our guide](https://multimodal.art/majesty-diffusion) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pAALegoCXEbm" + }, + "outputs": [], + "source": [ + "opt = DotMap()\n", + "\n", + "#Change it to false to not use CLIP Guidance at all \n", + "use_cond_fn = True\n", + "\n", + "#Custom cut schedules and super-resolution. 
Check out the guide on how to use it at https://multimodal.art/majestydiffusion\n", + "custom_schedule_setting = [\n", + " [200,1000,8],\n", + " [50,200,5],\n", + " #\"gfpgan:1.5\",\n", + " #[50,200,5],\n", + "]\n", + " \n", + "#Cut settings\n", + "clamp_index = [1]*1000 \n", + "cut_overview = [8]*500 + [4]*500\n", + "cut_innercut = [0]*500 + [4]*500\n", + "cut_ic_pow = .1\n", + "cut_icgray_p = [.1]*300+[0]*1000\n", + "cutn_batches = 1\n", + "range_index = [0]*300 + [0]*1000 \n", + "active_function = \"softsign\" # function to manipulate the gradient - help things to stabilize\n", + "tv_scales = [1000]*1+[600]*3\n", + "latent_tv_loss = True #Applies the TV Loss in the Latent space instead of pixel, improves generation quality\n", + "\n", + "#If you uncomment next line you can schedule the CLIP guidance across the steps. Otherwise the clip_guidance_scale basic setting will be used\n", + "#clip_guidance_schedule = [10000]*300 + [500]*700\n", + "\n", + "symmetric_loss_scale = 0 #Apply symmetric loss\n", + "\n", + "#Latent Diffusion Advanced Settings\n", + "scale_div = 0.5 # Use when latent upscale to correct saturation problem\n", + "opt_mag_mul = 10 #Magnify grad before clamping\n", + "#PLMS Currently not working, working on a fix\n", + "#opt.plms = False #Won't work with clip guidance\n", + "opt_ddim_eta, opt_eta_end = [1.4,1] # linear variation of eta\n", + "opt_temperature = .975 \n", + "\n", + "#Grad advanced settings\n", + "grad_center = False\n", + "grad_scale= 0.5 #5 Lower value result in more coherent and detailed result, higher value makes it focus on more dominant concept\n", + "anti_jpg = 0 #not working\n", + "\n", + "#Init image advanced settings\n", + "init_rotate, mask_rotate=[False, False]\n", + "init_magnitude = 0.15\n", + "\n", + "#More settings\n", + "RGB_min, RGB_max = [-0.95,0.95]\n", + "padargs = {\"mode\":\"constant\", \"value\": -1} #How to pad the image with cut_overview\n", + "flip_aug=False\n", + "cc = 60\n", + "cutout_debug = False\n", + 
"opt.outdir = outputs_path\n", + "\n", + "#Experimental aesthetic embeddings, work only with OpenAI ViT-B/32 and ViT-L/14\n", + "experimental_aesthetic_embeddings = False\n", + "#How much you want this to influence your result\n", + "experimental_aesthetic_embeddings_weight = 0.5\n", + "#9 are good aesthetic embeddings, 0 are bad ones\n", + "experimental_aesthetic_embeddings_score = 9" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZUu_pyTkuxiT" + }, + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wo1tM270ryit" + }, + "source": [ + "### Prompts\n", + "The main prompt is the CLIP prompt. The Latent Prompts usually help with style and composition; you can turn them off by setting `latent_diffusion_guidance_scale=0` " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rRIC0eQervDN" + }, + "outputs": [], + "source": [ + "#Amp up your prompt game with prompt engineering, check out this guide: https://matthewmcateer.me/blog/clip-prompt-engineering/\n", + "#Prompt for CLIP Guidance\n", + "clip_prompts = [\"portrait of a princess in sanctuary, hyperrealistic painting trending on artstation\"]\n", + "\n", + "#Prompt for Latent Diffusion\n", + "latent_prompts = [\"portrait of a princess in sanctuary, hyperrealistic painting trending on artstation\"]\n", + "\n", + "#Negative prompts for Latent Diffusion\n", + "latent_negatives = [\"low quality image\"]\n", + "\n", + "image_prompts = []" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iv8-gEvUsADL" + }, + "source": [ + "### Diffuse!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "fmafGmcyT1mZ" + }, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "#@markdown ### Basic settings \n", + "#@markdown We're still figuring out default settings. 
Experiment and share your settings with us\n", + "width = 256#@param{type: 'integer'}\n", + "height = 256#@param{type: 'integer'}\n", + "latent_diffusion_guidance_scale = 2 #@param {type:\"number\"}\n", + "clip_guidance_scale = 5000 #@param{type: 'integer'}\n", + "how_many_batches = 1 #@param{type: 'integer'}\n", + "aesthetic_loss_scale = 200 #@param{type: 'integer'}\n", + "augment_cuts=True #@param{type:'boolean'}\n", + "\n", + "#@markdown\n", + "\n", + "#@markdown ### Init image settings\n", + "#@markdown `init_image` requires the path of an image to use as init to the model\n", + "init_image = None #@param{type: 'string'}\n", + "if(init_image == '' or init_image == 'None'):\n", + " init_image = None\n", + "#@markdown `starting_timestep`: How much noise do you want to add to your init image for it to then be diffused by the model\n", + "starting_timestep = 0.9 #@param{type: 'number'}\n", + "#@markdown `init_mask` is a mask with the same width and height as the original image with the color black indicating where to inpaint\n", + "init_mask = None #@param{type: 'string'}\n", + "#@markdown `init_scale` controls how much the init image should influence the final result. Experiment with values around `1000`\n", + "init_scale = 1000 #@param{type: 'integer'}\n", + "init_brightness = 0.0 #@param{type: 'number'}\n", + "#@markdown How much extra noise to add to the init image, independently from skipping timesteps (use it also if you are upscaling)\n", + "init_noise = 0.6 #@param{type: 'number'}\n", + "\n", + "#@markdown\n", + "\n", + "#@markdown ### Custom saved settings\n", + "#@markdown If you choose custom saved settings, the settings set by the preset overrule some of your choices. You can still modify the settings not in the preset. 
Check what each preset modifies here\n", + "custom_settings = 'path/to/settings.cfg' #@param{type:'string'}\n", + "settings_library = 'None (use settings defined above)' #@param [\"None (use settings defined above)\", \"default (optimized for colab free)\", \"dango233_princesses\", \"the_other_zippy_defaults\", \"makeitrad_defaults\"]\n", + "if(settings_library != 'None (use settings defined above)'):\n", + " if(settings_library == 'default (optimized for colab free)'):\n", + " custom_settings = f'majesty-diffusion/latent_settings_library/default.cfg'\n", + " else:\n", + " custom_settings = f'majesty-diffusion/latent_settings_library/{settings_library}.cfg'\n", + "\n", + "global_var_scope = globals()\n", + "if(custom_settings is not None and custom_settings != '' and custom_settings != 'path/to/settings.cfg'):\n", + " print('Loaded ', custom_settings)\n", + " try:\n", + " from configparser import ConfigParser\n", + " except ImportError:\n", + " from ConfigParser import ConfigParser\n", + " import configparser\n", + " \n", + " config = ConfigParser()\n", + " config.read(custom_settings)\n", + " #custom_settings_stream = fetch(custom_settings)\n", + " #Load CLIP models from config\n", + " if(config.has_section('clip_list')):\n", + " clip_incoming_list = config.items('clip_list')\n", + " clip_incoming_models = clip_incoming_list[0]\n", + " incoming_perceptors = eval(clip_incoming_models[1])\n", + " if((len(incoming_perceptors) != len(clip_load_list)) or not all(elem in incoming_perceptors for elem in clip_load_list)):\n", + " clip_load_list = incoming_perceptors\n", + " clip_model, clip_size, clip_tokenize, clip_normalize, clip_list = full_clip_load(clip_load_list)\n", + "\n", + " #Load settings from config and replace variables\n", + " if(config.has_section('basic_settings')):\n", + " basic_settings = config.items('basic_settings')\n", + " for basic_setting in basic_settings:\n", + " global_var_scope[basic_setting[0]] = eval(basic_setting[1])\n", + " \n", + " 
if(config.has_section('advanced_settings')):\n", + " advanced_settings = config.items('advanced_settings')\n", + " for advanced_setting in advanced_settings:\n", + " global_var_scope[advanced_setting[0]] = eval(advanced_setting[1])\n", + "\n", + "if(((init_image is not None) and (init_image != 'None') and (init_image != '')) and starting_timestep != 1 and custom_schedule_setting[0][1] == 1000):\n", + " custom_schedule_setting[0] = [custom_schedule_setting[0][0], int(custom_schedule_setting[0][1]*starting_timestep), custom_schedule_setting[0][2]]\n", + "\n", + "prompts = clip_prompts\n", + "opt.prompt = latent_prompts\n", + "opt.uc = latent_negatives\n", + "custom_schedules = set_custom_schedules(custom_schedule_setting)\n", + "aes_scale = aesthetic_loss_scale\n", + "try: \n", + " clip_guidance_schedule\n", + " clip_guidance_index = clip_guidance_schedule\n", + "except:\n", + " clip_guidance_index = [clip_guidance_scale]*1000\n", + "\n", + "opt.W = (width//64)*64;\n", + "opt.H = (height//64)*64;\n", + "if opt.W != width or opt.H != height:\n", + " print(f'Changing output size to {opt.W}x{opt.H}. 
Dimensions must by multiples of 64.')\n", + "\n", + "opt.mag_mul = opt_mag_mul \n", + "opt.ddim_eta = opt_ddim_eta\n", + "opt.eta_end = opt_eta_end\n", + "opt.temperature = opt_temperature\n", + "opt.n_iter = how_many_batches\n", + "opt.n_samples = 1\n", + "#opt.W, opt.H = [width,height]\n", + "opt.scale = latent_diffusion_guidance_scale\n", + "aug = augment_cuts\n", + "\n", + "torch.cuda.empty_cache()\n", + "gc.collect()\n", + "generate_video = False\n", + "if generate_video: \n", + " fps = 24\n", + " p = Popen(['ffmpeg', '-y', '-f', 'image2pipe', '-vcodec', 'png', '-r', str(fps), '-i', '-', '-vcodec', 'libx264', '-r', str(fps), '-pix_fmt', 'yuv420p', '-crf', '17', '-preset', 'veryslow', 'video.mp4'], stdin=PIPE)\n", + "do_run()\n", + "if generate_video: \n", + " p.stdin.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4cvUzcO9FeMT" + }, + "source": [ + "### Save your own settings\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "LGLUCX_UGqka" + }, + "outputs": [], + "source": [ + "\n", + "#@markdown ### Save current settings\n", + "#@markdown If you would like to save your current settings, uncheck `skip_saving` and run this cell. You will get a `custom_settings.cfg` file you can reuse and share. 
If you like your results, send us a pull request to add your settings to the selectable library\n", + "skip_saving = True #@param{type:'boolean'}\n", + "if(not skip_saving):\n", + " data = generate_settings_file(add_prompts=False, add_dimensions=True)\n", + " text_file = open(\"custom_settings.cfg\", \"w\")\n", + " text_file.write(data)\n", + " text_file.close()\n", + " from google.colab import files\n", + " files.download('custom_settings.cfg')\n", + " print(\"Downloaded as custom_settings.cfg\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Fzd-2mVMWHV0" + }, + "source": [ + "### Biases acknowledgment\n", + "Despite how impressive being able to turn text into images is, be aware that this model may output content that reinforces or exacerbates societal biases. According to the Latent Diffusion paper: \\\"Deep learning modules tend to reproduce or exacerbate biases that are already present in the data\\\". \n", + "\n", + "The model was trained on an unfiltered version of the LAION-400M dataset, which scraped non-curated image-text-pairs from the internet (the exception being the removal of illegal content) and is meant to be used for research purposes, such as this one. You can read more on LAION's website" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [ + "xEVSOJ4f0B21", + "VpR9JhyCu5iq", + "N_Di3xFSXGWe", + "xEVSOJ4f0B21" + ], + "machine_shape": "hm", + "name": "Latent Majesty Diffusion v1.3", + "private_outputs": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}