diff --git a/.github/workflows/docs-file-copy.yml b/.github/workflows/docs-file-copy.yml new file mode 100644 index 0000000..eb432eb --- /dev/null +++ b/.github/workflows/docs-file-copy.yml @@ -0,0 +1,35 @@ +name: Copy workload documentation to public docs repo +# We rsync the ai-workloads documentation to a temp clone of the public docs repo +# and commit and push the changes to the main branch of the public docs repo. Purpose is to keep the Docs repo (consolidated SiloGen docs) updated with ai-workloads repository changes. + +on: + push: + branches: + - main + paths: + - "docs/**" + - "workloads/**" + - ".github/workflows/docs-file-copy.yml" + +jobs: + copy-docs: + if: github.repository == 'silogen/ai-workloads' + runs-on: ubuntu-latest + steps: + - name: Checkout core repo + uses: actions/checkout@v4 + + - name: Push to public docs repo + run: | + git config --global user.name 'GitHub Actions' + git config --global user.email 'actions@github.com' + git clone https://x-access-token:${{ secrets.DOCS_REPO_TOKEN }}@github.com/silogen/ai-workloads.git source_docs + git clone https://x-access-token:${{ secrets.DOCS_REPO_TOKEN }}@github.com/silogen/docs.git target_silogen_docs + cd target_silogen_docs + rsync -av --delete --exclude='.git' ../source_docs/docs docs/ai-workloads-docs + rsync -av --delete --exclude='.git' ../source_docs/workloads docs/ai-workloads-manifests + git add . + git diff --staged --quiet || git commit -m "Update external docs from ai-workloads repo" + git push origin main + env: + DOCS_REPO_TOKEN: ${{ secrets.DOCS_REPO_TOKEN }} diff --git a/.gitignore b/.gitignore index ed7ea17..028313d 100644 --- a/.gitignore +++ b/.gitignore @@ -82,6 +82,9 @@ target/ profile_default/ ipython_config.py +# MacOS stuff +.DS_Store + # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 384fea5..39675cd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: hooks: - id: check-json - id: check-yaml - exclude: templates|mkdocs.yml + exclude: templates|mkdocs.yml|vlm-lora-finetune - id: end-of-file-fixer - id: requirements-txt-fixer - id: trailing-whitespace @@ -20,20 +20,20 @@ repos: args: ["--config=pyproject.toml"] - repo: https://github.com/pycqa/flake8 - rev: 7.1.2 + rev: 7.2.0 hooks: - id: flake8 args: ["--config=.flake8"] - repo: https://github.com/pycqa/isort - rev: 6.0.0 + rev: 6.0.1 hooks: - id: isort name: isort (python) args: ["--settings-path=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.15.0 + rev: v1.16.0 hooks: - id: mypy args: ["--config-file=pyproject.toml"] @@ -43,7 +43,7 @@ repos: - types-PyYAML - repo: https://github.com/gruntwork-io/pre-commit - rev: v0.1.26 + rev: v0.1.29 hooks: - id: helmlint exclude: kaiwo|llm-finetune-silogen-engine diff --git a/docker/lifescience/reinvent4/Dockerfile b/docker/lifescience/reinvent4/Dockerfile new file mode 100644 index 0000000..4bb8d89 --- /dev/null +++ b/docker/lifescience/reinvent4/Dockerfile @@ -0,0 +1,28 @@ +FROM rocm/pytorch:rocm7.0_ubuntu24.04_py3.12_pytorch_release_2.5.1 + +# Use bash to support string substitution. 
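+# -o pipefail propagates a failure from any command in a pipeline, so RUN steps that pipe output fail instead of silently succeeding.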
+SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +# Clone the Reinvet4 repository and use the stable 4.6.22 version +RUN git clone https://github.com/MolecularAI/REINVENT4 +WORKDIR /REINVENT4 +RUN git checkout v4.6.22 +RUN wget -O priors/reinvent.prior "https://zenodo.org/records/15641297/files/reinvent.prior?download=1" + +# Remove torch and torchvision from pyproject.toml +RUN sed -i.bak '/torch==/d' pyproject.toml && \ + sed -i.bak '/torchvision /d' pyproject.toml + +# Now run the install script as usual +RUN python install.py cpu + +COPY demo_notebooks/ notebooks/ + +# Download the chemprop model +RUN mkdir chemprop +RUN wget -q --show-progress -O chemprop/model.pt "https://www.dropbox.com/scl/fi/zpnqc9at5a5dnkzfdbo6g/model.pt?rlkey=g005yli9364uptd94d60jtg5c&e=1&dl=1" + +# Copy entrypoint script +COPY entrypoint.sh / + +CMD ["/bin/bash"] diff --git a/docker/lifescience/reinvent4/README.md b/docker/lifescience/reinvent4/README.md new file mode 100644 index 0000000..283ae98 --- /dev/null +++ b/docker/lifescience/reinvent4/README.md @@ -0,0 +1,31 @@ +# Running Reinvent inference interactively + +Connect to the pod with your favorite terminal. + +This repo provides an altered version of these notebooks to be runnable from the terminal with the subscript `_clean`. These can simply be run by: + +```sh +python3 notebooks/ +``` +Alternatively, Reinvent jobs can be run by: +```sh +reinvent -l +``` + +## Running inference job automatically (non-interactive) + +In order to run Reinvent jobs automatically using the above image do the following: +- Set up config and output directory: + +Put your config files as well any other files needed such as datasets or priors in `CONFIG_PATH`. In `OUTPUT_PATH`, the job will write output logs. +```sh +export CONFIG_PATH= +export OUTPUT_PATH= +``` + +Then, the following command will run the job: + +```sh +docker run --rm -v $CONFIG_PATH:/data -v $OUTPUT_PATH:/output --device=/dev/kfd --device=/dev/dri/renderD rocm-reinvent /data/.toml /output/ +``` +where the last two arguments provide paths to the config file to run as well where to save outputs. diff --git a/docker/lifescience/reinvent4/demo_notebooks/Reinvent_TLRL_clean.py b/docker/lifescience/reinvent4/demo_notebooks/Reinvent_TLRL_clean.py new file mode 100644 index 0000000..aba73f8 --- /dev/null +++ b/docker/lifescience/reinvent4/demo_notebooks/Reinvent_TLRL_clean.py @@ -0,0 +1,223 @@ +# This script is based on the file notebooks/Reinvent_TLRL.py: +# https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_TLRL.py + +import os +import re +import shutil +import subprocess + +import pandas as pd + + +def main(): + wd = "R4_TLRL_output" + + # Delete existing working directory and create a new one + if not os.path.isdir(wd): + shutil.rmtree(wd, ignore_errors=True) + os.mkdir(wd) + os.chdir(wd) + + # Write config file + prior_filename = "../priors/reinvent.prior" + agent_filename = prior_filename + stage1_checkpoint = "stage1.chkpt" + stage1_parameters = f""" + run_type = "staged_learning" + device = "cuda:0" + tb_logdir = "tb_stage1" + json_out_config = "_stage1.json" + + [parameters] + prior_file = "{prior_filename}" + agent_file = "{agent_filename}" + summary_csv_prefix = "stage1" + batch_size = 100 + use_checkpoint = false + sample_strategy = "beamsearch" #Additional interesting param? 
+ + [learning_strategy] + type = "dap" + sigma = 128 + rate = 0.0001 + + [[stage]] + max_score = 1.0 + max_steps = 5 + chkpt_file = "{stage1_checkpoint}" + [stage.scoring] + type = "geometric_mean" + [[stage.scoring.component]] + [stage.scoring.component.custom_alerts] + [[stage.scoring.component.custom_alerts.endpoint]] + name = "Alerts" + params.smarts = [ "[*;r8]", "[*;r9]", "[*;r10]", "[*;r11]", "[*;r12]", "[*;r13]", "[*;r14]", "[*;r15]", "[*;r16]", "[*;r17]", "[#8][#8]", "[#6;+]", "[#16][#16]", "[#7;!n][S;!$(S(=O)=O)]", "[#7;!n][#7;!n]", "C#C", "C(=[O,S])[O,S]", "[#7;!n][C;!$(C(=[O,N])[N,O])][#16;!s]", "[#7;!n][C;!$(C(=[O,N])[N,O])][#7;!n]", "[#7;!n][C;!$(C(=[O,N])[N,O])][#8;!o]", "[#8;!o][C;!$(C(=[O,N])[N,O])][#16;!s]", "[#8;!o][C;!$(C(=[O,N])[N,O])][#8;!o]", "[#16;!s][C;!$(C(=[O,N])[N,O])][#16;!s]" ] + [[stage.scoring.component]] + [stage.scoring.component.QED] + [[stage.scoring.component.QED.endpoint]] + name = "QED" + weight = 0.6 + [[stage.scoring.component]] + [stage.scoring.component.NumAtomStereoCenters] + [[stage.scoring.component.NumAtomStereoCenters.endpoint]] + name = "Stereo" + weight = 0.4 + transform.type = "left_step" + transform.low = 0 + """ + + stage1_config_filename = "stage1.toml" + with open(stage1_config_filename, "w") as tf: + tf.write(stage1_parameters) + + # Stage 1 Reinforcement Learning + shutil.rmtree("tb_stage1_0", ignore_errors=True) + + # Run the stage1 process using subprocess + print("Starting Stage 1 Reinforcement Learning...") + stage1_result = subprocess.run(f"reinvent {stage1_config_filename} 2>&1 | tee stage1.log", shell=True, text=True) + if stage1_result.returncode == 0: + print("Stage 1 completed.") + else: + raise RuntimeError(f"Stage 1 execution failed with exit code: {stage1_result.returncode}") + + # Transfer Learning to focus the model + # Prepare the data + bdb = pd.read_csv("../notebooks/data/tnks2.csv") + clean = bdb[~bdb["exp (nM)"].str.match("[<>]")] + clean = clean.astype({"exp (nM)": "float"}) + + good = clean[clean["exp (nM)"] < 1000] + good = good[good["exp_method"] != "EC50"] + good = good[good["exp_method"] != "Kd"] + good = good.rename(columns={"exp (nM)": "IC50"}) + good = good.drop(columns=["exp_method"]) + + # Write the good binders to a SMILES file + TL_train_filename = "tnks2_train.smi" + TL_validation_filename = "tnks2_validation.smi" + data = good.sample(frac=1) + n_head = int(0.8 * len(data)) # 80% of the data for training + n_tail = len(good) - n_head + print(f"number of molecules for: training={n_head}, validation={n_tail}") + + train, validation = data.head(n_head), data.tail(n_tail) + train.to_csv(TL_train_filename, sep="\t", index=False, header=False) + validation.to_csv(TL_validation_filename, sep="\t", index=False, header=False) + + # TL setup + TL_parameters = f""" + run_type = "transfer_learning" + device = "cuda:0" + tb_logdir = "tb_TL" + + [parameters] + num_epochs = 1 + save_every_n_epochs = 1 + batch_size = 100 + sample_batch_size = 2000 + input_model_file = "{stage1_checkpoint}" + output_model_file = "TL_reinvent.model" + smiles_file = "{TL_train_filename}" + validation_smiles_file = "{TL_validation_filename}" + standardize_smiles = true + randomize_smiles = true + randomize_all_smiles = false + internal_diversity = true + """ + + TL_config_filename = "transfer_learning.toml" + with open(TL_config_filename, "w") as tf: + tf.write(TL_parameters) + + # Start Transfer Learning + shutil.rmtree("tb_TL", ignore_errors=True) + + # Run the transfer learning process using subprocess + print("Starting Transfer 
Learning...") + transfer_result = subprocess.run( + f"reinvent {TL_config_filename} 2>&1 | tee transfer_learning.log", shell=True, text=True + ) + if transfer_result.returncode == 0: + print("Transfer learning completed.") + else: + raise RuntimeError(f"Transfer learning execution failed with exit code: {transfer_result.returncode}") + + # Choose the model from transfer learning + TL_model_filename = "TL_reinvent.model.1.chkpt" + + stage2_parameters = re.sub("stage1", "stage2", stage1_parameters) + stage2_parameters = re.sub("agent_file.*\n", f"agent_file = '{TL_model_filename}'\n", stage2_parameters) + stage2_parameters = re.sub("max_steps.*\n", "max_steps = 5\n", stage2_parameters) + + # Stage 2 RL + # Predictive model (ChemProp) + chemprop_path = "../chemprop/" + pred_model_parameters = f""" + [[stage.scoring.component]] + [stage.scoring.component.ChemProp] + [[stage.scoring.component.ChemProp.endpoint]] + name = "ChemProp" + weight = 0.6 + params.checkpoint_dir = "{chemprop_path}" + params.rdkit_2d_normalized = true + params.target_column = "DG" + params.features = "rdkit_2d_normalized" + transform.type = "reverse_sigmoid" + transform.high = 0.0 + transform.low = -50.0 + transform.k = 0.4 + """ + + # Combine parameters and write to file + full_stage2_parameters = stage2_parameters + pred_model_parameters + df_parameters = """ + [diversity_filter] + type = "IdenticalMurckoScaffold" + bucket_size = 10 + minscore = 0.7 + """ + inception_parameters = """ + [inception] + smiles_file = "" # no seed SMILES + memory_size = 50 + sample_size = 10 + """ + + full_stage2_parameters += df_parameters + inception_parameters + stage2_config_filename = "stage2.toml" + with open(stage2_config_filename, "w") as tf: + tf.write(full_stage2_parameters) + + # Run stage2 using subprocess + print("Starting Stage 2 Reinforcement Learning...") + stage2_result = subprocess.run(f"reinvent {stage2_config_filename} 2>&1 | tee stage2.log", shell=True, text=True) + if stage2_result.returncode == 0: + print("Stage 2 completed.") + else: + raise RuntimeError(f"Stage 2 execution failed with exit code: {stage2_result.returncode}") + + # Inspect results with TensorBoard + # Run TensorBoard separately after REINVENT finishes + # subprocess.run(["tensorboard", "--bind_all", "--logdir", f"{wd}/tb_stage2_0"]) + + # Process the results for good binders + # csv_file = os.path.join(wd, "stage2_1.csv") + csv_file = "stage2_1.csv" + df = pd.read_csv(csv_file) + good_QED = df["QED"] > 0.8 + good_dG = df["ChemProp (raw)"] < -25.0 # kcal/mol + good_binders = df[good_QED & good_dG] + print(len(good_binders)) + + # Duplicate removal + good_binders = good_binders.drop_duplicates(subset=["SMILES"]) + print(len(good_binders)) + + # Displaying good binders + # grid = create_mol_grid(good_binders) + # display(grid) + + +if __name__ == "__main__": + main() diff --git a/docker/lifescience/reinvent4/demo_notebooks/Reinvent_demo_clean.py b/docker/lifescience/reinvent4/demo_notebooks/Reinvent_demo_clean.py new file mode 100644 index 0000000..60c8ac6 --- /dev/null +++ b/docker/lifescience/reinvent4/demo_notebooks/Reinvent_demo_clean.py @@ -0,0 +1,160 @@ +# This script is based on the file notebooks/Reinvent_demo.py: +# https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_demo.py + +import os +import shutil +import subprocess + +import pandas as pd + + +def setup_work_directory(wd): + shutil.rmtree(wd, ignore_errors=True) + os.mkdir(wd) + os.chdir(wd) + + +def write_config_file(filename, config_data): + with open(filename, "w") as 
tf: + tf.write(config_data) + + +def run_reinvent(config_filename, log_filename="stage1.log"): + try: + result = subprocess.run( + ["reinvent", "-l", log_filename, config_filename], + check=True, + capture_output=True, + text=True, + ) + print(result.stdout) + except subprocess.CalledProcessError as e: + print(f"Error occurred while running REINVENT: {e}") + print(f"STDOUT: {e.stdout}") + print(f"STDERR: {e.stderr}") + raise + + +def analyze_results(wd): + df = pd.read_csv("stage1_1.csv") + print(df.head()) + return df + + +def calculate_sample_efficiency(df): + total_smilies = len(df) + total_invalid_smilies = len(df[df["SMILES_state"] == 0]) + total_batch_duplicate_smilies = len(df[df["SMILES_state"] == 2]) + total_duplicate_smilies = len(df[df.duplicated(subset=["SMILES"])]) + + print( + f"Total number of SMILES generated: {total_smilies}\n" + f"Total number of invalid SMILES: {total_invalid_smilies}\n" + f"Total number of batch duplicate SMILES: {total_batch_duplicate_smilies}\n" + f"Total number of duplicate SMILES: {total_duplicate_smilies}" + ) + + +if __name__ == "__main__": + wd = "R4_notebooks_output" + setup_work_directory(wd) + + prior_filename = "../priors/reinvent.prior" + agent_filename = prior_filename + + global_parameters = """ + run_type = "staged_learning" + device = "cuda:0" + tb_logdir = "tb_stage1" + json_out_config = "_stage1.json" + """ + parameters = f""" + [parameters] + + prior_file = "{prior_filename}" + agent_file = "{agent_filename}" + summary_csv_prefix = "stage1" + + batch_size = 100 + + use_checkpoint = false + """ + + learning_strategy = """ + [learning_strategy] + + type = "dap" + sigma = 128 + rate = 0.0001 + """ + + stages = """ + [[stage]] + + max_score = 1.0 + max_steps = 300 + + chkpt_file = 'stage1.chkpt' + + [stage.scoring] + type = "geometric_mean" + + [[stage.scoring.component]] + [stage.scoring.component.custom_alerts] + + [[stage.scoring.component.custom_alerts.endpoint]] + name = "Alerts" + + params.smarts = [ + "[*;r8]", + "[*;r9]", + "[*;r10]", + "[*;r11]", + "[*;r12]", + "[*;r13]", + "[*;r14]", + "[*;r15]", + "[*;r16]", + "[*;r17]", + "[#8][#8]", + "[#6;+]", + "[#16][#16]", + "[#7;!n][S;!$(S(=O)=O)]", + "[#7;!n][#7;!n]", + "C#C", + "C(=[O,S])[O,S]", + "[#7;!n][C;!$(C(=[O,N])[N,O])][#16;!s]", + "[#7;!n][C;!$(C(=[O,N])[N,O])][#7;!n]", + "[#7;!n][C;!$(C(=[O,N])[N,O])][#8;!o]", + "[#8;!o][C;!$(C(=[O,N])[N,O])][#16;!s]", + "[#8;!o][C;!$(C(=[O,N])[N,O])][#8;!o]", + "[#16;!s][C;!$(C(=[O,N])[N,O])][#16;!s]" + ] + + [[stage.scoring.component]] + [stage.scoring.component.QED] + + [[stage.scoring.component.QED.endpoint]] + name = "QED" + weight = 0.6 + + + [[stage.scoring.component]] + [stage.scoring.component.NumAtomStereoCenters] + + [[stage.scoring.component.NumAtomStereoCenters.endpoint]] + name = "Stereo" + weight = 0.4 + + transform.type = "left_step" + transform.low = 0 + """ + + config = global_parameters + parameters + learning_strategy + stages + + toml_config_filename = "stage1.toml" + write_config_file(toml_config_filename, config) + + run_reinvent(toml_config_filename) + df = analyze_results(wd) + calculate_sample_efficiency(df) diff --git a/docker/lifescience/reinvent4/entrypoint.sh b/docker/lifescience/reinvent4/entrypoint.sh new file mode 100644 index 0000000..673e51e --- /dev/null +++ b/docker/lifescience/reinvent4/entrypoint.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +CONFIG_FILE="$1" +OUTPUT_FILE="$2" + +# Check if OUTPUT ends with ".log" and append it if not +if [[ ! 
"$OUTPUT_FILE" =~ .log$ ]]; then + OUTPUT_FILE="${OUTPUT_FILE}.log" +fi + +if [ -n "$CONFIG_FILE" ] && [ -n "$OUTPUT_FILE" ]; then + echo "Config file provided: $CONFIG_FILE" + echo "Output file provided: $OUTPUT_FILE" + echo "Running software in automated mode..." + + # Set the correct permissions for the config file + chmod +r "$CONFIG_FILE" + + exec reinvent -l "$OUTPUT_FILE" "$CONFIG_FILE" + + echo "Processing complete. Results saved in $(dirname "$OUTPUT_FILE")." +else + echo "No config file or output file provided. Starting interactive mode..." + exec /bin/bash +fi diff --git a/docker/lifescience/semlaflow/Dockerfile b/docker/lifescience/semlaflow/Dockerfile new file mode 100644 index 0000000..dc7d938 --- /dev/null +++ b/docker/lifescience/semlaflow/Dockerfile @@ -0,0 +1,16 @@ +FROM rocm/pytorch:rocm7.0_ubuntu24.04_py3.12_pytorch_release_2.8.0 + +RUN apt-get update -y \ + && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ + && apt-get autoremove -y \ + && apt-get clean + +RUN pip install numpy==1.26.2 pandas==2.2.2 scipy==1.11.4 rdkit lightning torchmetrics openbabel-wheel typing_extensions wandb numba hiredis tqdm ipython certifi + +RUN git clone https://github.com/rssrwn/semla-flow.git +WORKDIR /semla-flow + +COPY entrypoint.sh /entrypoint.sh + +CMD ["/bin/bash"] diff --git a/docker/lifescience/semlaflow/README.md b/docker/lifescience/semlaflow/README.md new file mode 100644 index 0000000..42abe2e --- /dev/null +++ b/docker/lifescience/semlaflow/README.md @@ -0,0 +1,17 @@ +## Running inference job automatically (non-interactive) + +In order to run SemlaFlow jobs automatically using the above image do the following: +- Set up config and output directory: + +Put your config files as well any other files needed such as datasets or priors in `CONFIG_PATH`. In `OUTPUT_PATH`, the job will write output logs. + +```sh +export DATA_PATH= +export OUTPUT_PATH= +``` + +Then, the following command will run the job: + +```sh +docker run -it --shm-size=256g --device=/dev/kfd --device=/dev/dri/renderD --network host --ipc host --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $DATA_PATH:/data -v $OUTPUT_PATH:/output rocm-semlaflow + + + + + + +
+```
+
[Extraction residue: this diff also adds what appears to be an HTML upload form; only its heading "Submit Training Job" and the prompt "Provide a YAML configuration and a ZIP file containing the dataset." are recoverable here.]
+ + + + diff --git a/docker/vlm-lora-finetune/misc_test_files/training_jobs.db b/docker/vlm-lora-finetune/misc_test_files/training_jobs.db new file mode 100755 index 0000000..f0fd733 Binary files /dev/null and b/docker/vlm-lora-finetune/misc_test_files/training_jobs.db differ diff --git a/docker/vlm-lora-finetune/old_requirements.txt b/docker/vlm-lora-finetune/old_requirements.txt new file mode 100644 index 0000000..6811354 --- /dev/null +++ b/docker/vlm-lora-finetune/old_requirements.txt @@ -0,0 +1,10 @@ +accelerate==0.24.1 +bitsandbytes==0.41.1 +datasets[vision] +open_clip_torch==2.23.0 +pandas==2.1.2 +peft +scipy +torch==2.1.0 +tqdm==4.66.1 +wandb==0.15.12 diff --git a/docker/vlm-lora-finetune/setup.py b/docker/vlm-lora-finetune/setup.py new file mode 100644 index 0000000..e69de29 diff --git a/docker/vlm-lora-finetune/train.py b/docker/vlm-lora-finetune/train.py new file mode 100644 index 0000000..05d616d --- /dev/null +++ b/docker/vlm-lora-finetune/train.py @@ -0,0 +1,247 @@ +import argparse +import itertools +import logging +import os +import time +from typing import Callable + +import numpy as np +import open_clip +import torch +from accelerate import Accelerator +from clipora.config import TrainConfig, parse_yaml_to_config, save_config_to_yaml +from clipora.data import get_dataloader +from clipora.lora.inject import inject_linear_attention +from clipora.scheduler.cosine import cosine_lr +from peft import LoraConfig, PeftModel, get_peft_model +from tqdm.auto import tqdm + +logger = logging.getLogger(__name__) + + +def compute_clip_loss(model, X, Y): + loss = open_clip.ClipLoss() + image_features, text_features, logit_scale = model(X, Y) + total_loss = loss(image_features, text_features, logit_scale) + return total_loss + + +@torch.no_grad() +def evaluate(model, dataloader, config): + out = {} + model.eval() + losses = torch.zeros(config.eval_steps) + for k in range(config.eval_steps): + X, Y = next(iter(dataloader)) + loss = compute_clip_loss(model, X, Y) + losses[k] = loss.item() + out["eval_loss"] = losses.mean() + model.train() + return out + + +def init_model(config: TrainConfig, lora_adapter_path=None, full_weights_path=None): + # lora_adapter_path = checkpoint path. 
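+    #   (a directory produced by save_pretrained(); when given, existing LoRA weights are loaded from it instead of creating a fresh adapter)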
+ # full_weights_path = if given, dont download pretrained weights, instead load weights from this + pretrained = None if full_weights_path is not None else config.pretrained + model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms( + model_name=config.model_name, + pretrained=pretrained, + ) + model_config = open_clip.get_model_config(config.model_name) + if config.lora_text: + model = inject_linear_attention( + model=model, + encoders={"transformer"}, + embed_dim=model_config["embed_dim"], + num_heads=model_config["text_cfg"]["heads"], + ) + if config.lora_vision: + model = inject_linear_attention( + model=model, + encoders={"visual.transformer"}, + embed_dim=model_config["vision_cfg"]["width"], + num_heads=config.vision_heads, + ) + + if full_weights_path: + model.load_state_dict(torch.load(full_weights_path)) + + # If not None, load existing loras from here + if lora_adapter_path: + model = PeftModel.from_pretrained(model, lora_adapter_path) + + else: + lora_config = LoraConfig( + r=config.lora_rank, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + target_modules=["qkv", "proj"], + ) + model = get_peft_model(model, lora_config) + + if config.compile: + model.compile() + return model, preprocess_train + + +def main(config: TrainConfig, job_callback: Callable | None = None): + """Main training loop. + + Args: + config (TrainConfig): clipora training config + job_callback (Callable | None, optional): A callback function to update job status + each iteration. Used by API. Defaults to None. + """ + logging.basicConfig(level=logging.INFO) + + accelerator = Accelerator( + gradient_accumulation_steps=config.gradient_accumulation_steps, + log_with="wandb" if config.wandb else None, + ) + + if accelerator.is_main_process: + accelerator.print() + if config.output_dir is not None: + accelerator.print(f"Output directory: {config.output_dir}") + os.makedirs(config.output_dir, exist_ok=True) + + if config.wandb: + accelerator.init_trackers( + project_name=config.wandb_project if config.wandb_project else None, + ) + + if config.seed is not None: + seed = config.seed + accelerator.print(f"Using seed {seed}") + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + model, preprocess_train = init_model(config) + + train_dataloader = get_dataloader(config, preprocess_train, "train") + eval_dataloader = get_dataloader(config, preprocess_train, "val") + assert len(train_dataloader), "No data found, please check your data location." 
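+    # Note: the validation dataloader above reuses the training image preprocessing (preprocess_train).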
+ + if config.gradient_checkpointing: + model.set_grad_checkpointing(True) + + if config.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError("To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`.") + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + if isinstance(config.learning_rate, str): + config.learning_rate = float(config.learning_rate) + + params_to_optimize = [ + { + "params": itertools.chain(model.parameters()), + "lr": config.learning_rate, + }, + ] + + optimizer = optimizer_class( + params_to_optimize, + lr=config.learning_rate, + betas=(config.adam_beta1, config.adam_beta2), + eps=config.adam_epsilon, + ) + + # create scheduler if train + total_steps = train_dataloader.num_batches * config.epochs + # if args.warmup is float, it is a percentage of total_steps + if isinstance(config.warmup, float): + assert 0 <= config.warmup <= 1, "Warmup must be between 0 and 1 if not a fixed number of steps." + config.warmup = int(config.warmup * total_steps) + + scheduler = cosine_lr(optimizer, config.learning_rate, config.warmup, total_steps) + + model, optimizer, scheduler, train_dataloader, eval_dataloader = accelerator.prepare( + model, optimizer, scheduler, train_dataloader, eval_dataloader + ) + + print("***** Running training *****") + print(f" Using device: {accelerator.device}") + print(f" Num Iters = {len(train_dataloader)}") + print(f" Num Epochs = {config.epochs}") + print(f" Instantaneous batch size per device = {config.batch_size}") + print(f" Gradient Accumulation steps = {config.gradient_accumulation_steps}") + # Only show the progress bar once on each machine. + progress_bar = tqdm( + range(config.epochs * len(train_dataloader)), + disable=not accelerator.is_local_main_process, + ) + progress_bar.set_description("Steps") + global_step = 0 + best_val_loss = float("inf") + + for epoch in range(config.epochs): + model.train() + for step, batch in enumerate(train_dataloader): + if accelerator.is_local_main_process: + if global_step % config.eval_interval == 0: + if accelerator.is_local_main_process: + eval_loss = evaluate(model, eval_dataloader, config) + accelerator.log(eval_loss, step=global_step) + progress_bar.write(f"Step: {global_step}, Eval loss: {eval_loss['eval_loss']}") + if eval_loss["eval_loss"] < best_val_loss: + best_val_loss = eval_loss["eval_loss"] + checkpoint_name = f"checkpoint_{global_step}" + save_path = os.path.join(config.output_dir, checkpoint_name) + model.save_pretrained(save_path) + if job_callback: + job_callback(best_finetuned_model_path=checkpoint_name) + # save the clipora config we used for training for later use and bookkeeping + save_config_to_yaml(config, os.path.join(save_path, "train_config.yaml")) + + X, Y = batch + loss = compute_clip_loss(model, X, Y) + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = model.parameters() + accelerator.clip_grad_norm_(params_to_clip, 1.0) # args.max_grad_norm) + optimizer.step() + scheduler(global_step) + progress_bar.update(1) + if job_callback: + percent = int((epoch * len(train_dataloader) + step) / (config.epochs * len(train_dataloader)) * 100) + job_callback(status="training", detail=f"Training at {percent}%") + global_step += 1 + + logs = { + "loss": loss.item(), + "learning_rate": optimizer.param_groups[0]["lr"], + "step": global_step, + "epoch": epoch, + } + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + 
accelerator.wait_for_everyone() + + if accelerator.is_local_main_process: + save_path = os.path.join(config.output_dir, "final") + model.save_pretrained(save_path) + save_config_to_yaml(config, os.path.join(save_path, "train_config.yaml")) + + accelerator.print("\n\nTraining completed.\n\n") + accelerator.end_training() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + type=str, + help="The path to the yaml file containing the training configuration.", + ) + print(f"Starting clipora training with config: {parser.parse_args().config}") + config = parse_yaml_to_config(parser.parse_args().config) + main(config) diff --git a/docker/vlm-lora-finetune/train_config.yml b/docker/vlm-lora-finetune/train_config.yml new file mode 100644 index 0000000..d6455e2 --- /dev/null +++ b/docker/vlm-lora-finetune/train_config.yml @@ -0,0 +1,35 @@ +model_name: "ViT-L-14" +pretrained: "datacomp_xl_s13b_b90k" +compile: False +seed: 1337 + +device: "cuda" +output_dir: "./output" + +wandb: True +wandb_project: test-clipora + +train_dataset: "awilliamson/fashion-train" +eval_dataset: "awilliamson/fashion-eval" +datatype: "hf" +csv_separator: "\t" +image_col: "image" +text_col: "text" +shuffle: True + +lora_rank: 16 +lora_alpha: 32 +lora_dropout: 0.0 + +batch_size: 32 +gradient_accumulation_steps: 1 +gradient_checkpointing: False + +use_8bit_adam: False + +learning_rate: 1e-4 +epochs: 3 +warmup: 0.01 +save_interval: 1000 +eval_interval: 100 +eval_steps: 100 diff --git a/docker/vlm-lora-finetune/visualize_results.py b/docker/vlm-lora-finetune/visualize_results.py new file mode 100644 index 0000000..c497909 --- /dev/null +++ b/docker/vlm-lora-finetune/visualize_results.py @@ -0,0 +1,203 @@ +import argparse +import os +import random + +import matplotlib.patches as patches +import matplotlib.pyplot as plt +import numpy as np +import open_clip # or open_clip +import pandas as pd +import torch +from clipora.config import parse_yaml_to_config +from matplotlib.colors import Normalize +from PIL import Image +from train import init_model + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def visualize_results(image_path, texts, before, after, output_dir): + """Saves an image with a table showing text probabilities before and after LORA fine-tuning. + texts, before and after have to be the same length + Args: + image_path (str): The path to the input image. + texts (list): The list of text prompts. + before (list): The probabilities before fine-tuning. + after (list): The probabilities after fine-tuning. + output_dir (str): The directory where the output image will be saved. 
+ """ + img = Image.open(image_path) + norm = Normalize(vmin=0, vmax=1) + + # Find max indices + max_before_idx = np.argmax(before) + max_after_idx = np.argmax(after) + + # Calculate dynamic column widths based on text length + max_text_length = max(len(text) for text in texts) + + # Adjust column widths dynamically + if max_text_length > 50: # Long text + col_widths = [0.6, 0.2, 0.2] # Give more space to text + elif max_text_length > 30: # Medium text + col_widths = [0.5, 0.25, 0.25] + else: # Short text + col_widths = [0.4, 0.3, 0.3] # Original proportions + + # Create figure with dynamic width + fig_width = max(12, max_text_length * 0.15) # Scale figure width with text length + fig, ax = plt.subplots(figsize=(fig_width, max(6, len(texts) * 0.5 + 2))) + ax.axis("off") + + # Image panel - adjust based on figure width + img_width = min(0.25, 3.0 / fig_width) # Cap image width but scale with figure + img_ax = fig.add_axes([0.05, 0.1, img_width, 0.8]) + img_ax.imshow(img) + img_ax.axis("off") + + # Table panel - use remaining space + table_start = 0.05 + img_width + 0.05 + table_width = 0.9 - table_start + table_ax = fig.add_axes([table_start, 0.1, table_width, 0.8]) + table_ax.axis("off") + + # Calculate positions + x_positions = np.cumsum([0] + col_widths[:-1]) + + # Table headers + headers = ["Text", "Before", "After"] + for i, header in enumerate(headers): + table_ax.text( + x_positions[i], 1, header, ha="left", va="bottom", fontsize=12, weight="bold", transform=table_ax.transAxes + ) + + # Calculate row height based on number of items + row_height = min(0.08, 0.7 / len(texts)) # Dynamic row height + + # Draw cells + for i, (text, b, a) in enumerate(zip(texts, before, after)): + y = 0.9 - i * row_height + + # Handle long text with wrapping + if len(text) > 40: + # Split long text into multiple lines + words = text.split() + lines = [] + current_line = "" + max_chars_per_line = int(40 * col_widths[0] / 0.4) # Scale with column width + + for word in words: + if len(current_line + " " + word) <= max_chars_per_line: + current_line += " " + word if current_line else word + else: + if current_line: + lines.append(current_line) + current_line = word + if current_line: + lines.append(current_line) + + # Display multiline text + for j, line in enumerate(lines): + table_ax.text( + x_positions[0], y - j * 0.02, line, ha="left", va="center", fontsize=9, transform=table_ax.transAxes + ) + else: + # Display single line text + table_ax.text(x_positions[0], y, text, ha="left", va="center", fontsize=10, transform=table_ax.transAxes) + + # Draw probability boxes + for j, val in enumerate([b, a], start=1): + color = plt.cm.Blues(norm(val)) if j == 1 else plt.cm.Reds(norm(val)) + x = x_positions[j] + width = col_widths[j] + + # Highlight max values with red border + edge_color = "red" if (j == 1 and i == max_before_idx) or (j == 2 and i == max_after_idx) else "black" + + rect = patches.Rectangle( + (x, y - row_height / 2 + 0.01), + width, + row_height - 0.02, + transform=table_ax.transAxes, + color=color, + ec=edge_color, + lw=1.5, + ) + table_ax.add_patch(rect) + + table_ax.text( + x + width / 2, + y, + f"{val:.2f}", + ha="center", + va="center", + fontsize=10, + color="white" if val > 0.5 else "black", # Better contrast + weight="bold", + transform=table_ax.transAxes, + ) + + plt.savefig(os.path.join(output_dir, "output_image.png"), bbox_inches="tight", dpi=150) + plt.close() + + +def main(original_model, lora_model, preprocess, config, csv_path=None): + """Display probabilities before and after LORA 
fine-tuning for 10 random texts + from the evaluation dataset. + """ + # === Load CSV and Select Data === + csv_path = csv_path or config.eval_dataset + df = pd.read_csv(csv_path) + # Get 10 random texts from csv as classes + classes = df[config.text_col].drop_duplicates().sample(10).tolist() + # Get an image which has the first text as the correct one + correct_text = classes[0] + img_path = df[df[config.text_col] == correct_text][config.image_col].iloc[0] + # === Preprocess Inputs === + image = preprocess(Image.open(img_path)).unsqueeze(0).to(device) + text_tokens = open_clip.tokenize(classes).to(device) + + with torch.no_grad(): + img_feat_before = original_model.encode_image(image) + txt_feat_before = original_model.encode_text(text_tokens) + probs_before = (img_feat_before @ txt_feat_before.T).softmax(dim=-1).squeeze().cpu().numpy() + + img_feat_after = lora_model.encode_image(image) + txt_feat_after = lora_model.encode_text(text_tokens) + probs_after = (img_feat_after @ txt_feat_after.T).softmax(dim=-1).squeeze().cpu().numpy() + + print("probs before:") + print(probs_before) + print("probs after:") + print(probs_after) + + visualize_results(img_path, classes, probs_before, probs_after, config.output_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + type=str, + help="The path to the yaml file containing the training configuration.", + ) + + parser.add_argument( + "--lora_adapter_path", + type=str, + help="The path to the LoRA adapter weights, e.g. checkpoint folder.", + ) + + args = parser.parse_args() + lora_adapter_path = args.lora_adapter_path + config = parse_yaml_to_config(args.config) + lora_model, preprocess = init_model(config, lora_adapter_path=lora_adapter_path) + # Load the original CLIP model (no LORA) + original_model, _, _ = open_clip.create_model_and_transforms( + model_name=config.model_name, + pretrained=config.pretrained, + ) + + original_model = original_model.to(device) + lora_model = lora_model.to(device) + main(original_model, lora_model, preprocess, config) diff --git a/docs/tutorials/tutorial-05-finetune-llama8b-custom-domain-data.md b/docs/tutorials/tutorial-05-finetune-llama8b-custom-domain-data.md new file mode 100644 index 0000000..f360c57 --- /dev/null +++ b/docs/tutorials/tutorial-05-finetune-llama8b-custom-domain-data.md @@ -0,0 +1,123 @@ +# Silogen-Engine Fine-Tuning Llama 3.1 8B Instruct with Open Protein Instructions Dataset + +## Introduction + +In this tutorial, we demonstrate LoRA fine-tuning of Llama 3.1 8B Instruct with the Open Protein Instructions dataset using the [Silogen fine-tuning engine](https://github.com/silogen/llm-finetuning) end-to-end, from downloading data to querying the fine-tuned model. Upon receiving a query containing a protein sequence as a input, the fine-tuned model attempts to provide an expert response about the protein sequence, such as its functional description. Depending on your fine-tuning configuration, the end result would be similar to [OPI-Llama](https://huggingface.co/BAAI/OPI-Llama-3.1-8B-Instruct) fine-tuned by the dataset authors. + +[Base model](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | [Fine-tuning dataset](https://huggingface.co/datasets/BAAI/OPI) + +Please note that the dataset is CC-BY-NC-4.0 licensed and used solely for demonstration purposes here with permission from the authors. 
+ +## Prerequisites +- MinIO cluster storage (or similar) with credentials configured in your Kubernetes namespace secrets +- Huggingface token for data and model download in your Kubernetes namespace secrets + +## Running workloads +The commands below are assumed to be run at the repository root. + +### Data download and preprocessing +We will use the `workloads/download-data-to-bucket/helm/overrides/tutorial-05-opi-data.yaml` override to download the dataset from Huggingface, convert it to the format expected by the Silogen fine-tuning engine, and persist the processed dataset to `bucketDataDir` configured in our override. Our `dataScript` will also create a sample of 1k rows for quick demonstration of the fine-tuning workflow. + +``` +helm template workloads/download-data-to-bucket/helm \ + -f workloads/download-data-to-bucket/helm/overrides/tutorial-05-opi-data.yaml \ + --name-template "download-opi-data" \ + | kubectl apply -f - +``` + +### Base model download +We can download the base model to MinIO without customizing the existing override for our base model. Downloading this model requires a Huggingface token (assumed to be available in the namespace), which we specify in another override. +``` +helm template workloads/download-huggingface-model-to-bucket/helm \ + -f workloads/download-huggingface-model-to-bucket/helm/overrides/models/meta-llama_llama-3.1-8b-instruct.yaml \ + -f workloads/llm-finetune-silogen-engine/helm/overrides/utilities/hf-token.yaml \ + --name-template "download-llama-31-8-instruct" \ + | kubectl apply -f - +``` + +### Fine-tuning +To start fine-tuning with the Silogen engine, we can use existing overrides for reasonable default fine-tuning parameters for the base model, and to enable Tensorboard monitoring. We can customize any parameters, such as the number of fine-tuning GPUs, with `workloads/llm-finetune-silogen-engine/helm/overrides/tutorial-05-llama-lora-opi-data.yaml`. + +``` +workloads_path="workloads/llm-finetune-silogen-engine/helm" +helm template $workloads_path \ + -f $workloads_path/overrides/models/meta-llama_llama-3.1-8b-instruct.yaml \ + -f $workloads_path/overrides/utilities/tensorboard.yaml \ + -f $workloads_path/overrides/tutorial-05-llama-lora-opi-data.yaml \ + --name-template llm-finetune-llama-opi \ + | kubectl apply -f - +``` + +To monitor fine-tuning progress with Tensorboard, we can forward the associated port to access with a local browser, e.g., `kubectl port-forward pods/ 6006:6006`. Model checkpoints and logs will persist in the `checkpointsRemote` specified in our custom override file. 
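+
+For a concrete example (the pod name below is hypothetical; list the pods in your namespace with `kubectl get pods` to find the Tensorboard pod):
+```
+kubectl port-forward pods/llm-finetune-llama-opi-tensorboard-0 6006:6006
+```
+Then open http://localhost:6006 in a local browser.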
+ +### Inference + +#### Deploying each model +To deploy the base model using vLLM, we can use the existing override +``` +name="llama-31-8-instruct" +helm template $name workloads/llm-inference-vllm/helm \ +-f workloads/llm-inference-vllm/helm/overrides/models/meta-llama_llama-3.1-8b-instruct.yaml \ +--set "vllm_engine_args.served_model_name=$name" \ +| kubectl apply -f - +``` + +To deploy our fine-tuned model, we set the model path to our final experiment checkpoint +``` +name="llama-31-8B-lora-opi-1k" +helm template workloads/llm-inference-vllm/helm \ + -f workloads/llm-inference-vllm/helm/overrides/models/meta-llama_llama-3.1-8b-instruct.yaml \ + --set "model=s3://default-bucket/experiments/finetuning/$name/checkpoint-final" \ + --set "vllm_engine_args.served_model_name=$name" \ + --name-template "opi-llama" \ + | kubectl apply -f - +``` +#### Querying the deployed models + +Forward a port for each deployment +``` +base_model="llama-31-8-instruct" +ft_model="llama-31-8B-lora-opi-1k" +port_1=8011 +port_2=8012 + +kubectl port-forward svc/llm-inference-vllm-$base_model $port_1:80 > /dev/null & portforwardPID=$! + +kubectl port-forward svc/llm-inference-vllm-$ft_model $port_2:80 > /dev/null & portforwardPID=$! +``` + +Query each model to compare their outputs +``` +question="Can you provide the functional description of the following protein sequence? Sequence: MRWQEMGYIFYPRKLR" + +# Base model +curl http://localhost:$port_1/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'$base_model'", + "messages": [ + {"role": "user", "content": "'"$question"'"} + ] + }' | jq ".choices[0].message.content" --raw-output + +# [Example response] Unfortunately, I can't identify the exact function of the given protein sequence. However, ... + +# Fine-tuned model +curl http://localhost:$port_2/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'$ft_model'", + "messages": [ + {"role": "user", "content": "'"$question"'"} + ] + }' | jq ".choices[0].message.content" --raw-output + +# [Example response] This protein is a ribonucleoprotein involved in the processing of rRNA and the assembly of ribosomes. +``` + +### Cleaning up +We can delete our model deployments for example with kubectl +``` +kubectl delete deployments/llm-inference-vllm- +kubectl delete svc/llm-inference-vllm- +``` diff --git a/docs/tutorials/tutorial-06-package-and-serve-wan2.1-with-torchserve.md b/docs/tutorials/tutorial-06-package-and-serve-wan2.1-with-torchserve.md new file mode 100644 index 0000000..b3a6c74 --- /dev/null +++ b/docs/tutorials/tutorial-06-package-and-serve-wan2.1-with-torchserve.md @@ -0,0 +1,92 @@ +# Tutorial 06: Package and Serve Wan2.1 with TorchServe + +This tutorial shows how to prepare the Wan2.1 model for TorchServe, upload it to a cluster-internal MinIO storage, and then deploy a TorchServe workload that serves the model behind an API endpoint. The process consists of two steps: **packaging the model** and **serving it**. + +## 1. Setup + +Follow the setup in the [tutorial pre-requisites section](./tutorial-00-prerequisites.md). + +--- + +## 2. Package model into MinIO + +Before TorchServe can serve a model, it needs a model `.zip` archive. We use the workload `torchserve-model-packager` to compress the Wan2.1 model and upload it to MinIO storage. 
Its user input file is: + +```bash +workloads/torchserve-model-packager/helm/overrides/tutorial-06-package.yaml +``` + +Run: + +```bash +helm template workloads/torchserve-model-packager/helm \ + --values workloads/torchserve-model-packager/helm/overrides/tutorial-06-package.yaml \ + --name-template wan21-packager \ + | kubectl create -f - +``` + +This job downloads Wan2.1 weights, prepares TorchServe assets, and writes the `.zip` file into MinIO. + +You can follow logs and progress as described in [the monitoring section](./tutorial-00-prerequisites.md#monitoring-progress-logs-and-gpu-utilization-with-k9s). + +--- + +## 3. Deploy TorchServe with Wan2.1 + +Once the model archive is available in MinIO, we can deploy TorchServe itself. + +The workload for this is `media-torchserve-wan21`. Its user input file is: + +```bash +workloads/media-torchserve-wan21/helm/overrides/tutorial-06-serve.yaml +``` + +Run: + +```bash +helm template workloads/media-torchserve-wan21/helm \ + --values workloads/torchserve-wan21/helm/overrides/tutorial-06-serve.yaml \ + --name-template wan21-serve \ + | kubectl apply -f - +``` + +This creates a GPU-enabled TorchServe deployment, installs dependencies, mounts configuration files, downloads the `.zip` from MinIO, creates `.mar` archive, and starts serving. + +--- + +## 4. Access the API + +Forward TorchServe’s REST API to your local machine: + +```bash +kubectl port-forward deployment/wan21-serve-media-torchserve-wan21 8080:8080 +``` + +Now you can send a test request to generate video: + +```bash +curl -X POST http://localhost:8080/predictions/wan21 \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "a scene of an astronaut riding a horse on mars", + "width": 480, + "height": 832, + "num_frames": 81, + "num_inference_steps": 40 + }' \ + --output "output-$(date +%Y%m%d%H%M%S).mp4" +``` + +The output video will be written to the current directory. + +When finished, you can stop the port-forwarding process and clean up the deployment: + +```bash +kubectl delete deployment wan21-serve-media-torchserve-wan21 +``` + +--- + +## Next Steps + +This tutorial showed how to package Wan2.1 and serve it through TorchServe. The natural next step is to deploy additional handlers for different models. diff --git a/docs/tutorials/tutorial-07-wan-video-finetuning.md b/docs/tutorials/tutorial-07-wan-video-finetuning.md new file mode 100644 index 0000000..d86fd2b --- /dev/null +++ b/docs/tutorials/tutorial-07-wan-video-finetuning.md @@ -0,0 +1,268 @@ +# Tutorial 07: Fine-tuning Wan 2.2 for Video Generation + +This tutorial shows how to fine-tune Wan 2.2 models for custom video generation using the DiffSynth framework. Wan 2.2 is a state-of-the-art text-to-video generation model available in 5B and 14B parameter versions. We'll demonstrate both LoRA (Low-Rank Adaptation) and full parameter fine-tuning approaches, using the Disney VideoGeneration Dataset as an example. + +The tutorial covers the complete pipeline: downloading models and datasets to cluster MinIO storage, setting up the training environment, running fine-tuning jobs with multi-GPU support, and uploading trained checkpoints for future use. + +We'll start with the 5B parameter model using LoRA fine-tuning, which provides a good balance between training efficiency and model quality. The approach can be scaled up to the 14B model or full parameter training for more demanding use cases. + +## 1. Setup + +Follow the setup in the [tutorial pre-requisites section](./tutorial-prereqs.md). + +## 2. 
Download model and dataset + +We'll use the workloads to download the Wan 2.2 model and the Disney VideoGeneration Dataset to cluster MinIO storage. + +### Download Wan 2.2 5B Model + +First, download the Wan 2.2 5B parameter text-to-video model: + +```bash +helm template workloads/download-huggingface-model-to-bucket/helm \ + -f workloads/download-huggingface-model-to-bucket/helm/overrides/tutorial-07-wan2-2-ti2v-5b.yaml \ + --name-template download-wan2-2-ti2v-5b \ + | kubectl apply -f - +``` + +### Download Disney VideoGeneration Dataset + +Next, download and preprocess the Disney VideoGeneration Dataset (Steamboat Willy): + +```bash +helm template workloads/download-data-to-bucket/helm \ + -f workloads/download-data-to-bucket/helm/overrides/tutorial-07-disney-dataset.yaml \ + --name-template download-disney-dataset \ + | kubectl apply -f - +``` + +Monitor the downloads using [k9s or kubectl logs](./tutorial-prereqs.md#monitoring-progress-logs-and-gpu-utilization-with-k9s). The model download includes: +- Main diffusion model (3 safetensors files, ~18.7 GiB) +- VAE model (Wan2.2_VAE.pth, ~2.6 GiB) +- Text encoder (T5-XXL, ~11 GiB) +- Configuration and tokenizer files + +## 3. Interactive exploration (optional) + +For testing and experimentation, you can launch an interactive version that provides a ready environment without automatic training: + +```bash +helm template workloads/media-finetune-wan/helm \ + --name-template wan-finetune-interactive \ + | kubectl apply -f - +``` + +Connect to the interactive pod: +```bash +kubectl exec -it wan-finetune-interactive -- /bin/bash +``` + +This gives you access to explore the DiffSynth framework, examine the downloaded models and data, and test training configurations manually. + +## 4. LoRA fine-tuning on multiple GPUs + +LoRA (Low-Rank Adaptation) provides efficient fine-tuning by updating only a small number of parameters. This approach is recommended for most use cases as it requires less compute resources while maintaining good performance. + +### Launch 5B LoRA Fine-tuning + +Run LoRA fine-tuning on the 5B model using our optimized configuration: + +```bash +helm template workloads/media-finetune-wan/helm \ + -f workloads/media-finetune-wan/helm/overrides/tutorial-07-5b-lora.yaml \ + --name-template wan-finetune-5b-lora \ + | kubectl apply -f - +``` + +This configuration uses: +- **4 GPUs** with **32 CPU cores** for reasonable compilation performance +- **LoRA rank 32** for good parameter efficiency +- **DeepSpeed ZeRO Stage 2** for distributed training +- **BF16 precision** for memory efficiency +- **5 epochs** with gradient accumulation + +### Monitor training progress + +Follow the training progress using: + +```bash +# Watch job status +kubectl get jobs -w + +# View training logs (replace with your actual job name) +kubectl logs job/wan-finetune-5b-lora -f + +# Check GPU utilization with k9s +k9s +``` + +Training phases you'll observe: +1. **Installation**: DiffSynth framework setup +2. **Resource Download**: Model and dataset download from MinIO +3. **Compilation**: PyTorch model compilation for AMD GPUs +4. **Training**: 5 epochs with progress bars +5. **Upload**: Checkpoint upload to MinIO + +Total training time: ~90-120 minutes depending on cluster load. + +## 5. Scale to larger configurations + +For more advanced use cases, different configurations can be explored. 
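+The pattern stays the same: point `-f` at a different override file and, if needed, add `--set` flags. As a sketch, assuming the workload ships a 14B LoRA override (the filename below is illustrative; check `workloads/media-finetune-wan/helm/overrides/` for the actual files):
+
+```bash
+helm template workloads/media-finetune-wan/helm \
+  -f workloads/media-finetune-wan/helm/overrides/tutorial-07-14b-lora.yaml \
+  --name-template wan-finetune-14b-lora \
+  | kubectl apply -f -
+```
+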
+ +### 14B Model Fine-tuning + +For higher quality results, use the 14B parameter model by using the relevant configuration from among the override files inside the workload. + +### Full Parameter Fine-tuning + +For maximum customization, use the relevant override file for `architecture: "full"` instead of LoRA. This requires: +- More GPU resources (4+ GPUs recommended) +- Increased memory allocation +- Longer training time but potentially better results + +## 6. Working with checkpoints + +Trained checkpoints are automatically uploaded to MinIO with organized paths: + +``` +default-bucket/models/Wan-AI/Wan2.2-TI2V-5B/Wan2.2-TI2V-5B_lora/20250925-141325/ +β”œβ”€β”€ epoch-0.safetensors +β”œβ”€β”€ epoch-1.safetensors +β”œβ”€β”€ epoch-2.safetensors +β”œβ”€β”€ epoch-3.safetensors +β”œβ”€β”€ epoch-4.safetensors +β”œβ”€β”€ adapter_config.json +β”œβ”€β”€ adapter_model.safetensors +└── training_args.bin +``` + +### Using trained models + +The checkpoints can be used for: + +1. **Further fine-tuning**: Resume training from any epoch +2. **Inference deployment**: Load for video generation +3. **Model evaluation**: Compare different configurations +4. **Checkpoint merging**: Combine LoRA weights with base model + +### Download checkpoints locally + +```bash +# Setup MinIO client (if not already configured) +mc alias set minio-cluster http://minio.cluster.local + +# Download specific checkpoint +mc cp --recursive \ + minio-cluster/default-bucket/models/Wan-AI/Wan2.2-TI2V-5B/Wan2.2-TI2V-5B_lora/20250925-141325/ \ + ./local-checkpoints/ +``` + +## 7. Hyperparameter tuning + +Experiment with different LoRA ranks to find optimal configurations: + +```bash +run_id=wan-lora-sweep +for rank in 8 16 32 64 128; do + name="wan-5b-lora-r$rank-$run_id" + helm template workloads/media-finetune-wan/helm \ + -f workloads/media-finetune-wan/helm/overrides/tutorial-07-5b-lora.yaml \ + --name-template $name \ + --set finetune_config.lora_rank=$rank \ + | kubectl apply -f - +done +``` + +This creates parallel jobs testing different LoRA ranks, allowing you to compare training efficiency and model quality. + +## 8. Advanced configurations + +### Custom datasets + +To use your own video dataset: + +1. **Prepare data**: Organize videos and captions in the required format +2. **Upload to MinIO**: Use the data upload workload +3. **Update configuration**: Modify `datasetId` and paths in your override file +4. **Adjust parameters**: Tune learning rate, batch size, and epochs based on your data size + +### Multi-node training + +For very large models or datasets, distribute across multiple nodes: + +```yaml +resources: + cpu: 64 + gpus: 8 # Use all 8 GPUs per node + memory: 512Gi +``` + +### Memory optimization + +For memory-constrained scenarios: + +```yaml +finetune_config: + # Use gradient checkpointing + gradient_checkpointing: true + # Reduce batch size + train_batch_size: 8 + # Increase gradient accumulation + gradient_accumulation_steps: 8 +``` + +## 9. 
Troubleshooting + +### Common issues and solutions + +**Compilation taking too long**: +- Increase CPU allocation to 32+ cores +- Use at least 3-4 GPUs for better parallelization + +**Out of memory errors**: +- Reduce batch size: `train_batch_size: 8` +- Enable gradient checkpointing +- Use smaller LoRA rank: `lora_rank: 16` + +**Slow model download**: +- Check MinIO cluster connectivity +- Verify bucket credentials and permissions + +**Training divergence**: +- Lower learning rate: `learning_rate: 1e-5` +- Increase warmup steps +- Use different noise schedules + +### Monitoring resources + +```bash +# Check GPU usage across cluster +kubectl top nodes + +# View detailed pod resource usage +kubectl describe pod wan-finetune-5b-lora-xyz + +# Monitor job events +kubectl describe job wan-finetune-5b-lora +``` + +## 10. Next steps + +After successful fine-tuning: + +1. **Deploy for inference**: Use trained checkpoints in inference workloads +2. **Quality evaluation**: Generate test videos and evaluate results +3. **Dataset expansion**: Add more diverse training data +4. **Architecture experiments**: Try different model variants and training strategies + +The fine-tuned Wan 2.2 models can be integrated into video generation pipelines, content creation tools, or further research projects focusing on controllable video synthesis. + +## Configuration files reference + +The tutorial uses these override configurations: + +- `workloads/download-huggingface-model-to-bucket/helm/overrides/tutorial-07-wan2-2-ti2v-5b.yaml`: Model download configuration +- `workloads/download-data-to-bucket/helm/overrides/tutorial-07-disney-dataset.yaml`: Dataset download configuration +- `workloads/media-finetune-wan/helm/overrides/tutorial-07-5b-lora.yaml`: Optimized 5B LoRA fine-tuning configuration + +Each configuration includes detailed comments explaining the parameter choices and trade-offs for different use cases. 
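+
+To inspect what any of these configurations renders to before submitting it, you can template the chart locally without touching the cluster, for example:
+
+```bash
+# Render the manifests to a file and review them before applying
+helm template workloads/media-finetune-wan/helm \
+  -f workloads/media-finetune-wan/helm/overrides/tutorial-07-5b-lora.yaml \
+  --name-template wan-finetune-5b-lora > rendered.yaml
+less rendered.yaml
+```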
diff --git a/mkdocs.yml b/mkdocs.yml index aa820b6..0b94ca2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -93,7 +93,8 @@ nav: - llm-finetune-llama-factory: workloads/llm-finetune-llama-factory/helm/README.md - llm-finetune-silogen-engine: - Overview: workloads/llm-finetune-silogen-engine/helm/README.md - - Finetuning Config: workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md + - SFT Config: workloads/llm-finetune-silogen-engine/helm/config_doc_sft.md + - DPO Config: workloads/llm-finetune-silogen-engine/helm/config_doc_dpo.md - llm-finetune-verl: workloads/llm-finetune-verl/helm/README.md - llm-inference-llamacpp-mi300x: workloads/llm-inference-llamacpp-mi300x/helm/README.md - llm-inference-megatron-lm: workloads/llm-inference-megatron-lm/helm/README.md diff --git a/workloads/benchmark-lifescience-reinvent4/helm/Chart.yaml b/workloads/benchmark-lifescience-reinvent4/helm/Chart.yaml new file mode 100644 index 0000000..07074d8 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: benchmark-lifescience-reinvent4 +description: A Helm chart for Reinvent4 inference +version: 0.0.1 diff --git a/workloads/benchmark-lifescience-reinvent4/helm/README.md b/workloads/benchmark-lifescience-reinvent4/helm/README.md new file mode 100644 index 0000000..1a44170 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/README.md @@ -0,0 +1,75 @@ +# Life Science - Reinvent4 + +This Helm Chart deploys a workload as a Kubernetes job for REINVENT4 run mode Transfer Learning (TL) + +## Prerequisites + +Ensure the following prerequisites are met before deploying any workloads: + +1. **Helm**: Install `helm`. Refer to the [Helm documentation](https://helm.sh/) for instructions. + +## Deploying the Workload + +It is recommended to use `helm template` and pipe the result to `kubectl create` , rather than using `helm install`. Generally, a command looks as follows + +```bash +helm template [optional-release-name] -f --set = | kubectl apply -f - +``` + +The chart provides three main ways to deploy models, detailed below. + +## User Input Values + +Refer to the `values.yaml` file for the user input values you can provide, along with instructions. + +### Verify Job + +Check the job status: + +```bash +kubectl get jobs +``` + +# Running Reinvent inference interactively + +Connect to the pod with your favorite terminal. + +The job runs the script `docker/lifescience/reinvent4/Reinvent_TLRL_clean.py` automatically. The logs can be followed by running `kubectl logs -f` + +Alternatively, you can uncomment the bottom part of the `values.yaml` file to run `Reinvent_demo_clean.py` as well. Or you can interactively connect to the job and run either of the notebooks manually. Just make sure you don't run two scripts at the same time by accident. 
+ +```sh +# Connect to the pod +kubectl exec -it <pod-name> -- /bin/bash + +python3 notebooks/<notebook-name>_clean.py +``` + +Alternatively, Reinvent jobs can be run with: +```sh +reinvent -l <logfile> <config>.toml +``` + +## Expected outputs from the demo runs + +- `Reinvent_demo_clean.py` + +| Agent | Prior | Target | Score | SMILES | SMILES_state | QED | QED (raw) | Stereo | Stereo (raw) | Alerts | Alerts (raw) | step | +|------------|-----------|------------|--------------|-----------------------------------------------------------------------------------------------------|--------------|-------------|-----------|--------|---------------|--------|---------------|------| +| 38.0957 | 38.0957 | -38.0276 | 0.000531 | Cc1ccc(C2CC(=O)Nc3cccc(NC45CC6CC(CC4C6)C5)c32)cc1 | 1 | 0.751039 | 0.7510 | 0.0 | 3.0 | 1.0 | 1.0 | 1 | +| 29.6628 | 29.6628 | 89.1671 | 0.928359 | Cc1cc(-c2nnn(CC3CCCCC3)n2)nc(C(=O)NC2CCC2)n1 | 1 | 0.883472 | 0.8835 | 1.0 | 0.0 | 1.0 | 1.0 | 1 | +| 33.0351 | 33.0351 | -33.0351 | 0.000000 | COc1ccc(CCCCON=C2CCC3CC2CN3C(=O)OCc2ccccc2)cc1 | 1 | 0.000000 | 0.0000 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | +| 22.6197 | 22.6197 | -22.6197 | 0.000000 | CCOC(=O)C1C(=O)N=C(N)NC1c1ccc2c(c1)OCO2 | 1 | 0.000000 | 0.0000 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | +| 28.3920 | 28.3920 | -28.3144 | 0.000606 | CC(NC(=O)c1nccs1)c1ccc2c(c1)COC2 | 1 | 0.935827 | 0.9358 | 0.0 | 1.0 | 1.0 | 1.0 | 1 | + +Total number of SMILES generated: 30000\ +Total number of invalid SMILES: 283\ +Total number of batch duplicate SMILES: 8\ +Total number of duplicate SMILES: 1317 + +- `Reinvent_TLRL_clean.py` + +(This is the number of "good binders" produced, defined by `QED < 0.8` and `ChemProp (raw) < -25.0`, before and after removing duplicates.) + +4\ +4 diff --git a/workloads/benchmark-lifescience-reinvent4/helm/mount/README.md b/workloads/benchmark-lifescience-reinvent4/helm/mount/README.md new file mode 100644 index 0000000..75734b3 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/mount/README.md @@ -0,0 +1,3 @@ +Files in this directory are mounted to the workload at `/workload/mount`. + +**Note:** Subdirectories and binary files are not supported. diff --git a/workloads/benchmark-lifescience-reinvent4/helm/overrides/kaiwo/kaiwo-enable.yaml b/workloads/benchmark-lifescience-reinvent4/helm/overrides/kaiwo/kaiwo-enable.yaml new file mode 100644 index 0000000..e6d278a --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/overrides/kaiwo/kaiwo-enable.yaml @@ -0,0 +1,3 @@ +# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload) +kaiwo: + enabled: true diff --git a/workloads/benchmark-lifescience-reinvent4/helm/templates/_helpers.tpl b/workloads/benchmark-lifescience-reinvent4/helm/templates/_helpers.tpl new file mode 100644 index 0000000..0861d3a --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/templates/_helpers.tpl @@ -0,0 +1,101 @@ +# Release name helper +{{- define "release.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +# Release fullname helper +{{- define "release.fullname" -}} +{{- $currentTime := now | date "20060102-1504" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- if ne .Release.Name "release-name" -}} +{{- include "release.name" . }}-{{ .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- include "release.name" . 
}}-{{ $currentTime | lower | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +# Init container resources helper +{{- define "init_container.resources" -}} +requests: + memory: "{{ max (mul .Values.gpus .Values.memory_per_gpu) 4 }}Gi" + cpu: "{{ max (mul .Values.gpus .Values.cpu_per_gpu) 1 }}" +limits: + memory: "{{ max (mul .Values.gpus .Values.memory_per_gpu) 4 }}Gi" + cpu: "{{ max (mul .Values.gpus .Values.cpu_per_gpu) 1 }}" +{{- end -}} + +# Container resources helper +{{- define "container.resources" -}} +requests: + {{- if .Values.gpus }} + amd.com/gpu: "{{ .Values.gpus }}" + {{- end }} + memory: "{{ max (mul .Values.gpus .Values.memory_per_gpu) 4 }}Gi" + cpu: "{{ max (mul .Values.gpus .Values.cpu_per_gpu) 1 }}" +limits: + {{- if .Values.gpus }} + amd.com/gpu: "{{ .Values.gpus }}" + {{- end }} + memory: "{{ max (mul .Values.gpus .Values.memory_per_gpu) 4 }}Gi" + cpu: "{{ max (mul .Values.gpus .Values.cpu_per_gpu) 1 }}" +{{- end -}} + +# Container environment variables helper +{{- define "container.env" -}} +{{- range $key, $value := .Values.env_vars }} +{{- if (typeIs "string" $value) }} +- name: {{ $key }} + value: {{ $value | quote }} +{{- else }} +- name: {{ $key }} + valueFrom: + secretKeyRef: + name: {{ $value.name }} + key: {{ $value.key }} +{{- end }} +{{- end }} +{{- end -}} + +# Container volume mounts helper +{{- define "container.volumeMounts" -}} +- mountPath: /workload + name: ephemeral-storage +- mountPath: /workload/mount + name: workload-mount +- mountPath: /dev/shm + name: dshm +{{- end -}} + +# Container volumes helper +{{- define "container.volumes" -}} +{{- if .Values.storage.ephemeral.storageClassName -}} +- ephemeral: + volumeClaimTemplate: + spec: + {{- if .Values.storage.ephemeral.accessModes }} + accessModes: {{ .Values.storage.ephemeral.accessModes }} + {{- else }} + accessModes: + - ReadWriteOnce + {{- end }} + resources: + requests: + storage: {{ .Values.storage.ephemeral.quantity }} + storageClassName: {{ .Values.storage.ephemeral.storageClassName }} + name: ephemeral-storage +{{- else }} +- emptyDir: {sizeLimit: {{ .Values.storage.ephemeral.quantity }}} + name: ephemeral-storage +{{- end }} +- emptyDir: + medium: Memory + sizeLimit: {{ .Values.storage.dshm.sizeLimit }} + name: dshm +- configMap: + name: {{ include "release.fullname" . }} + name: workload-mount +{{- end -}} diff --git a/workloads/benchmark-lifescience-reinvent4/helm/templates/configmap.yaml b/workloads/benchmark-lifescience-reinvent4/helm/templates/configmap.yaml new file mode 100644 index 0000000..db5a6c7 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/templates/configmap.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "release.fullname" . }} +data: +{{- $files := .Files }} +{{- range $path, $_ := .Files.Glob "mount/*" }} + {{ $key := $path | trimPrefix "mount/" }} + {{- $key }}: | +{{ $files.Get $path | indent 4 }} +{{- end }} diff --git a/workloads/benchmark-lifescience-reinvent4/helm/templates/job.yaml b/workloads/benchmark-lifescience-reinvent4/helm/templates/job.yaml new file mode 100644 index 0000000..5f3d079 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/templates/job.yaml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "release.fullname" . }} + labels: + app: {{ include "release.fullname" . 
}} + {{- range $key, $value := .Values.metadata.labels }} + {{ $key }}: {{ $value | quote }} + {{- end }} +spec: + ttlSecondsAfterFinished: 3600 + activeDeadlineSeconds: 7200 # 2 hours timeout + backoffLimit: 1 + template: + metadata: + labels: + app: {{ include "release.fullname" . }} + spec: + {{- if .Values.nodeSelector }} + nodeSelector: + {{- .Values.nodeSelector | toYaml | nindent 8 }} + {{- end }} + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{- range .Values.imagePullSecrets }} + - name: {{ . }} + {{- end }} + {{- end }} + {{- if .Values.init_args }} + initContainers: + - name: {{ .Chart.Name }}-init + image: {{ .Values.image | quote}} + imagePullPolicy: {{ default "Always" .Values.imagePullPolicy | quote }} + command: ["sh", "-c"] + args: + - | + {{- .Values.init_args | nindent 12 }} + resources: + {{- include "init_container.resources" . | nindent 12 }} + volumeMounts: + {{- include "container.volumeMounts" . | nindent 12 }} + {{- end}} + containers: + - name: {{ .Chart.Name }} + image: {{ .Values.image | quote}} + imagePullPolicy: {{ default "Always" .Values.imagePullPolicy | quote }} + {{- if .Values.entrypoint }} + command: ["sh", "-c"] + args: + - | + {{- .Values.entrypoint | nindent 12 }} + {{- end }} + resources: + {{- include "container.resources" . | nindent 12 }} + volumeMounts: + {{- include "container.volumeMounts" . | nindent 12 }} + + restartPolicy: Never + volumes: + {{- include "container.volumes" . | nindent 8 }} diff --git a/workloads/benchmark-lifescience-reinvent4/helm/values.schema.json b/workloads/benchmark-lifescience-reinvent4/helm/values.schema.json new file mode 100644 index 0000000..de2555c --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/values.schema.json @@ -0,0 +1,133 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "metadata": { + "type": "object", + "description": "Metadata for the deployment", + "properties": { + "labels": { + "type": "object", + "description": "Labels to apply to the deployment", + "additionalProperties": { + "type": "string" + } + } + }, + "required": ["labels"] + }, + "image": { + "type": "string", + "description": "Docker image to use for the deployment" + }, + "imagePullPolicy": { + "type": "string", + "description": "Image pull policy", + "enum": ["Always", "IfNotPresent", "Never"] + }, + "imagePullSecrets": { + "type": "array", + "description": "Image pull secrets for private registries" + }, + "entrypoint": { + "type": "string", + "description": "Entrypoint for the container" + }, + "init_args": { + "type": "string", + "description": "Commands for the initContainer" + }, + "gpus": { + "type": "integer", + "description": "Number of GPUs to allocate", + "minimum": 1 + }, + "memory_per_gpu": { + "type": "integer", + "description": "Memory per GPU in Gi", + "minimum": 1 + }, + "cpu_per_gpu": { + "type": "integer", + "description": "CPU cores per GPU", + "minimum": 1 + }, + "vllm_engine_args": { + "type": "object", + "description": "Arguments for the vllm engine", + "additionalProperties": { + "type": "string" + } + }, + "env_vars": { + "type": "object", + "description": "Environment variables for the deployment" + }, + "storage": { + "type": "object", + "description": "Storage configuration", + "properties": { + "ephemeral": { + "type": "object", + "description": "Ephemeral storage configuration", + "properties": { + "quantity": { + "type": "string", + "description": "Quantity of ephemeral storage" + }, + "storageClassName": { + "type": 
"string", + "description": "Storage class name for ephemeral storage" + }, + "accessModes": { + "type": "array", + "description": "Access modes for ephemeral storage", + "items": { + "type": "string" + } + } + }, + "required": ["quantity", "storageClassName", "accessModes"] + }, + "dshm": { + "type": "object", + "description": "Shared memory configuration", + "properties": { + "sizeLimit": { + "type": "string", + "description": "Size limit for shared memory" + } + }, + "required": ["sizeLimit"] + } + }, + "required": ["ephemeral", "dshm"] + }, + "nodeSelector": { + "type": "object", + "properties": { + "dev": { + "type": "string", + "description": "If true, use the dev node selector" + } + } + }, + "startupProbe": { + "type": ["object"], + "additionalProperties": true, + "description": "Startup probe configuration for the container" + }, + "livenessProbe": { + "type": "object", + "additionalProperties": true, + "description": "Liveness probe configuration for the container" + }, + "readinessProbe": { + "type": ["object"], + "additionalProperties": true, + "description": "Readiness probe configuration for the container" + } + }, + "required": ["metadata", "image", "imagePullPolicy", "gpus", "memory_per_gpu", "cpu_per_gpu", "storage"], + "additionalProperties": false +} diff --git a/workloads/benchmark-lifescience-reinvent4/helm/values.yaml b/workloads/benchmark-lifescience-reinvent4/helm/values.yaml new file mode 100644 index 0000000..8766630 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/values.yaml @@ -0,0 +1,48 @@ +metadata: + labels: {} + +# The build steps for this file can be found under ai-workloads/docker/lifescience/reinvent4/Dockerfile +image: ghcr.io/silogen/reinvent4:rocm7.0_ubuntu24.04_py3.12_pytorch_release_2.5.1 +imagePullPolicy: Always + +gpus: 1 +memory_per_gpu: 64 +cpu_per_gpu: 12 + +storage: + ephemeral: + quantity: 128Gi + # Change the storageClassName to standard if mlstorage is not available. + storageClassName: mlstorage + accessModes: + - ReadWriteOnce + dshm: + sizeLimit: 32Gi + +entrypoint: | + # Move the REINVENT4 directory to the ephemeral storage + mv /REINVENT4 /workload/REINVENT4 + + cd /workload/REINVENT4 + + START_TIME=$(date +%s) + + # Run the REINVENT4 demo notebook + if python3 ./notebooks/Reinvent_demo_clean.py; then + echo "REINVENT4 demo workflow completed successfully" + else + echo "REINVENT4 demo workflow failed with exit code: $?" + fi + + # # Uncomment the lines below if you want to run the TLRL notebook instead + # if python3 ./notebooks/Reinvent_TLRL_clean.py; then + # echo "REINVENT4 TLRL workflow completed successfully" + # else + # echo "REINVENT4 TLRL workflow failed with exit code: $?" 
+ # fi + + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + + echo "Benchmark completed at: $(date)" + echo "Total benchmark duration: ${DURATION} seconds ($(date -u -d @${DURATION} +%H:%M:%S))" diff --git a/workloads/benchmark-lifescience-semlaflow/helm/Chart.yaml b/workloads/benchmark-lifescience-semlaflow/helm/Chart.yaml new file mode 100644 index 0000000..fc336d6 --- /dev/null +++ b/workloads/benchmark-lifescience-semlaflow/helm/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: benchmark-lifescience-semlaflow +description: A Helm chart for SemlaFlow inference +version: 0.0.1 diff --git a/workloads/benchmark-lifescience-semlaflow/helm/README.md b/workloads/benchmark-lifescience-semlaflow/helm/README.md new file mode 100644 index 0000000..0a76468 --- /dev/null +++ b/workloads/benchmark-lifescience-semlaflow/helm/README.md @@ -0,0 +1,90 @@ +# Life Science - SemlaFlow + +This Helm Chart deploys the SemlaFlow inference workload. + +## SemlaFlow model + +The original repository is [here](https://github.com/rssrwn/semla-flow). + +This project introduces Semla, a novel equivariant attention-based message-passing architecture for molecular design and dynamics tasks. SemlaFlow, a molecular generation model, is trained using flow matching with optimal transport to generate realistic 3D molecular structures. + +## Scripts + +There are four scripts in the original SemlaFlow repository: +* `preprocess` - Used for preprocessing larger datasets into the internal representation used by the model for training +* `train` - Trains a MolFlow model on preprocessed data +* `evaluate` - Evaluates a trained model and prints the results +* `predict` - Runs the sampling for a trained model and saves the generated molecules + +## Choosing a GPU to attach a Docker container to + 1. Check which GPU is free with `amd-smi process`. + 2. Check the node id of the free GPU with `rocm-smi` (note: the node id is not the same as the device id; it is shown in the second column of the `rocm-smi` output). + 3. If, say, the GPU with node id 2 is free, the render device to pass to `docker run` is given by `cat /sys/class/kfd/kfd/topology/nodes/2/properties | grep drm_render_minor`. + 4. You can then create a container directly with `docker run --device=/dev/kfd --device=/dev/dri/renderD<N>`, where `<N>` is the `drm_render_minor` value from the previous step. + +## Running inference interactively + +Start a container on a cluster with the above-mentioned image and connect to it. + +Each script can be run as follows (where `