diff --git a/.github/workflows/docs-file-copy.yml b/.github/workflows/docs-file-copy.yml new file mode 100644 index 0000000..eb432eb --- /dev/null +++ b/.github/workflows/docs-file-copy.yml @@ -0,0 +1,35 @@ +name: Copy workload documentation to public docs repo +# We rsync the ai-workloads documentation to a temp clone of the public docs repo +# and commit and push the changes to the main branch of the public docs repo. Purpose is to keep the Docs repo (consolidated SiloGen docs) updated with ai-workloads repository changes. + +on: + push: + branches: + - main + paths: + - "docs/**" + - "workloads/**" + - ".github/workflows/docs-file-copy.yml" + +jobs: + copy-docs: + if: github.repository == 'silogen/ai-workloads' + runs-on: ubuntu-latest + steps: + - name: Checkout core repo + uses: actions/checkout@v4 + + - name: Push to public docs repo + run: | + git config --global user.name 'GitHub Actions' + git config --global user.email 'actions@github.com' + git clone https://x-access-token:${{ secrets.DOCS_REPO_TOKEN }}@github.com/silogen/ai-workloads.git source_docs + git clone https://x-access-token:${{ secrets.DOCS_REPO_TOKEN }}@github.com/silogen/docs.git target_silogen_docs + cd target_silogen_docs + rsync -av --delete --exclude='.git' ../source_docs/docs docs/ai-workloads-docs + rsync -av --delete --exclude='.git' ../source_docs/workloads docs/ai-workloads-manifests + git add . + git diff --staged --quiet || git commit -m "Update external docs from ai-workloads repo" + git push origin main + env: + DOCS_REPO_TOKEN: ${{ secrets.DOCS_REPO_TOKEN }} diff --git a/.gitignore b/.gitignore index ed7ea17..028313d 100644 --- a/.gitignore +++ b/.gitignore @@ -82,6 +82,9 @@ target/ profile_default/ ipython_config.py +# MacOS stuff +.DS_Store + # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 384fea5..39675cd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ repos: hooks: - id: check-json - id: check-yaml - exclude: templates|mkdocs.yml + exclude: templates|mkdocs.yml|vlm-lora-finetune - id: end-of-file-fixer - id: requirements-txt-fixer - id: trailing-whitespace @@ -20,20 +20,20 @@ repos: args: ["--config=pyproject.toml"] - repo: https://github.com/pycqa/flake8 - rev: 7.1.2 + rev: 7.2.0 hooks: - id: flake8 args: ["--config=.flake8"] - repo: https://github.com/pycqa/isort - rev: 6.0.0 + rev: 6.0.1 hooks: - id: isort name: isort (python) args: ["--settings-path=pyproject.toml"] - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.15.0 + rev: v1.16.0 hooks: - id: mypy args: ["--config-file=pyproject.toml"] @@ -43,7 +43,7 @@ repos: - types-PyYAML - repo: https://github.com/gruntwork-io/pre-commit - rev: v0.1.26 + rev: v0.1.29 hooks: - id: helmlint exclude: kaiwo|llm-finetune-silogen-engine diff --git a/docker/lifescience/reinvent4/Dockerfile b/docker/lifescience/reinvent4/Dockerfile new file mode 100644 index 0000000..4bb8d89 --- /dev/null +++ b/docker/lifescience/reinvent4/Dockerfile @@ -0,0 +1,28 @@ +FROM rocm/pytorch:rocm7.0_ubuntu24.04_py3.12_pytorch_release_2.5.1 + +# Use bash to support string substitution. 
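+# -o pipefail propagates a failure from any command in a pipeline, so RUN steps that pipe output fail instead of silently succeeding.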
+SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +# Clone the Reinvet4 repository and use the stable 4.6.22 version +RUN git clone https://github.com/MolecularAI/REINVENT4 +WORKDIR /REINVENT4 +RUN git checkout v4.6.22 +RUN wget -O priors/reinvent.prior "https://zenodo.org/records/15641297/files/reinvent.prior?download=1" + +# Remove torch and torchvision from pyproject.toml +RUN sed -i.bak '/torch==/d' pyproject.toml && \ + sed -i.bak '/torchvision /d' pyproject.toml + +# Now run the install script as usual +RUN python install.py cpu + +COPY demo_notebooks/ notebooks/ + +# Download the chemprop model +RUN mkdir chemprop +RUN wget -q --show-progress -O chemprop/model.pt "https://www.dropbox.com/scl/fi/zpnqc9at5a5dnkzfdbo6g/model.pt?rlkey=g005yli9364uptd94d60jtg5c&e=1&dl=1" + +# Copy entrypoint script +COPY entrypoint.sh / + +CMD ["/bin/bash"] diff --git a/docker/lifescience/reinvent4/README.md b/docker/lifescience/reinvent4/README.md new file mode 100644 index 0000000..283ae98 --- /dev/null +++ b/docker/lifescience/reinvent4/README.md @@ -0,0 +1,31 @@ +# Running Reinvent inference interactively + +Connect to the pod with your favorite terminal. + +This repo provides an altered version of these notebooks to be runnable from the terminal with the subscript `_clean`. These can simply be run by: + +```sh +python3 notebooks/ +``` +Alternatively, Reinvent jobs can be run by: +```sh +reinvent -l +``` + +## Running inference job automatically (non-interactive) + +In order to run Reinvent jobs automatically using the above image do the following: +- Set up config and output directory: + +Put your config files as well any other files needed such as datasets or priors in `CONFIG_PATH`. In `OUTPUT_PATH`, the job will write output logs. +```sh +export CONFIG_PATH= +export OUTPUT_PATH= +``` + +Then, the following command will run the job: + +```sh +docker run --rm -v $CONFIG_PATH:/data -v $OUTPUT_PATH:/output --device=/dev/kfd --device=/dev/dri/renderD rocm-reinvent /data/.toml /output/ +``` +where the last two arguments provide paths to the config file to run as well where to save outputs. diff --git a/docker/lifescience/reinvent4/demo_notebooks/Reinvent_TLRL_clean.py b/docker/lifescience/reinvent4/demo_notebooks/Reinvent_TLRL_clean.py new file mode 100644 index 0000000..aba73f8 --- /dev/null +++ b/docker/lifescience/reinvent4/demo_notebooks/Reinvent_TLRL_clean.py @@ -0,0 +1,223 @@ +# This script is based on the file notebooks/Reinvent_TLRL.py: +# https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_TLRL.py + +import os +import re +import shutil +import subprocess + +import pandas as pd + + +def main(): + wd = "R4_TLRL_output" + + # Delete existing working directory and create a new one + if not os.path.isdir(wd): + shutil.rmtree(wd, ignore_errors=True) + os.mkdir(wd) + os.chdir(wd) + + # Write config file + prior_filename = "../priors/reinvent.prior" + agent_filename = prior_filename + stage1_checkpoint = "stage1.chkpt" + stage1_parameters = f""" + run_type = "staged_learning" + device = "cuda:0" + tb_logdir = "tb_stage1" + json_out_config = "_stage1.json" + + [parameters] + prior_file = "{prior_filename}" + agent_file = "{agent_filename}" + summary_csv_prefix = "stage1" + batch_size = 100 + use_checkpoint = false + sample_strategy = "beamsearch" #Additional interesting param? 
+ + [learning_strategy] + type = "dap" + sigma = 128 + rate = 0.0001 + + [[stage]] + max_score = 1.0 + max_steps = 5 + chkpt_file = "{stage1_checkpoint}" + [stage.scoring] + type = "geometric_mean" + [[stage.scoring.component]] + [stage.scoring.component.custom_alerts] + [[stage.scoring.component.custom_alerts.endpoint]] + name = "Alerts" + params.smarts = [ "[*;r8]", "[*;r9]", "[*;r10]", "[*;r11]", "[*;r12]", "[*;r13]", "[*;r14]", "[*;r15]", "[*;r16]", "[*;r17]", "[#8][#8]", "[#6;+]", "[#16][#16]", "[#7;!n][S;!$(S(=O)=O)]", "[#7;!n][#7;!n]", "C#C", "C(=[O,S])[O,S]", "[#7;!n][C;!$(C(=[O,N])[N,O])][#16;!s]", "[#7;!n][C;!$(C(=[O,N])[N,O])][#7;!n]", "[#7;!n][C;!$(C(=[O,N])[N,O])][#8;!o]", "[#8;!o][C;!$(C(=[O,N])[N,O])][#16;!s]", "[#8;!o][C;!$(C(=[O,N])[N,O])][#8;!o]", "[#16;!s][C;!$(C(=[O,N])[N,O])][#16;!s]" ] + [[stage.scoring.component]] + [stage.scoring.component.QED] + [[stage.scoring.component.QED.endpoint]] + name = "QED" + weight = 0.6 + [[stage.scoring.component]] + [stage.scoring.component.NumAtomStereoCenters] + [[stage.scoring.component.NumAtomStereoCenters.endpoint]] + name = "Stereo" + weight = 0.4 + transform.type = "left_step" + transform.low = 0 + """ + + stage1_config_filename = "stage1.toml" + with open(stage1_config_filename, "w") as tf: + tf.write(stage1_parameters) + + # Stage 1 Reinforcement Learning + shutil.rmtree("tb_stage1_0", ignore_errors=True) + + # Run the stage1 process using subprocess + print("Starting Stage 1 Reinforcement Learning...") + stage1_result = subprocess.run(f"reinvent {stage1_config_filename} 2>&1 | tee stage1.log", shell=True, text=True) + if stage1_result.returncode == 0: + print("Stage 1 completed.") + else: + raise RuntimeError(f"Stage 1 execution failed with exit code: {stage1_result.returncode}") + + # Transfer Learning to focus the model + # Prepare the data + bdb = pd.read_csv("../notebooks/data/tnks2.csv") + clean = bdb[~bdb["exp (nM)"].str.match("[<>]")] + clean = clean.astype({"exp (nM)": "float"}) + + good = clean[clean["exp (nM)"] < 1000] + good = good[good["exp_method"] != "EC50"] + good = good[good["exp_method"] != "Kd"] + good = good.rename(columns={"exp (nM)": "IC50"}) + good = good.drop(columns=["exp_method"]) + + # Write the good binders to a SMILES file + TL_train_filename = "tnks2_train.smi" + TL_validation_filename = "tnks2_validation.smi" + data = good.sample(frac=1) + n_head = int(0.8 * len(data)) # 80% of the data for training + n_tail = len(good) - n_head + print(f"number of molecules for: training={n_head}, validation={n_tail}") + + train, validation = data.head(n_head), data.tail(n_tail) + train.to_csv(TL_train_filename, sep="\t", index=False, header=False) + validation.to_csv(TL_validation_filename, sep="\t", index=False, header=False) + + # TL setup + TL_parameters = f""" + run_type = "transfer_learning" + device = "cuda:0" + tb_logdir = "tb_TL" + + [parameters] + num_epochs = 1 + save_every_n_epochs = 1 + batch_size = 100 + sample_batch_size = 2000 + input_model_file = "{stage1_checkpoint}" + output_model_file = "TL_reinvent.model" + smiles_file = "{TL_train_filename}" + validation_smiles_file = "{TL_validation_filename}" + standardize_smiles = true + randomize_smiles = true + randomize_all_smiles = false + internal_diversity = true + """ + + TL_config_filename = "transfer_learning.toml" + with open(TL_config_filename, "w") as tf: + tf.write(TL_parameters) + + # Start Transfer Learning + shutil.rmtree("tb_TL", ignore_errors=True) + + # Run the transfer learning process using subprocess + print("Starting Transfer 
Learning...") + transfer_result = subprocess.run( + f"reinvent {TL_config_filename} 2>&1 | tee transfer_learning.log", shell=True, text=True + ) + if transfer_result.returncode == 0: + print("Transfer learning completed.") + else: + raise RuntimeError(f"Transfer learning execution failed with exit code: {transfer_result.returncode}") + + # Choose the model from transfer learning + TL_model_filename = "TL_reinvent.model.1.chkpt" + + stage2_parameters = re.sub("stage1", "stage2", stage1_parameters) + stage2_parameters = re.sub("agent_file.*\n", f"agent_file = '{TL_model_filename}'\n", stage2_parameters) + stage2_parameters = re.sub("max_steps.*\n", "max_steps = 5\n", stage2_parameters) + + # Stage 2 RL + # Predictive model (ChemProp) + chemprop_path = "../chemprop/" + pred_model_parameters = f""" + [[stage.scoring.component]] + [stage.scoring.component.ChemProp] + [[stage.scoring.component.ChemProp.endpoint]] + name = "ChemProp" + weight = 0.6 + params.checkpoint_dir = "{chemprop_path}" + params.rdkit_2d_normalized = true + params.target_column = "DG" + params.features = "rdkit_2d_normalized" + transform.type = "reverse_sigmoid" + transform.high = 0.0 + transform.low = -50.0 + transform.k = 0.4 + """ + + # Combine parameters and write to file + full_stage2_parameters = stage2_parameters + pred_model_parameters + df_parameters = """ + [diversity_filter] + type = "IdenticalMurckoScaffold" + bucket_size = 10 + minscore = 0.7 + """ + inception_parameters = """ + [inception] + smiles_file = "" # no seed SMILES + memory_size = 50 + sample_size = 10 + """ + + full_stage2_parameters += df_parameters + inception_parameters + stage2_config_filename = "stage2.toml" + with open(stage2_config_filename, "w") as tf: + tf.write(full_stage2_parameters) + + # Run stage2 using subprocess + print("Starting Stage 2 Reinforcement Learning...") + stage2_result = subprocess.run(f"reinvent {stage2_config_filename} 2>&1 | tee stage2.log", shell=True, text=True) + if stage2_result.returncode == 0: + print("Stage 2 completed.") + else: + raise RuntimeError(f"Stage 2 execution failed with exit code: {stage2_result.returncode}") + + # Inspect results with TensorBoard + # Run TensorBoard separately after REINVENT finishes + # subprocess.run(["tensorboard", "--bind_all", "--logdir", f"{wd}/tb_stage2_0"]) + + # Process the results for good binders + # csv_file = os.path.join(wd, "stage2_1.csv") + csv_file = "stage2_1.csv" + df = pd.read_csv(csv_file) + good_QED = df["QED"] > 0.8 + good_dG = df["ChemProp (raw)"] < -25.0 # kcal/mol + good_binders = df[good_QED & good_dG] + print(len(good_binders)) + + # Duplicate removal + good_binders = good_binders.drop_duplicates(subset=["SMILES"]) + print(len(good_binders)) + + # Displaying good binders + # grid = create_mol_grid(good_binders) + # display(grid) + + +if __name__ == "__main__": + main() diff --git a/docker/lifescience/reinvent4/demo_notebooks/Reinvent_demo_clean.py b/docker/lifescience/reinvent4/demo_notebooks/Reinvent_demo_clean.py new file mode 100644 index 0000000..60c8ac6 --- /dev/null +++ b/docker/lifescience/reinvent4/demo_notebooks/Reinvent_demo_clean.py @@ -0,0 +1,160 @@ +# This script is based on the file notebooks/Reinvent_demo.py: +# https://github.com/MolecularAI/REINVENT4/blob/main/notebooks/Reinvent_demo.py + +import os +import shutil +import subprocess + +import pandas as pd + + +def setup_work_directory(wd): + shutil.rmtree(wd, ignore_errors=True) + os.mkdir(wd) + os.chdir(wd) + + +def write_config_file(filename, config_data): + with open(filename, "w") as 
tf: + tf.write(config_data) + + +def run_reinvent(config_filename, log_filename="stage1.log"): + try: + result = subprocess.run( + ["reinvent", "-l", log_filename, config_filename], + check=True, + capture_output=True, + text=True, + ) + print(result.stdout) + except subprocess.CalledProcessError as e: + print(f"Error occurred while running REINVENT: {e}") + print(f"STDOUT: {e.stdout}") + print(f"STDERR: {e.stderr}") + raise + + +def analyze_results(wd): + df = pd.read_csv("stage1_1.csv") + print(df.head()) + return df + + +def calculate_sample_efficiency(df): + total_smilies = len(df) + total_invalid_smilies = len(df[df["SMILES_state"] == 0]) + total_batch_duplicate_smilies = len(df[df["SMILES_state"] == 2]) + total_duplicate_smilies = len(df[df.duplicated(subset=["SMILES"])]) + + print( + f"Total number of SMILES generated: {total_smilies}\n" + f"Total number of invalid SMILES: {total_invalid_smilies}\n" + f"Total number of batch duplicate SMILES: {total_batch_duplicate_smilies}\n" + f"Total number of duplicate SMILES: {total_duplicate_smilies}" + ) + + +if __name__ == "__main__": + wd = "R4_notebooks_output" + setup_work_directory(wd) + + prior_filename = "../priors/reinvent.prior" + agent_filename = prior_filename + + global_parameters = """ + run_type = "staged_learning" + device = "cuda:0" + tb_logdir = "tb_stage1" + json_out_config = "_stage1.json" + """ + parameters = f""" + [parameters] + + prior_file = "{prior_filename}" + agent_file = "{agent_filename}" + summary_csv_prefix = "stage1" + + batch_size = 100 + + use_checkpoint = false + """ + + learning_strategy = """ + [learning_strategy] + + type = "dap" + sigma = 128 + rate = 0.0001 + """ + + stages = """ + [[stage]] + + max_score = 1.0 + max_steps = 300 + + chkpt_file = 'stage1.chkpt' + + [stage.scoring] + type = "geometric_mean" + + [[stage.scoring.component]] + [stage.scoring.component.custom_alerts] + + [[stage.scoring.component.custom_alerts.endpoint]] + name = "Alerts" + + params.smarts = [ + "[*;r8]", + "[*;r9]", + "[*;r10]", + "[*;r11]", + "[*;r12]", + "[*;r13]", + "[*;r14]", + "[*;r15]", + "[*;r16]", + "[*;r17]", + "[#8][#8]", + "[#6;+]", + "[#16][#16]", + "[#7;!n][S;!$(S(=O)=O)]", + "[#7;!n][#7;!n]", + "C#C", + "C(=[O,S])[O,S]", + "[#7;!n][C;!$(C(=[O,N])[N,O])][#16;!s]", + "[#7;!n][C;!$(C(=[O,N])[N,O])][#7;!n]", + "[#7;!n][C;!$(C(=[O,N])[N,O])][#8;!o]", + "[#8;!o][C;!$(C(=[O,N])[N,O])][#16;!s]", + "[#8;!o][C;!$(C(=[O,N])[N,O])][#8;!o]", + "[#16;!s][C;!$(C(=[O,N])[N,O])][#16;!s]" + ] + + [[stage.scoring.component]] + [stage.scoring.component.QED] + + [[stage.scoring.component.QED.endpoint]] + name = "QED" + weight = 0.6 + + + [[stage.scoring.component]] + [stage.scoring.component.NumAtomStereoCenters] + + [[stage.scoring.component.NumAtomStereoCenters.endpoint]] + name = "Stereo" + weight = 0.4 + + transform.type = "left_step" + transform.low = 0 + """ + + config = global_parameters + parameters + learning_strategy + stages + + toml_config_filename = "stage1.toml" + write_config_file(toml_config_filename, config) + + run_reinvent(toml_config_filename) + df = analyze_results(wd) + calculate_sample_efficiency(df) diff --git a/docker/lifescience/reinvent4/entrypoint.sh b/docker/lifescience/reinvent4/entrypoint.sh new file mode 100644 index 0000000..673e51e --- /dev/null +++ b/docker/lifescience/reinvent4/entrypoint.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +CONFIG_FILE="$1" +OUTPUT_FILE="$2" + +# Check if OUTPUT ends with ".log" and append it if not +if [[ ! 
"$OUTPUT_FILE" =~ .log$ ]]; then + OUTPUT_FILE="${OUTPUT_FILE}.log" +fi + +if [ -n "$CONFIG_FILE" ] && [ -n "$OUTPUT_FILE" ]; then + echo "Config file provided: $CONFIG_FILE" + echo "Output file provided: $OUTPUT_FILE" + echo "Running software in automated mode..." + + # Set the correct permissions for the config file + chmod +r "$CONFIG_FILE" + + exec reinvent -l "$OUTPUT_FILE" "$CONFIG_FILE" + + echo "Processing complete. Results saved in $(dirname "$OUTPUT_FILE")." +else + echo "No config file or output file provided. Starting interactive mode..." + exec /bin/bash +fi diff --git a/docker/lifescience/semlaflow/Dockerfile b/docker/lifescience/semlaflow/Dockerfile new file mode 100644 index 0000000..dc7d938 --- /dev/null +++ b/docker/lifescience/semlaflow/Dockerfile @@ -0,0 +1,16 @@ +FROM rocm/pytorch:rocm7.0_ubuntu24.04_py3.12_pytorch_release_2.8.0 + +RUN apt-get update -y \ + && DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \ + ca-certificates \ + && apt-get autoremove -y \ + && apt-get clean + +RUN pip install numpy==1.26.2 pandas==2.2.2 scipy==1.11.4 rdkit lightning torchmetrics openbabel-wheel typing_extensions wandb numba hiredis tqdm ipython certifi + +RUN git clone https://github.com/rssrwn/semla-flow.git +WORKDIR /semla-flow + +COPY entrypoint.sh /entrypoint.sh + +CMD ["/bin/bash"] diff --git a/docker/lifescience/semlaflow/README.md b/docker/lifescience/semlaflow/README.md new file mode 100644 index 0000000..42abe2e --- /dev/null +++ b/docker/lifescience/semlaflow/README.md @@ -0,0 +1,17 @@ +## Running inference job automatically (non-interactive) + +In order to run SemlaFlow jobs automatically using the above image do the following: +- Set up config and output directory: + +Put your config files as well any other files needed such as datasets or priors in `CONFIG_PATH`. In `OUTPUT_PATH`, the job will write output logs. + +```sh +export DATA_PATH= +export OUTPUT_PATH= +``` + +Then, the following command will run the job: + +```sh +docker run -it --shm-size=256g --device=/dev/kfd --device=/dev/dri/renderD --network host --ipc host --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v $DATA_PATH:/data -v $OUTPUT_PATH:/output rocm-semlaflow + + + + + + +
+```
+
[Extraction residue: this diff also adds what appears to be an HTML upload form; only its heading "Submit Training Job" and the prompt "Provide a YAML configuration and a ZIP file containing the dataset." are recoverable here.]
+ + + + diff --git a/docker/vlm-lora-finetune/misc_test_files/training_jobs.db b/docker/vlm-lora-finetune/misc_test_files/training_jobs.db new file mode 100755 index 0000000..f0fd733 Binary files /dev/null and b/docker/vlm-lora-finetune/misc_test_files/training_jobs.db differ diff --git a/docker/vlm-lora-finetune/old_requirements.txt b/docker/vlm-lora-finetune/old_requirements.txt new file mode 100644 index 0000000..6811354 --- /dev/null +++ b/docker/vlm-lora-finetune/old_requirements.txt @@ -0,0 +1,10 @@ +accelerate==0.24.1 +bitsandbytes==0.41.1 +datasets[vision] +open_clip_torch==2.23.0 +pandas==2.1.2 +peft +scipy +torch==2.1.0 +tqdm==4.66.1 +wandb==0.15.12 diff --git a/docker/vlm-lora-finetune/setup.py b/docker/vlm-lora-finetune/setup.py new file mode 100644 index 0000000..e69de29 diff --git a/docker/vlm-lora-finetune/train.py b/docker/vlm-lora-finetune/train.py new file mode 100644 index 0000000..05d616d --- /dev/null +++ b/docker/vlm-lora-finetune/train.py @@ -0,0 +1,247 @@ +import argparse +import itertools +import logging +import os +import time +from typing import Callable + +import numpy as np +import open_clip +import torch +from accelerate import Accelerator +from clipora.config import TrainConfig, parse_yaml_to_config, save_config_to_yaml +from clipora.data import get_dataloader +from clipora.lora.inject import inject_linear_attention +from clipora.scheduler.cosine import cosine_lr +from peft import LoraConfig, PeftModel, get_peft_model +from tqdm.auto import tqdm + +logger = logging.getLogger(__name__) + + +def compute_clip_loss(model, X, Y): + loss = open_clip.ClipLoss() + image_features, text_features, logit_scale = model(X, Y) + total_loss = loss(image_features, text_features, logit_scale) + return total_loss + + +@torch.no_grad() +def evaluate(model, dataloader, config): + out = {} + model.eval() + losses = torch.zeros(config.eval_steps) + for k in range(config.eval_steps): + X, Y = next(iter(dataloader)) + loss = compute_clip_loss(model, X, Y) + losses[k] = loss.item() + out["eval_loss"] = losses.mean() + model.train() + return out + + +def init_model(config: TrainConfig, lora_adapter_path=None, full_weights_path=None): + # lora_adapter_path = checkpoint path. 
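+    #   (a directory produced by save_pretrained(); when given, existing LoRA weights are loaded from it instead of creating a fresh adapter)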
+ # full_weights_path = if given, dont download pretrained weights, instead load weights from this + pretrained = None if full_weights_path is not None else config.pretrained + model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms( + model_name=config.model_name, + pretrained=pretrained, + ) + model_config = open_clip.get_model_config(config.model_name) + if config.lora_text: + model = inject_linear_attention( + model=model, + encoders={"transformer"}, + embed_dim=model_config["embed_dim"], + num_heads=model_config["text_cfg"]["heads"], + ) + if config.lora_vision: + model = inject_linear_attention( + model=model, + encoders={"visual.transformer"}, + embed_dim=model_config["vision_cfg"]["width"], + num_heads=config.vision_heads, + ) + + if full_weights_path: + model.load_state_dict(torch.load(full_weights_path)) + + # If not None, load existing loras from here + if lora_adapter_path: + model = PeftModel.from_pretrained(model, lora_adapter_path) + + else: + lora_config = LoraConfig( + r=config.lora_rank, + lora_alpha=config.lora_alpha, + lora_dropout=config.lora_dropout, + target_modules=["qkv", "proj"], + ) + model = get_peft_model(model, lora_config) + + if config.compile: + model.compile() + return model, preprocess_train + + +def main(config: TrainConfig, job_callback: Callable | None = None): + """Main training loop. + + Args: + config (TrainConfig): clipora training config + job_callback (Callable | None, optional): A callback function to update job status + each iteration. Used by API. Defaults to None. + """ + logging.basicConfig(level=logging.INFO) + + accelerator = Accelerator( + gradient_accumulation_steps=config.gradient_accumulation_steps, + log_with="wandb" if config.wandb else None, + ) + + if accelerator.is_main_process: + accelerator.print() + if config.output_dir is not None: + accelerator.print(f"Output directory: {config.output_dir}") + os.makedirs(config.output_dir, exist_ok=True) + + if config.wandb: + accelerator.init_trackers( + project_name=config.wandb_project if config.wandb_project else None, + ) + + if config.seed is not None: + seed = config.seed + accelerator.print(f"Using seed {seed}") + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + model, preprocess_train = init_model(config) + + train_dataloader = get_dataloader(config, preprocess_train, "train") + eval_dataloader = get_dataloader(config, preprocess_train, "val") + assert len(train_dataloader), "No data found, please check your data location." 
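+    # Note: the validation dataloader above reuses the training image preprocessing (preprocess_train).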
+ + if config.gradient_checkpointing: + model.set_grad_checkpointing(True) + + if config.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError("To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`.") + + optimizer_class = bnb.optim.AdamW8bit + else: + optimizer_class = torch.optim.AdamW + + if isinstance(config.learning_rate, str): + config.learning_rate = float(config.learning_rate) + + params_to_optimize = [ + { + "params": itertools.chain(model.parameters()), + "lr": config.learning_rate, + }, + ] + + optimizer = optimizer_class( + params_to_optimize, + lr=config.learning_rate, + betas=(config.adam_beta1, config.adam_beta2), + eps=config.adam_epsilon, + ) + + # create scheduler if train + total_steps = train_dataloader.num_batches * config.epochs + # if args.warmup is float, it is a percentage of total_steps + if isinstance(config.warmup, float): + assert 0 <= config.warmup <= 1, "Warmup must be between 0 and 1 if not a fixed number of steps." + config.warmup = int(config.warmup * total_steps) + + scheduler = cosine_lr(optimizer, config.learning_rate, config.warmup, total_steps) + + model, optimizer, scheduler, train_dataloader, eval_dataloader = accelerator.prepare( + model, optimizer, scheduler, train_dataloader, eval_dataloader + ) + + print("***** Running training *****") + print(f" Using device: {accelerator.device}") + print(f" Num Iters = {len(train_dataloader)}") + print(f" Num Epochs = {config.epochs}") + print(f" Instantaneous batch size per device = {config.batch_size}") + print(f" Gradient Accumulation steps = {config.gradient_accumulation_steps}") + # Only show the progress bar once on each machine. + progress_bar = tqdm( + range(config.epochs * len(train_dataloader)), + disable=not accelerator.is_local_main_process, + ) + progress_bar.set_description("Steps") + global_step = 0 + best_val_loss = float("inf") + + for epoch in range(config.epochs): + model.train() + for step, batch in enumerate(train_dataloader): + if accelerator.is_local_main_process: + if global_step % config.eval_interval == 0: + if accelerator.is_local_main_process: + eval_loss = evaluate(model, eval_dataloader, config) + accelerator.log(eval_loss, step=global_step) + progress_bar.write(f"Step: {global_step}, Eval loss: {eval_loss['eval_loss']}") + if eval_loss["eval_loss"] < best_val_loss: + best_val_loss = eval_loss["eval_loss"] + checkpoint_name = f"checkpoint_{global_step}" + save_path = os.path.join(config.output_dir, checkpoint_name) + model.save_pretrained(save_path) + if job_callback: + job_callback(best_finetuned_model_path=checkpoint_name) + # save the clipora config we used for training for later use and bookkeeping + save_config_to_yaml(config, os.path.join(save_path, "train_config.yaml")) + + X, Y = batch + loss = compute_clip_loss(model, X, Y) + accelerator.backward(loss) + if accelerator.sync_gradients: + params_to_clip = model.parameters() + accelerator.clip_grad_norm_(params_to_clip, 1.0) # args.max_grad_norm) + optimizer.step() + scheduler(global_step) + progress_bar.update(1) + if job_callback: + percent = int((epoch * len(train_dataloader) + step) / (config.epochs * len(train_dataloader)) * 100) + job_callback(status="training", detail=f"Training at {percent}%") + global_step += 1 + + logs = { + "loss": loss.item(), + "learning_rate": optimizer.param_groups[0]["lr"], + "step": global_step, + "epoch": epoch, + } + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + 
accelerator.wait_for_everyone() + + if accelerator.is_local_main_process: + save_path = os.path.join(config.output_dir, "final") + model.save_pretrained(save_path) + save_config_to_yaml(config, os.path.join(save_path, "train_config.yaml")) + + accelerator.print("\n\nTraining completed.\n\n") + accelerator.end_training() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + type=str, + help="The path to the yaml file containing the training configuration.", + ) + print(f"Starting clipora training with config: {parser.parse_args().config}") + config = parse_yaml_to_config(parser.parse_args().config) + main(config) diff --git a/docker/vlm-lora-finetune/train_config.yml b/docker/vlm-lora-finetune/train_config.yml new file mode 100644 index 0000000..d6455e2 --- /dev/null +++ b/docker/vlm-lora-finetune/train_config.yml @@ -0,0 +1,35 @@ +model_name: "ViT-L-14" +pretrained: "datacomp_xl_s13b_b90k" +compile: False +seed: 1337 + +device: "cuda" +output_dir: "./output" + +wandb: True +wandb_project: test-clipora + +train_dataset: "awilliamson/fashion-train" +eval_dataset: "awilliamson/fashion-eval" +datatype: "hf" +csv_separator: "\t" +image_col: "image" +text_col: "text" +shuffle: True + +lora_rank: 16 +lora_alpha: 32 +lora_dropout: 0.0 + +batch_size: 32 +gradient_accumulation_steps: 1 +gradient_checkpointing: False + +use_8bit_adam: False + +learning_rate: 1e-4 +epochs: 3 +warmup: 0.01 +save_interval: 1000 +eval_interval: 100 +eval_steps: 100 diff --git a/docker/vlm-lora-finetune/visualize_results.py b/docker/vlm-lora-finetune/visualize_results.py new file mode 100644 index 0000000..c497909 --- /dev/null +++ b/docker/vlm-lora-finetune/visualize_results.py @@ -0,0 +1,203 @@ +import argparse +import os +import random + +import matplotlib.patches as patches +import matplotlib.pyplot as plt +import numpy as np +import open_clip # or open_clip +import pandas as pd +import torch +from clipora.config import parse_yaml_to_config +from matplotlib.colors import Normalize +from PIL import Image +from train import init_model + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + +def visualize_results(image_path, texts, before, after, output_dir): + """Saves an image with a table showing text probabilities before and after LORA fine-tuning. + texts, before and after have to be the same length + Args: + image_path (str): The path to the input image. + texts (list): The list of text prompts. + before (list): The probabilities before fine-tuning. + after (list): The probabilities after fine-tuning. + output_dir (str): The directory where the output image will be saved. 
+ """ + img = Image.open(image_path) + norm = Normalize(vmin=0, vmax=1) + + # Find max indices + max_before_idx = np.argmax(before) + max_after_idx = np.argmax(after) + + # Calculate dynamic column widths based on text length + max_text_length = max(len(text) for text in texts) + + # Adjust column widths dynamically + if max_text_length > 50: # Long text + col_widths = [0.6, 0.2, 0.2] # Give more space to text + elif max_text_length > 30: # Medium text + col_widths = [0.5, 0.25, 0.25] + else: # Short text + col_widths = [0.4, 0.3, 0.3] # Original proportions + + # Create figure with dynamic width + fig_width = max(12, max_text_length * 0.15) # Scale figure width with text length + fig, ax = plt.subplots(figsize=(fig_width, max(6, len(texts) * 0.5 + 2))) + ax.axis("off") + + # Image panel - adjust based on figure width + img_width = min(0.25, 3.0 / fig_width) # Cap image width but scale with figure + img_ax = fig.add_axes([0.05, 0.1, img_width, 0.8]) + img_ax.imshow(img) + img_ax.axis("off") + + # Table panel - use remaining space + table_start = 0.05 + img_width + 0.05 + table_width = 0.9 - table_start + table_ax = fig.add_axes([table_start, 0.1, table_width, 0.8]) + table_ax.axis("off") + + # Calculate positions + x_positions = np.cumsum([0] + col_widths[:-1]) + + # Table headers + headers = ["Text", "Before", "After"] + for i, header in enumerate(headers): + table_ax.text( + x_positions[i], 1, header, ha="left", va="bottom", fontsize=12, weight="bold", transform=table_ax.transAxes + ) + + # Calculate row height based on number of items + row_height = min(0.08, 0.7 / len(texts)) # Dynamic row height + + # Draw cells + for i, (text, b, a) in enumerate(zip(texts, before, after)): + y = 0.9 - i * row_height + + # Handle long text with wrapping + if len(text) > 40: + # Split long text into multiple lines + words = text.split() + lines = [] + current_line = "" + max_chars_per_line = int(40 * col_widths[0] / 0.4) # Scale with column width + + for word in words: + if len(current_line + " " + word) <= max_chars_per_line: + current_line += " " + word if current_line else word + else: + if current_line: + lines.append(current_line) + current_line = word + if current_line: + lines.append(current_line) + + # Display multiline text + for j, line in enumerate(lines): + table_ax.text( + x_positions[0], y - j * 0.02, line, ha="left", va="center", fontsize=9, transform=table_ax.transAxes + ) + else: + # Display single line text + table_ax.text(x_positions[0], y, text, ha="left", va="center", fontsize=10, transform=table_ax.transAxes) + + # Draw probability boxes + for j, val in enumerate([b, a], start=1): + color = plt.cm.Blues(norm(val)) if j == 1 else plt.cm.Reds(norm(val)) + x = x_positions[j] + width = col_widths[j] + + # Highlight max values with red border + edge_color = "red" if (j == 1 and i == max_before_idx) or (j == 2 and i == max_after_idx) else "black" + + rect = patches.Rectangle( + (x, y - row_height / 2 + 0.01), + width, + row_height - 0.02, + transform=table_ax.transAxes, + color=color, + ec=edge_color, + lw=1.5, + ) + table_ax.add_patch(rect) + + table_ax.text( + x + width / 2, + y, + f"{val:.2f}", + ha="center", + va="center", + fontsize=10, + color="white" if val > 0.5 else "black", # Better contrast + weight="bold", + transform=table_ax.transAxes, + ) + + plt.savefig(os.path.join(output_dir, "output_image.png"), bbox_inches="tight", dpi=150) + plt.close() + + +def main(original_model, lora_model, preprocess, config, csv_path=None): + """Display probabilities before and after LORA 
fine-tuning for 10 random texts + from the evaluation dataset. + """ + # === Load CSV and Select Data === + csv_path = csv_path or config.eval_dataset + df = pd.read_csv(csv_path) + # Get 10 random texts from csv as classes + classes = df[config.text_col].drop_duplicates().sample(10).tolist() + # Get an image which has the first text as the correct one + correct_text = classes[0] + img_path = df[df[config.text_col] == correct_text][config.image_col].iloc[0] + # === Preprocess Inputs === + image = preprocess(Image.open(img_path)).unsqueeze(0).to(device) + text_tokens = open_clip.tokenize(classes).to(device) + + with torch.no_grad(): + img_feat_before = original_model.encode_image(image) + txt_feat_before = original_model.encode_text(text_tokens) + probs_before = (img_feat_before @ txt_feat_before.T).softmax(dim=-1).squeeze().cpu().numpy() + + img_feat_after = lora_model.encode_image(image) + txt_feat_after = lora_model.encode_text(text_tokens) + probs_after = (img_feat_after @ txt_feat_after.T).softmax(dim=-1).squeeze().cpu().numpy() + + print("probs before:") + print(probs_before) + print("probs after:") + print(probs_after) + + visualize_results(img_path, classes, probs_before, probs_after, config.output_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--config", + type=str, + help="The path to the yaml file containing the training configuration.", + ) + + parser.add_argument( + "--lora_adapter_path", + type=str, + help="The path to the LoRA adapter weights, e.g. checkpoint folder.", + ) + + args = parser.parse_args() + lora_adapter_path = args.lora_adapter_path + config = parse_yaml_to_config(args.config) + lora_model, preprocess = init_model(config, lora_adapter_path=lora_adapter_path) + # Load the original CLIP model (no LORA) + original_model, _, _ = open_clip.create_model_and_transforms( + model_name=config.model_name, + pretrained=config.pretrained, + ) + + original_model = original_model.to(device) + lora_model = lora_model.to(device) + main(original_model, lora_model, preprocess, config) diff --git a/docs/tutorials/tutorial-05-finetune-llama8b-custom-domain-data.md b/docs/tutorials/tutorial-05-finetune-llama8b-custom-domain-data.md new file mode 100644 index 0000000..f360c57 --- /dev/null +++ b/docs/tutorials/tutorial-05-finetune-llama8b-custom-domain-data.md @@ -0,0 +1,123 @@ +# Silogen-Engine Fine-Tuning Llama 3.1 8B Instruct with Open Protein Instructions Dataset + +## Introduction + +In this tutorial, we demonstrate LoRA fine-tuning of Llama 3.1 8B Instruct with the Open Protein Instructions dataset using the [Silogen fine-tuning engine](https://github.com/silogen/llm-finetuning) end-to-end, from downloading data to querying the fine-tuned model. Upon receiving a query containing a protein sequence as a input, the fine-tuned model attempts to provide an expert response about the protein sequence, such as its functional description. Depending on your fine-tuning configuration, the end result would be similar to [OPI-Llama](https://huggingface.co/BAAI/OPI-Llama-3.1-8B-Instruct) fine-tuned by the dataset authors. + +[Base model](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) | [Fine-tuning dataset](https://huggingface.co/datasets/BAAI/OPI) + +Please note that the dataset is CC-BY-NC-4.0 licensed and used solely for demonstration purposes here with permission from the authors. 
+ +## Prerequisites +- MinIO cluster storage (or similar) with credentials configured in your Kubernetes namespace secrets +- Huggingface token for data and model download in your Kubernetes namespace secrets + +## Running workloads +The commands below are assumed to be run at the repository root. + +### Data download and preprocessing +We will use the `workloads/download-data-to-bucket/helm/overrides/tutorial-05-opi-data.yaml` override to download the dataset from Huggingface, convert it to the format expected by the Silogen fine-tuning engine, and persist the processed dataset to `bucketDataDir` configured in our override. Our `dataScript` will also create a sample of 1k rows for quick demonstration of the fine-tuning workflow. + +``` +helm template workloads/download-data-to-bucket/helm \ + -f workloads/download-data-to-bucket/helm/overrides/tutorial-05-opi-data.yaml \ + --name-template "download-opi-data" \ + | kubectl apply -f - +``` + +### Base model download +We can download the base model to MinIO without customizing the existing override for our base model. Downloading this model requires a Huggingface token (assumed to be available in the namespace), which we specify in another override. +``` +helm template workloads/download-huggingface-model-to-bucket/helm \ + -f workloads/download-huggingface-model-to-bucket/helm/overrides/models/meta-llama_llama-3.1-8b-instruct.yaml \ + -f workloads/llm-finetune-silogen-engine/helm/overrides/utilities/hf-token.yaml \ + --name-template "download-llama-31-8-instruct" \ + | kubectl apply -f - +``` + +### Fine-tuning +To start fine-tuning with the Silogen engine, we can use existing overrides for reasonable default fine-tuning parameters for the base model, and to enable Tensorboard monitoring. We can customize any parameters, such as the number of fine-tuning GPUs, with `workloads/llm-finetune-silogen-engine/helm/overrides/tutorial-05-llama-lora-opi-data.yaml`. + +``` +workloads_path="workloads/llm-finetune-silogen-engine/helm" +helm template $workloads_path \ + -f $workloads_path/overrides/models/meta-llama_llama-3.1-8b-instruct.yaml \ + -f $workloads_path/overrides/utilities/tensorboard.yaml \ + -f $workloads_path/overrides/tutorial-05-llama-lora-opi-data.yaml \ + --name-template llm-finetune-llama-opi \ + | kubectl apply -f - +``` + +To monitor fine-tuning progress with Tensorboard, we can forward the associated port to access with a local browser, e.g., `kubectl port-forward pods/ 6006:6006`. Model checkpoints and logs will persist in the `checkpointsRemote` specified in our custom override file. 
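+
+For a concrete example (the pod name below is hypothetical; list the pods in your namespace with `kubectl get pods` to find the Tensorboard pod):
+```
+kubectl port-forward pods/llm-finetune-llama-opi-tensorboard-0 6006:6006
+```
+Then open http://localhost:6006 in a local browser.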
+ +### Inference + +#### Deploying each model +To deploy the base model using vLLM, we can use the existing override +``` +name="llama-31-8-instruct" +helm template $name workloads/llm-inference-vllm/helm \ +-f workloads/llm-inference-vllm/helm/overrides/models/meta-llama_llama-3.1-8b-instruct.yaml \ +--set "vllm_engine_args.served_model_name=$name" \ +| kubectl apply -f - +``` + +To deploy our fine-tuned model, we set the model path to our final experiment checkpoint +``` +name="llama-31-8B-lora-opi-1k" +helm template workloads/llm-inference-vllm/helm \ + -f workloads/llm-inference-vllm/helm/overrides/models/meta-llama_llama-3.1-8b-instruct.yaml \ + --set "model=s3://default-bucket/experiments/finetuning/$name/checkpoint-final" \ + --set "vllm_engine_args.served_model_name=$name" \ + --name-template "opi-llama" \ + | kubectl apply -f - +``` +#### Querying the deployed models + +Forward a port for each deployment +``` +base_model="llama-31-8-instruct" +ft_model="llama-31-8B-lora-opi-1k" +port_1=8011 +port_2=8012 + +kubectl port-forward svc/llm-inference-vllm-$base_model $port_1:80 > /dev/null & portforwardPID=$! + +kubectl port-forward svc/llm-inference-vllm-$ft_model $port_2:80 > /dev/null & portforwardPID=$! +``` + +Query each model to compare their outputs +``` +question="Can you provide the functional description of the following protein sequence? Sequence: MRWQEMGYIFYPRKLR" + +# Base model +curl http://localhost:$port_1/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'$base_model'", + "messages": [ + {"role": "user", "content": "'"$question"'"} + ] + }' | jq ".choices[0].message.content" --raw-output + +# [Example response] Unfortunately, I can't identify the exact function of the given protein sequence. However, ... + +# Fine-tuned model +curl http://localhost:$port_2/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "'$ft_model'", + "messages": [ + {"role": "user", "content": "'"$question"'"} + ] + }' | jq ".choices[0].message.content" --raw-output + +# [Example response] This protein is a ribonucleoprotein involved in the processing of rRNA and the assembly of ribosomes. +``` + +### Cleaning up +We can delete our model deployments for example with kubectl +``` +kubectl delete deployments/llm-inference-vllm- +kubectl delete svc/llm-inference-vllm- +``` diff --git a/docs/tutorials/tutorial-06-package-and-serve-wan2.1-with-torchserve.md b/docs/tutorials/tutorial-06-package-and-serve-wan2.1-with-torchserve.md new file mode 100644 index 0000000..b3a6c74 --- /dev/null +++ b/docs/tutorials/tutorial-06-package-and-serve-wan2.1-with-torchserve.md @@ -0,0 +1,92 @@ +# Tutorial 06: Package and Serve Wan2.1 with TorchServe + +This tutorial shows how to prepare the Wan2.1 model for TorchServe, upload it to a cluster-internal MinIO storage, and then deploy a TorchServe workload that serves the model behind an API endpoint. The process consists of two steps: **packaging the model** and **serving it**. + +## 1. Setup + +Follow the setup in the [tutorial pre-requisites section](./tutorial-00-prerequisites.md). + +--- + +## 2. Package model into MinIO + +Before TorchServe can serve a model, it needs a model `.zip` archive. We use the workload `torchserve-model-packager` to compress the Wan2.1 model and upload it to MinIO storage. 
Its user input file is: + +```bash +workloads/torchserve-model-packager/helm/overrides/tutorial-06-package.yaml +``` + +Run: + +```bash +helm template workloads/torchserve-model-packager/helm \ + --values workloads/torchserve-model-packager/helm/overrides/tutorial-06-package.yaml \ + --name-template wan21-packager \ + | kubectl create -f - +``` + +This job downloads Wan2.1 weights, prepares TorchServe assets, and writes the `.zip` file into MinIO. + +You can follow logs and progress as described in [the monitoring section](./tutorial-00-prerequisites.md#monitoring-progress-logs-and-gpu-utilization-with-k9s). + +--- + +## 3. Deploy TorchServe with Wan2.1 + +Once the model archive is available in MinIO, we can deploy TorchServe itself. + +The workload for this is `media-torchserve-wan21`. Its user input file is: + +```bash +workloads/media-torchserve-wan21/helm/overrides/tutorial-06-serve.yaml +``` + +Run: + +```bash +helm template workloads/media-torchserve-wan21/helm \ + --values workloads/torchserve-wan21/helm/overrides/tutorial-06-serve.yaml \ + --name-template wan21-serve \ + | kubectl apply -f - +``` + +This creates a GPU-enabled TorchServe deployment, installs dependencies, mounts configuration files, downloads the `.zip` from MinIO, creates `.mar` archive, and starts serving. + +--- + +## 4. Access the API + +Forward TorchServe’s REST API to your local machine: + +```bash +kubectl port-forward deployment/wan21-serve-media-torchserve-wan21 8080:8080 +``` + +Now you can send a test request to generate video: + +```bash +curl -X POST http://localhost:8080/predictions/wan21 \ + -H "Content-Type: application/json" \ + -d '{ + "prompt": "a scene of an astronaut riding a horse on mars", + "width": 480, + "height": 832, + "num_frames": 81, + "num_inference_steps": 40 + }' \ + --output "output-$(date +%Y%m%d%H%M%S).mp4" +``` + +The output video will be written to the current directory. + +When finished, you can stop the port-forwarding process and clean up the deployment: + +```bash +kubectl delete deployment wan21-serve-media-torchserve-wan21 +``` + +--- + +## Next Steps + +This tutorial showed how to package Wan2.1 and serve it through TorchServe. The natural next step is to deploy additional handlers for different models. diff --git a/docs/tutorials/tutorial-07-wan-video-finetuning.md b/docs/tutorials/tutorial-07-wan-video-finetuning.md new file mode 100644 index 0000000..d86fd2b --- /dev/null +++ b/docs/tutorials/tutorial-07-wan-video-finetuning.md @@ -0,0 +1,268 @@ +# Tutorial 07: Fine-tuning Wan 2.2 for Video Generation + +This tutorial shows how to fine-tune Wan 2.2 models for custom video generation using the DiffSynth framework. Wan 2.2 is a state-of-the-art text-to-video generation model available in 5B and 14B parameter versions. We'll demonstrate both LoRA (Low-Rank Adaptation) and full parameter fine-tuning approaches, using the Disney VideoGeneration Dataset as an example. + +The tutorial covers the complete pipeline: downloading models and datasets to cluster MinIO storage, setting up the training environment, running fine-tuning jobs with multi-GPU support, and uploading trained checkpoints for future use. + +We'll start with the 5B parameter model using LoRA fine-tuning, which provides a good balance between training efficiency and model quality. The approach can be scaled up to the 14B model or full parameter training for more demanding use cases. + +## 1. Setup + +Follow the setup in the [tutorial pre-requisites section](./tutorial-prereqs.md). + +## 2. 
Download model and dataset + +We'll use the workloads to download the Wan 2.2 model and the Disney VideoGeneration Dataset to cluster MinIO storage. + +### Download Wan 2.2 5B Model + +First, download the Wan 2.2 5B parameter text-to-video model: + +```bash +helm template workloads/download-huggingface-model-to-bucket/helm \ + -f workloads/download-huggingface-model-to-bucket/helm/overrides/tutorial-07-wan2-2-ti2v-5b.yaml \ + --name-template download-wan2-2-ti2v-5b \ + | kubectl apply -f - +``` + +### Download Disney VideoGeneration Dataset + +Next, download and preprocess the Disney VideoGeneration Dataset (Steamboat Willy): + +```bash +helm template workloads/download-data-to-bucket/helm \ + -f workloads/download-data-to-bucket/helm/overrides/tutorial-07-disney-dataset.yaml \ + --name-template download-disney-dataset \ + | kubectl apply -f - +``` + +Monitor the downloads using [k9s or kubectl logs](./tutorial-prereqs.md#monitoring-progress-logs-and-gpu-utilization-with-k9s). The model download includes: +- Main diffusion model (3 safetensors files, ~18.7 GiB) +- VAE model (Wan2.2_VAE.pth, ~2.6 GiB) +- Text encoder (T5-XXL, ~11 GiB) +- Configuration and tokenizer files + +## 3. Interactive exploration (optional) + +For testing and experimentation, you can launch an interactive version that provides a ready environment without automatic training: + +```bash +helm template workloads/media-finetune-wan/helm \ + --name-template wan-finetune-interactive \ + | kubectl apply -f - +``` + +Connect to the interactive pod: +```bash +kubectl exec -it wan-finetune-interactive -- /bin/bash +``` + +This gives you access to explore the DiffSynth framework, examine the downloaded models and data, and test training configurations manually. + +## 4. LoRA fine-tuning on multiple GPUs + +LoRA (Low-Rank Adaptation) provides efficient fine-tuning by updating only a small number of parameters. This approach is recommended for most use cases as it requires less compute resources while maintaining good performance. + +### Launch 5B LoRA Fine-tuning + +Run LoRA fine-tuning on the 5B model using our optimized configuration: + +```bash +helm template workloads/media-finetune-wan/helm \ + -f workloads/media-finetune-wan/helm/overrides/tutorial-07-5b-lora.yaml \ + --name-template wan-finetune-5b-lora \ + | kubectl apply -f - +``` + +This configuration uses: +- **4 GPUs** with **32 CPU cores** for reasonable compilation performance +- **LoRA rank 32** for good parameter efficiency +- **DeepSpeed ZeRO Stage 2** for distributed training +- **BF16 precision** for memory efficiency +- **5 epochs** with gradient accumulation + +### Monitor training progress + +Follow the training progress using: + +```bash +# Watch job status +kubectl get jobs -w + +# View training logs (replace with your actual job name) +kubectl logs job/wan-finetune-5b-lora -f + +# Check GPU utilization with k9s +k9s +``` + +Training phases you'll observe: +1. **Installation**: DiffSynth framework setup +2. **Resource Download**: Model and dataset download from MinIO +3. **Compilation**: PyTorch model compilation for AMD GPUs +4. **Training**: 5 epochs with progress bars +5. **Upload**: Checkpoint upload to MinIO + +Total training time: ~90-120 minutes depending on cluster load. + +## 5. Scale to larger configurations + +For more advanced use cases, different configurations can be explored. 
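+The pattern stays the same: point `-f` at a different override file and, if needed, add `--set` flags. As a sketch, assuming the workload ships a 14B LoRA override (the filename below is illustrative; check `workloads/media-finetune-wan/helm/overrides/` for the actual files):
+
+```bash
+helm template workloads/media-finetune-wan/helm \
+  -f workloads/media-finetune-wan/helm/overrides/tutorial-07-14b-lora.yaml \
+  --name-template wan-finetune-14b-lora \
+  | kubectl apply -f -
+```
+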
+ +### 14B Model Fine-tuning + +For higher quality results, use the 14B parameter model by using the relevant configuration from among the override files inside the workload. + +### Full Parameter Fine-tuning + +For maximum customization, use the relevant override file for `architecture: "full"` instead of LoRA. This requires: +- More GPU resources (4+ GPUs recommended) +- Increased memory allocation +- Longer training time but potentially better results + +## 6. Working with checkpoints + +Trained checkpoints are automatically uploaded to MinIO with organized paths: + +``` +default-bucket/models/Wan-AI/Wan2.2-TI2V-5B/Wan2.2-TI2V-5B_lora/20250925-141325/ +β”œβ”€β”€ epoch-0.safetensors +β”œβ”€β”€ epoch-1.safetensors +β”œβ”€β”€ epoch-2.safetensors +β”œβ”€β”€ epoch-3.safetensors +β”œβ”€β”€ epoch-4.safetensors +β”œβ”€β”€ adapter_config.json +β”œβ”€β”€ adapter_model.safetensors +└── training_args.bin +``` + +### Using trained models + +The checkpoints can be used for: + +1. **Further fine-tuning**: Resume training from any epoch +2. **Inference deployment**: Load for video generation +3. **Model evaluation**: Compare different configurations +4. **Checkpoint merging**: Combine LoRA weights with base model + +### Download checkpoints locally + +```bash +# Setup MinIO client (if not already configured) +mc alias set minio-cluster http://minio.cluster.local + +# Download specific checkpoint +mc cp --recursive \ + minio-cluster/default-bucket/models/Wan-AI/Wan2.2-TI2V-5B/Wan2.2-TI2V-5B_lora/20250925-141325/ \ + ./local-checkpoints/ +``` + +## 7. Hyperparameter tuning + +Experiment with different LoRA ranks to find optimal configurations: + +```bash +run_id=wan-lora-sweep +for rank in 8 16 32 64 128; do + name="wan-5b-lora-r$rank-$run_id" + helm template workloads/media-finetune-wan/helm \ + -f workloads/media-finetune-wan/helm/overrides/tutorial-07-5b-lora.yaml \ + --name-template $name \ + --set finetune_config.lora_rank=$rank \ + | kubectl apply -f - +done +``` + +This creates parallel jobs testing different LoRA ranks, allowing you to compare training efficiency and model quality. + +## 8. Advanced configurations + +### Custom datasets + +To use your own video dataset: + +1. **Prepare data**: Organize videos and captions in the required format +2. **Upload to MinIO**: Use the data upload workload +3. **Update configuration**: Modify `datasetId` and paths in your override file +4. **Adjust parameters**: Tune learning rate, batch size, and epochs based on your data size + +### Multi-node training + +For very large models or datasets, distribute across multiple nodes: + +```yaml +resources: + cpu: 64 + gpus: 8 # Use all 8 GPUs per node + memory: 512Gi +``` + +### Memory optimization + +For memory-constrained scenarios: + +```yaml +finetune_config: + # Use gradient checkpointing + gradient_checkpointing: true + # Reduce batch size + train_batch_size: 8 + # Increase gradient accumulation + gradient_accumulation_steps: 8 +``` + +## 9. 
Troubleshooting + +### Common issues and solutions + +**Compilation taking too long**: +- Increase CPU allocation to 32+ cores +- Use at least 3-4 GPUs for better parallelization + +**Out of memory errors**: +- Reduce batch size: `train_batch_size: 8` +- Enable gradient checkpointing +- Use smaller LoRA rank: `lora_rank: 16` + +**Slow model download**: +- Check MinIO cluster connectivity +- Verify bucket credentials and permissions + +**Training divergence**: +- Lower learning rate: `learning_rate: 1e-5` +- Increase warmup steps +- Use different noise schedules + +### Monitoring resources + +```bash +# Check GPU usage across cluster +kubectl top nodes + +# View detailed pod resource usage +kubectl describe pod wan-finetune-5b-lora-xyz + +# Monitor job events +kubectl describe job wan-finetune-5b-lora +``` + +## 10. Next steps + +After successful fine-tuning: + +1. **Deploy for inference**: Use trained checkpoints in inference workloads +2. **Quality evaluation**: Generate test videos and evaluate results +3. **Dataset expansion**: Add more diverse training data +4. **Architecture experiments**: Try different model variants and training strategies + +The fine-tuned Wan 2.2 models can be integrated into video generation pipelines, content creation tools, or further research projects focusing on controllable video synthesis. + +## Configuration files reference + +The tutorial uses these override configurations: + +- `workloads/download-huggingface-model-to-bucket/helm/overrides/tutorial-07-wan2-2-ti2v-5b.yaml`: Model download configuration +- `workloads/download-data-to-bucket/helm/overrides/tutorial-07-disney-dataset.yaml`: Dataset download configuration +- `workloads/media-finetune-wan/helm/overrides/tutorial-07-5b-lora.yaml`: Optimized 5B LoRA fine-tuning configuration + +Each configuration includes detailed comments explaining the parameter choices and trade-offs for different use cases. 
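+
+To inspect what any of these configurations renders to before submitting it, you can template the chart locally without touching the cluster, for example:
+
+```bash
+# Render the manifests to a file and review them before applying
+helm template workloads/media-finetune-wan/helm \
+  -f workloads/media-finetune-wan/helm/overrides/tutorial-07-5b-lora.yaml \
+  --name-template wan-finetune-5b-lora > rendered.yaml
+less rendered.yaml
+```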
diff --git a/mkdocs.yml b/mkdocs.yml index aa820b6..0b94ca2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -93,7 +93,8 @@ nav: - llm-finetune-llama-factory: workloads/llm-finetune-llama-factory/helm/README.md - llm-finetune-silogen-engine: - Overview: workloads/llm-finetune-silogen-engine/helm/README.md - - Finetuning Config: workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md + - SFT Config: workloads/llm-finetune-silogen-engine/helm/config_doc_sft.md + - DPO Config: workloads/llm-finetune-silogen-engine/helm/config_doc_dpo.md - llm-finetune-verl: workloads/llm-finetune-verl/helm/README.md - llm-inference-llamacpp-mi300x: workloads/llm-inference-llamacpp-mi300x/helm/README.md - llm-inference-megatron-lm: workloads/llm-inference-megatron-lm/helm/README.md diff --git a/workloads/benchmark-lifescience-reinvent4/helm/Chart.yaml b/workloads/benchmark-lifescience-reinvent4/helm/Chart.yaml new file mode 100644 index 0000000..07074d8 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: benchmark-lifescience-reinvent4 +description: A Helm chart for Reinvent4 inference +version: 0.0.1 diff --git a/workloads/benchmark-lifescience-reinvent4/helm/README.md b/workloads/benchmark-lifescience-reinvent4/helm/README.md new file mode 100644 index 0000000..1a44170 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/README.md @@ -0,0 +1,75 @@ +# Life Science - Reinvent4 + +This Helm Chart deploys a workload as a Kubernetes job for REINVENT4 run mode Transfer Learning (TL) + +## Prerequisites + +Ensure the following prerequisites are met before deploying any workloads: + +1. **Helm**: Install `helm`. Refer to the [Helm documentation](https://helm.sh/) for instructions. + +## Deploying the Workload + +It is recommended to use `helm template` and pipe the result to `kubectl create` , rather than using `helm install`. Generally, a command looks as follows + +```bash +helm template [optional-release-name] -f --set = | kubectl apply -f - +``` + +The chart provides three main ways to deploy models, detailed below. + +## User Input Values + +Refer to the `values.yaml` file for the user input values you can provide, along with instructions. + +### Verify Job + +Check the job status: + +```bash +kubectl get jobs +``` + +# Running Reinvent inference interactively + +Connect to the pod with your favorite terminal. + +The job runs the script `docker/lifescience/reinvent4/Reinvent_TLRL_clean.py` automatically. The logs can be followed by running `kubectl logs -f` + +Alternatively, you can uncomment the bottom part of the `values.yaml` file to run `Reinvent_demo_clean.py` as well. Or you can interactively connect to the job and run either of the notebooks manually. Just make sure you don't run two scripts at the same time by accident. 
+ +```sh +# Connect to the pod +kubectl exec -it <pod-name> -- /bin/bash + +python3 notebooks/<notebook-name>_clean.py +``` + +Alternatively, Reinvent jobs can be run with: +```sh +reinvent -l <logfile> <config>.toml +``` + +## Expected outputs from the demo runs + +- `Reinvent_demo_clean.py` + +| Agent | Prior | Target | Score | SMILES | SMILES_state | QED | QED (raw) | Stereo | Stereo (raw) | Alerts | Alerts (raw) | step | +|------------|-----------|------------|--------------|-----------------------------------------------------------------------------------------------------|--------------|-------------|-----------|--------|---------------|--------|---------------|------| +| 38.0957 | 38.0957 | -38.0276 | 0.000531 | Cc1ccc(C2CC(=O)Nc3cccc(NC45CC6CC(CC4C6)C5)c32)cc1 | 1 | 0.751039 | 0.7510 | 0.0 | 3.0 | 1.0 | 1.0 | 1 | +| 29.6628 | 29.6628 | 89.1671 | 0.928359 | Cc1cc(-c2nnn(CC3CCCCC3)n2)nc(C(=O)NC2CCC2)n1 | 1 | 0.883472 | 0.8835 | 1.0 | 0.0 | 1.0 | 1.0 | 1 | +| 33.0351 | 33.0351 | -33.0351 | 0.000000 | COc1ccc(CCCCON=C2CCC3CC2CN3C(=O)OCc2ccccc2)cc1 | 1 | 0.000000 | 0.0000 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | +| 22.6197 | 22.6197 | -22.6197 | 0.000000 | CCOC(=O)C1C(=O)N=C(N)NC1c1ccc2c(c1)OCO2 | 1 | 0.000000 | 0.0000 | 0.0 | 0.0 | 0.0 | 0.0 | 1 | +| 28.3920 | 28.3920 | -28.3144 | 0.000606 | CC(NC(=O)c1nccs1)c1ccc2c(c1)COC2 | 1 | 0.935827 | 0.9358 | 0.0 | 1.0 | 1.0 | 1.0 | 1 | + +Total number of SMILES generated: 30000\ +Total number of invalid SMILES: 283\ +Total number of batch duplicate SMILES: 8\ +Total number of duplicate SMILES: 1317 + +- `Reinvent_TLRL_clean.py` + +(This is the number of "good binders" produced, defined by `QED < 0.8` and `ChemProp (raw) < -25.0`, before and after removing duplicates.) + +4\ +4 diff --git a/workloads/benchmark-lifescience-reinvent4/helm/mount/README.md b/workloads/benchmark-lifescience-reinvent4/helm/mount/README.md new file mode 100644 index 0000000..75734b3 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/mount/README.md @@ -0,0 +1,3 @@ +Files in this directory are mounted to the workload at `/workload/mount`. + +**Note:** Subdirectories and binary files are not supported. diff --git a/workloads/benchmark-lifescience-reinvent4/helm/overrides/kaiwo/kaiwo-enable.yaml b/workloads/benchmark-lifescience-reinvent4/helm/overrides/kaiwo/kaiwo-enable.yaml new file mode 100644 index 0000000..e6d278a --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/overrides/kaiwo/kaiwo-enable.yaml @@ -0,0 +1,3 @@ +# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload) +kaiwo: + enabled: true diff --git a/workloads/benchmark-lifescience-reinvent4/helm/templates/_helpers.tpl b/workloads/benchmark-lifescience-reinvent4/helm/templates/_helpers.tpl new file mode 100644 index 0000000..0861d3a --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/templates/_helpers.tpl @@ -0,0 +1,101 @@ +# Release name helper +{{- define "release.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +# Release fullname helper +{{- define "release.fullname" -}} +{{- $currentTime := now | date "20060102-1504" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- if ne .Release.Name "release-name" -}} +{{- include "release.name" . }}-{{ .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- include "release.name" . 
}}-{{ $currentTime | lower | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +# Init container resources helper +{{- define "init_container.resources" -}} +requests: + memory: "{{ max (mul .Values.gpus .Values.memory_per_gpu) 4 }}Gi" + cpu: "{{ max (mul .Values.gpus .Values.cpu_per_gpu) 1 }}" +limits: + memory: "{{ max (mul .Values.gpus .Values.memory_per_gpu) 4 }}Gi" + cpu: "{{ max (mul .Values.gpus .Values.cpu_per_gpu) 1 }}" +{{- end -}} + +# Container resources helper +{{- define "container.resources" -}} +requests: + {{- if .Values.gpus }} + amd.com/gpu: "{{ .Values.gpus }}" + {{- end }} + memory: "{{ max (mul .Values.gpus .Values.memory_per_gpu) 4 }}Gi" + cpu: "{{ max (mul .Values.gpus .Values.cpu_per_gpu) 1 }}" +limits: + {{- if .Values.gpus }} + amd.com/gpu: "{{ .Values.gpus }}" + {{- end }} + memory: "{{ max (mul .Values.gpus .Values.memory_per_gpu) 4 }}Gi" + cpu: "{{ max (mul .Values.gpus .Values.cpu_per_gpu) 1 }}" +{{- end -}} + +# Container environment variables helper +{{- define "container.env" -}} +{{- range $key, $value := .Values.env_vars }} +{{- if (typeIs "string" $value) }} +- name: {{ $key }} + value: {{ $value | quote }} +{{- else }} +- name: {{ $key }} + valueFrom: + secretKeyRef: + name: {{ $value.name }} + key: {{ $value.key }} +{{- end }} +{{- end }} +{{- end -}} + +# Container volume mounts helper +{{- define "container.volumeMounts" -}} +- mountPath: /workload + name: ephemeral-storage +- mountPath: /workload/mount + name: workload-mount +- mountPath: /dev/shm + name: dshm +{{- end -}} + +# Container volumes helper +{{- define "container.volumes" -}} +{{- if .Values.storage.ephemeral.storageClassName -}} +- ephemeral: + volumeClaimTemplate: + spec: + {{- if .Values.storage.ephemeral.accessModes }} + accessModes: {{ .Values.storage.ephemeral.accessModes }} + {{- else }} + accessModes: + - ReadWriteOnce + {{- end }} + resources: + requests: + storage: {{ .Values.storage.ephemeral.quantity }} + storageClassName: {{ .Values.storage.ephemeral.storageClassName }} + name: ephemeral-storage +{{- else }} +- emptyDir: {sizeLimit: {{ .Values.storage.ephemeral.quantity }}} + name: ephemeral-storage +{{- end }} +- emptyDir: + medium: Memory + sizeLimit: {{ .Values.storage.dshm.sizeLimit }} + name: dshm +- configMap: + name: {{ include "release.fullname" . }} + name: workload-mount +{{- end -}} diff --git a/workloads/benchmark-lifescience-reinvent4/helm/templates/configmap.yaml b/workloads/benchmark-lifescience-reinvent4/helm/templates/configmap.yaml new file mode 100644 index 0000000..db5a6c7 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/templates/configmap.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "release.fullname" . }} +data: +{{- $files := .Files }} +{{- range $path, $_ := .Files.Glob "mount/*" }} + {{ $key := $path | trimPrefix "mount/" }} + {{- $key }}: | +{{ $files.Get $path | indent 4 }} +{{- end }} diff --git a/workloads/benchmark-lifescience-reinvent4/helm/templates/job.yaml b/workloads/benchmark-lifescience-reinvent4/helm/templates/job.yaml new file mode 100644 index 0000000..5f3d079 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/templates/job.yaml @@ -0,0 +1,60 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "release.fullname" . }} + labels: + app: {{ include "release.fullname" . 
}} + {{- range $key, $value := .Values.metadata.labels }} + {{ $key }}: {{ $value | quote }} + {{- end }} +spec: + ttlSecondsAfterFinished: 3600 + activeDeadlineSeconds: 7200 # 2 hours timeout + backoffLimit: 1 + template: + metadata: + labels: + app: {{ include "release.fullname" . }} + spec: + {{- if .Values.nodeSelector }} + nodeSelector: + {{- .Values.nodeSelector | toYaml | nindent 8 }} + {{- end }} + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{- range .Values.imagePullSecrets }} + - name: {{ . }} + {{- end }} + {{- end }} + {{- if .Values.init_args }} + initContainers: + - name: {{ .Chart.Name }}-init + image: {{ .Values.image | quote}} + imagePullPolicy: {{ default "Always" .Values.imagePullPolicy | quote }} + command: ["sh", "-c"] + args: + - | + {{- .Values.init_args | nindent 12 }} + resources: + {{- include "init_container.resources" . | nindent 12 }} + volumeMounts: + {{- include "container.volumeMounts" . | nindent 12 }} + {{- end}} + containers: + - name: {{ .Chart.Name }} + image: {{ .Values.image | quote}} + imagePullPolicy: {{ default "Always" .Values.imagePullPolicy | quote }} + {{- if .Values.entrypoint }} + command: ["sh", "-c"] + args: + - | + {{- .Values.entrypoint | nindent 12 }} + {{- end }} + resources: + {{- include "container.resources" . | nindent 12 }} + volumeMounts: + {{- include "container.volumeMounts" . | nindent 12 }} + + restartPolicy: Never + volumes: + {{- include "container.volumes" . | nindent 8 }} diff --git a/workloads/benchmark-lifescience-reinvent4/helm/values.schema.json b/workloads/benchmark-lifescience-reinvent4/helm/values.schema.json new file mode 100644 index 0000000..de2555c --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/values.schema.json @@ -0,0 +1,133 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "metadata": { + "type": "object", + "description": "Metadata for the deployment", + "properties": { + "labels": { + "type": "object", + "description": "Labels to apply to the deployment", + "additionalProperties": { + "type": "string" + } + } + }, + "required": ["labels"] + }, + "image": { + "type": "string", + "description": "Docker image to use for the deployment" + }, + "imagePullPolicy": { + "type": "string", + "description": "Image pull policy", + "enum": ["Always", "IfNotPresent", "Never"] + }, + "imagePullSecrets": { + "type": "array", + "description": "Image pull secrets for private registries" + }, + "entrypoint": { + "type": "string", + "description": "Entrypoint for the container" + }, + "init_args": { + "type": "string", + "description": "Commands for the initContainer" + }, + "gpus": { + "type": "integer", + "description": "Number of GPUs to allocate", + "minimum": 1 + }, + "memory_per_gpu": { + "type": "integer", + "description": "Memory per GPU in Gi", + "minimum": 1 + }, + "cpu_per_gpu": { + "type": "integer", + "description": "CPU cores per GPU", + "minimum": 1 + }, + "vllm_engine_args": { + "type": "object", + "description": "Arguments for the vllm engine", + "additionalProperties": { + "type": "string" + } + }, + "env_vars": { + "type": "object", + "description": "Environment variables for the deployment" + }, + "storage": { + "type": "object", + "description": "Storage configuration", + "properties": { + "ephemeral": { + "type": "object", + "description": "Ephemeral storage configuration", + "properties": { + "quantity": { + "type": "string", + "description": "Quantity of ephemeral storage" + }, + "storageClassName": { + "type": 
"string", + "description": "Storage class name for ephemeral storage" + }, + "accessModes": { + "type": "array", + "description": "Access modes for ephemeral storage", + "items": { + "type": "string" + } + } + }, + "required": ["quantity", "storageClassName", "accessModes"] + }, + "dshm": { + "type": "object", + "description": "Shared memory configuration", + "properties": { + "sizeLimit": { + "type": "string", + "description": "Size limit for shared memory" + } + }, + "required": ["sizeLimit"] + } + }, + "required": ["ephemeral", "dshm"] + }, + "nodeSelector": { + "type": "object", + "properties": { + "dev": { + "type": "string", + "description": "If true, use the dev node selector" + } + } + }, + "startupProbe": { + "type": ["object"], + "additionalProperties": true, + "description": "Startup probe configuration for the container" + }, + "livenessProbe": { + "type": "object", + "additionalProperties": true, + "description": "Liveness probe configuration for the container" + }, + "readinessProbe": { + "type": ["object"], + "additionalProperties": true, + "description": "Readiness probe configuration for the container" + } + }, + "required": ["metadata", "image", "imagePullPolicy", "gpus", "memory_per_gpu", "cpu_per_gpu", "storage"], + "additionalProperties": false +} diff --git a/workloads/benchmark-lifescience-reinvent4/helm/values.yaml b/workloads/benchmark-lifescience-reinvent4/helm/values.yaml new file mode 100644 index 0000000..8766630 --- /dev/null +++ b/workloads/benchmark-lifescience-reinvent4/helm/values.yaml @@ -0,0 +1,48 @@ +metadata: + labels: {} + +# The build steps for this file can be found under ai-workloads/docker/lifescience/reinvent4/Dockerfile +image: ghcr.io/silogen/reinvent4:rocm7.0_ubuntu24.04_py3.12_pytorch_release_2.5.1 +imagePullPolicy: Always + +gpus: 1 +memory_per_gpu: 64 +cpu_per_gpu: 12 + +storage: + ephemeral: + quantity: 128Gi + # Change the storageClassName to standard if mlstorage is not available. + storageClassName: mlstorage + accessModes: + - ReadWriteOnce + dshm: + sizeLimit: 32Gi + +entrypoint: | + # Move the REINVENT4 directory to the ephemeral storage + mv /REINVENT4 /workload/REINVENT4 + + cd /workload/REINVENT4 + + START_TIME=$(date +%s) + + # Run the REINVENT4 demo notebook + if python3 ./notebooks/Reinvent_demo_clean.py; then + echo "REINVENT4 demo workflow completed successfully" + else + echo "REINVENT4 demo workflow failed with exit code: $?" + fi + + # # Uncomment the lines below if you want to run the TLRL notebook instead + # if python3 ./notebooks/Reinvent_TLRL_clean.py; then + # echo "REINVENT4 TLRL workflow completed successfully" + # else + # echo "REINVENT4 TLRL workflow failed with exit code: $?" 
+ # fi + + END_TIME=$(date +%s) + DURATION=$((END_TIME - START_TIME)) + + echo "Benchmark completed at: $(date)" + echo "Total benchmark duration: ${DURATION} seconds ($(date -u -d @${DURATION} +%H:%M:%S))" diff --git a/workloads/benchmark-lifescience-semlaflow/helm/Chart.yaml b/workloads/benchmark-lifescience-semlaflow/helm/Chart.yaml new file mode 100644 index 0000000..fc336d6 --- /dev/null +++ b/workloads/benchmark-lifescience-semlaflow/helm/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: benchmark-lifescience-semlaflow +description: A Helm chart for SemlaFlow inference +version: 0.0.1 diff --git a/workloads/benchmark-lifescience-semlaflow/helm/README.md b/workloads/benchmark-lifescience-semlaflow/helm/README.md new file mode 100644 index 0000000..0a76468 --- /dev/null +++ b/workloads/benchmark-lifescience-semlaflow/helm/README.md @@ -0,0 +1,90 @@ +# Life Science - SemlaFlow + +This Helm Chart deploys the SemlaFlow inference workload. + +## SemlaFlow model + +The original repository is [here](https://github.com/rssrwn/semla-flow). + +This project introduces Semla, a novel equivariant attention-based message-passing architecture for molecular design and dynamics tasks. SemlaFlow, a molecular generation model, is trained using flow matching with optimal transport to generate realistic 3D molecular structures. + +## Scripts + +There are four scripts in the original SemlaFlow repository: +* `preprocess` - Used for preprocessing larger datasets into the internal representation used by the model for training +* `train` - Trains a MolFlow model on preprocessed data +* `evaluate` - Evaluates a trained model and prints the results +* `predict` - Runs the sampling for a trained model and saves the generated molecules + +## Choosing a GPU to attach a Docker container to + 1. Check which GPU is free with `amd-smi process`. + 2. Check the node id of the free GPU with `rocm-smi` (note: the node id is not the same as the device id; it is shown in the second column of the `rocm-smi` output). + 3. If, say, the GPU with node id 2 is free, the render device to pass to `docker run` is given by `cat /sys/class/kfd/kfd/topology/nodes/2/properties | grep drm_render_minor`. + 4. You can then create a container directly with `docker run --device=/dev/kfd --device=/dev/dri/renderD<N>`, where `<N>` is the `drm_render_minor` value from the previous step. + +## Running inference interactively + +Start a container on a cluster with the above-mentioned image and connect to it. + +Each script can be run as follows (where `