diff --git a/docker/llm-evaluation/requirements.txt b/docker/llm-evaluation/requirements.txt index ad6196a..f7ad356 100644 --- a/docker/llm-evaluation/requirements.txt +++ b/docker/llm-evaluation/requirements.txt @@ -3,5 +3,6 @@ dataclasses-json==0.6.7 evaluate==0.4.3 jsonlines==4.0.0 minio==7.2.15 +mlflow==3.1.0 openai==1.64.0 sentencepiece==0.2.0 diff --git a/docker/llm-evaluation/run_inference_and_judge_evaluation.py b/docker/llm-evaluation/run_inference_and_judge_evaluation.py index e24dcf9..86a62e0 100644 --- a/docker/llm-evaluation/run_inference_and_judge_evaluation.py +++ b/docker/llm-evaluation/run_inference_and_judge_evaluation.py @@ -89,10 +89,8 @@ async def main(args: Namespace): saved_results = [] parameters: dict = {} - llm_url_no_protocol = args.llm_base_url.removeprefix("http://").removeprefix( - "https://" - ) # the Minio python client handles protocol itself - client = get_llm_client(base_url=llm_url_no_protocol, port=args.llm_port, endpoint=args.llm_endpoint) + + client = get_llm_client(base_url=args.llm_base_url, port=args.llm_port, endpoint=args.llm_endpoint) async for inference_result in run_call_inference_container( dataset=ds, @@ -123,10 +121,7 @@ async def main(args: Namespace): logger.info(inferences_data) logger.info("Inference ran.") - judge_url_no_protocol = args.judge_base_url.removeprefix("http://").removeprefix( - "https://" - ) # the Minio python client handles protocol itself - judge_client = get_llm_client(base_url=judge_url_no_protocol, port=args.judge_port, endpoint=args.judge_endpoint) + judge_client = get_llm_client(base_url=args.judge_base_url, port=args.judge_port, endpoint=args.judge_endpoint) aggregated_judge_results = AggregatedJudgeResults( judge_results={}, diff --git a/docker/llm-evaluation/run_inference_and_metrics_evaluation.py b/docker/llm-evaluation/run_inference_and_metrics_evaluation.py index 6973182..57e4f28 100644 --- a/docker/llm-evaluation/run_inference_and_metrics_evaluation.py +++ b/docker/llm-evaluation/run_inference_and_metrics_evaluation.py @@ -14,9 +14,9 @@ from llm_evaluation.call_inference_container.call_inference_container import ( save_inference_results, ) -from llm_evaluation.metrics.run_metrics_evaluation import read_inference_data +from llm_evaluation.metrics.run_metrics_evaluation import get_bert_score_distribution_graphs, read_inference_data from llm_evaluation.metrics.run_metrics_evaluation import run as run_metrics_evaluation -from llm_evaluation.metrics.utils import save_results +from llm_evaluation.metrics.utils import log_metrics_in_mlflow, save_results async def main(args: Namespace): @@ -115,6 +115,20 @@ async def main(args: Namespace): eval_results = run_metrics_evaluation(data) + distribution_graphs = get_bert_score_distribution_graphs( + scores=eval_results.scores, + ) + + if args.mlflow_server_uri: + logger.info("Logging results to MLFlow...") + log_metrics_in_mlflow( + distribution_graphs, + eval_results.scores, + mlflow_server_uri=args.mlflow_server_uri, + mlflow_experiment_name=args.mlflow_experiment_name, + mlflow_run_name=args.mlflow_run_name, + ) + logger.info("Evaluation results:") logger.info(eval_results) diff --git a/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py b/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py index e78c078..6cc0b9d 100644 --- a/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py +++ b/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py @@ -12,7 +12,11 @@ def get_inference_parser() -> ArgumentParser: parser.add_argument("-p", 
"--llm-port", type=str, default="8080", help="Port number of the LLM service.") parser.add_argument("-e", "--llm-endpoint", type=str, default="v1", help="Endpoint of the LLM service.") parser.add_argument( - "-d", "--evaluation-dataset", type=str, default="abisee/cnn_dailymail", help="Name of the evaluation dataset." + "-d", + "--evaluation-dataset-name", + type=str, + default="abisee/cnn_dailymail", + help="Name of the evaluation dataset.", ) parser.add_argument( "-v", "--evaluation-dataset-version", type=str, default="3.0.0", help="Version of the evaluation dataset." @@ -65,6 +69,24 @@ def get_inference_parser() -> ArgumentParser: default="/home/evaluation/example_prompts/example_summary_prompt.txt", help="Path to the prompt template file.", ) + parser.add_argument( + "--mlflow-server-uri", + type=str, + default="", # leave this argument empty to disable MLFlow tracking + help="MLFlow server URI for tracking.", + ) + parser.add_argument( + "--mlflow-experiment-name", + type=str, + default="llm-evaluation-experiment", + help="MLFlow experiment name for tracking.", + ) + parser.add_argument( + "--mlflow-run-name", + type=str, + default="llm-evaluation-run", + help="MLFlow run name for tracking.", + ) return parser diff --git a/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py b/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py index b5d5e19..ebdba31 100644 --- a/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py +++ b/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py @@ -10,10 +10,12 @@ @dataclass_json @dataclass class EvaluationScores: - precision_bert: float - recall_bert: float - f1_bert: float - f1_list: List[float] + precision_avg_bert: float + recall_avg_bert: float + f1_avg_bert: float + precision_list_bert: List[float] + recall_list_bert: List[float] + f1_list_bert: List[float] bleu_score: float accuracy: float diff --git a/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py b/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py index 3b8ba5a..76797e4 100644 --- a/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py +++ b/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py @@ -7,7 +7,7 @@ def compute_bertscore( predictions: List[str], references: List[str], language: str = "en" -) -> Tuple[float, float, float, List[float]]: +) -> Tuple[List[float], List[float], List[float]]: """ Computes the BERTScore for a set of predictions and references. 
@@ -32,13 +32,7 @@ def compute_bertscore(
     recall_list = convert_negatives_to_zero(array=np.array(results["recall"]))
     f1_list = convert_negatives_to_zero(array=np.array(results["f1"]))
 
-    precision_bert = round(np.average(precision_list), 4)
-    recall_bert = round(np.average(recall_list), 4)
-    f1_bert = round(np.average(f1_list), 4)
-
-    f1_list = [round(f1, 4) for f1 in f1_list]
-
-    return precision_bert, recall_bert, f1_bert, f1_list
+    return precision_list, recall_list, f1_list
 
 
 def compute_exact_match(
diff --git a/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py b/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
index 7d0985d..f2070b9 100644
--- a/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
+++ b/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
@@ -6,6 +6,9 @@ from typing import Any, Dict, List
 
 import jsonlines
+import matplotlib.pyplot as plt
+import mlflow
+import numpy as np
 from llm_evaluation import logger
 from llm_evaluation.argument_parsers import get_metrics_parser
 from llm_evaluation.data.data_classes import EvaluationResults, EvaluationScores
@@ -28,7 +31,13 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS
 
     bert_score_start_time = time.time()
 
-    precision_bert, recall_bert, f1_bert, f1_list = compute_bertscore(predictions=predictions, references=references)
+    precision_list_bert, recall_list_bert, f1_list_bert = compute_bertscore(
+        predictions=predictions, references=references
+    )
+
+    precision_avg_bert = round(np.average(precision_list_bert), 4)
+    recall_avg_bert = round(np.average(recall_list_bert), 4)
+    f1_avg_bert = round(np.average(f1_list_bert), 4)
 
     logger.info(f"BERT-score computation took {time.time() - bert_score_start_time:.2f} seconds")
 
@@ -45,15 +54,53 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS
     logger.info(f"Exact match computation took {time.time() - exact_match_start_time:.2f} seconds")
 
     return EvaluationScores(
-        precision_bert=precision_bert,
-        recall_bert=recall_bert,
-        f1_bert=f1_bert,
-        f1_list=f1_list,
+        precision_avg_bert=precision_avg_bert,
+        recall_avg_bert=recall_avg_bert,
+        f1_avg_bert=f1_avg_bert,
+        precision_list_bert=precision_list_bert,
+        recall_list_bert=recall_list_bert,
+        f1_list_bert=f1_list_bert,
         bleu_score=bleu_score,
         accuracy=accuracy,
     )
 
 
+def get_bert_score_distribution_graphs(scores: EvaluationScores) -> Dict[str, str]:
+    """
+    Generate PNG images of the distributions of BERTScore precision, recall, and F1,
+    each with the mean value marked.
+
+    Args:
+        scores (EvaluationScores): Evaluation scores holding the per-example BERTScore
+            precision, recall, and F1 lists.
+
+    Returns:
+        dict: Dictionary with keys 'precision', 'recall', 'f1', each mapping to the
+            path of the saved PNG file.
+    """
+    results = {}
+    metrics = [
+        ("precision", scores.precision_list_bert),
+        ("recall", scores.recall_list_bert),
+        ("f1", scores.f1_list_bert),
+    ]
+    for name, values in metrics:
+        fig, ax = plt.subplots()
+        values = np.array(values)
+        mean_val = np.mean(values)
+        ax.hist(values, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
+        ax.axvline(mean_val, color="red", linestyle="dashed", linewidth=2, label=f"Mean: {mean_val:.4f}")
+        ax.set_title(f"BERTScore {name.capitalize()} Distribution")
+        ax.set_xlabel(name.capitalize())
+        ax.set_ylabel("Frequency")
+        ax.legend()
+        plt.tight_layout()
+        plt.savefig(f"{name}_distribution.png", format="png")
+        plt.close(fig)
+        results[name] = f"{name}_distribution.png"
+    return results
+
+
 def read_inference_data(input_path: str) -> List[Dict[str, Any]]:
     """
     Reads inference data from a file or directory containing JSON/JSONL files.
diff --git a/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py b/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
index 23fa6f2..84c917f 100644
--- a/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
+++ b/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
@@ -4,13 +4,14 @@
 from typing import Any, Dict, List
 
 import jsonlines
+import mlflow
+import numpy as np
 from llm_evaluation import logger
 from llm_evaluation.data.data_classes import AggregatedJudgeResults, EvaluationResults
 from minio import Minio, S3Error
-from numpy import ndarray
 
 
-def convert_negatives_to_zero(array: ndarray) -> ndarray:
+def convert_negatives_to_zero(array: np.ndarray) -> np.ndarray:
     """Converts all negative values in an array to zero.
 
     Args:
@@ -129,3 +130,40 @@ def read_jsonl_data(input_file_path: str) -> List[Dict[str, Any]]:
         for line in reader.iter(type=dict, skip_invalid=True):
             generations.append(line)
     return generations
+
+
+def log_metrics_in_mlflow(distribution_graphs, scores, mlflow_server_uri, mlflow_experiment_name, mlflow_run_name):
+
+    logger.info(f"Using MLflow tracking URI: {mlflow_server_uri}")
+
+    experiment_description = "Evaluation of LLM using BERTScore metric."
+
+    experiment_tags = {
+        "project_name": mlflow_experiment_name,
+        "mlflow.note.content": experiment_description,
+    }
+
+    client = mlflow.MlflowClient(tracking_uri=mlflow_server_uri)
+
+    # Create the Experiment, providing a unique name
+    try:
+        test_experiment = client.create_experiment(name=mlflow_experiment_name, tags=experiment_tags)
+        logger.info(f"Created experiment with ID: {test_experiment}")
+    except mlflow.exceptions.MlflowException:
+        # If the experiment already exists, retrieve its ID
+        logger.warning(f"Experiment '{mlflow_experiment_name}' already exists. Using existing experiment.")
+        test_experiment = client.get_experiment_by_name(mlflow_experiment_name).experiment_id
+        logger.info(f"Using existing experiment with ID: {test_experiment}")
+
+    mlflow.set_tracking_uri(mlflow_server_uri)
+    mlflow.set_experiment(experiment_name=mlflow_experiment_name)
+    with mlflow.start_run(run_name=mlflow_run_name, experiment_id=test_experiment) as run:
+        # Log each average score once, under its own metric name
+        mlflow.log_metric("bert_score_mean_precision", scores.precision_avg_bert)
+        mlflow.log_metric("bert_score_mean_recall", scores.recall_avg_bert)
+        mlflow.log_metric("bert_score_mean_f1", scores.f1_avg_bert)
+        for name, file in distribution_graphs.items():
+            logger.info(
+                f"Saving artifact {file} (abs path: {os.path.abspath(file)}) to MLflow run {run.info.run_id}..."
+ ) + mlflow.log_artifact(os.path.abspath(file), artifact_path="metrics_distributions") diff --git a/docker/logistics/requirements.txt b/docker/logistics/requirements.txt index 7405971..46ecaaa 100644 --- a/docker/logistics/requirements.txt +++ b/docker/logistics/requirements.txt @@ -4,3 +4,4 @@ google-cloud-storage hf_transfer huggingface_hub[cli] minio +wandb diff --git a/docs/contributing.md b/docs/contributing.md index 870eaff..94e1f5e 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -18,6 +18,56 @@ Thank you for considering contributing to the SiloGen AI Workloads development! # install packages you need ``` +### Pre-commit setup + +We use [pre-commit](https://pre-commit.com/) for consistent formatting and cleaner code. Hooks are specified in `ai-workloads-dev/.pre-commit-config.yaml`. + +To install:
+```bash
+cd ai-workloads-dev  # pre-commit install must be run inside the target git repository
+source your_venv/bin/activate
+pip install pre-commit
+pre-commit install --config .pre-commit-config.yaml
+git commit -m "test commit"
+```
+ +With the final command, pre-commit should run automatically, with output something like the following: + + >check json...........................................(no files to check)Skipped
+ check yaml...........................................(no files to check)Skipped
+ fix end of files.....................................(no files to check)Skipped
+ fix requirements.txt.................................(no files to check)Skipped
+ trim trailing whitespace.............................(no files to check)Skipped
+ black................................................(no files to check)Skipped
+ flake8...............................................(no files to check)Skipped
+ isort (python).......................................(no files to check)Skipped
+ mypy.................................................(no files to check)Skipped
+ helmlint.............................................(no files to check)Skipped
+ +It's also possible to manually run pre-commit using + +`pre-commit run --all-files` + +#### Troubleshooting pre-commit + +Many pre-commit bugs come from having an incorrect version of pre-commit active. Pre-commit can hang around as a system-wide version, in python venvs, or in your pre-commit cache. + + It's easiest to use pre-commit as part of a python virtual environment. To check that the right pre-commit is being found, run `which pre-commit` and confirm that the binaries inside your venv are shown. For example: `/../../venvs/your_venv/bin/pre-commit`. A different path could indicate that your system is choosing the wrong pre-commit install. + + +From system: +`brew uninstall pre-commit` (mac) +`sudo apt remove pre-commit` (linux) + +From venv: +`pip uninstall pre-commit` + +Just the pre-commit hooks uninstall: +`pre-commit uninstall` +`pre-commit clean` + + +Then reinstall pre-commit from scratch as described above. + + ## Development Workflow 1. Create a branch for your feature or bugfix: diff --git a/workloads/dev-workspace-jupyterlab/helm/values.yaml b/workloads/dev-workspace-jupyterlab/helm/values.yaml index 2ac6519..9c238b6 100644 --- a/workloads/dev-workspace-jupyterlab/helm/values.yaml +++ b/workloads/dev-workspace-jupyterlab/helm/values.yaml @@ -47,8 +47,17 @@ entrypoint: | pip install pipx ipykernel pipx install --include-deps jupyter pipx inject --include-deps jupyter jupyterlab-lsp 'python-lsp-server[all]' ipywidgets jupyterlab-git jupyterlab_code_formatter - python -m ipykernel install --user --name=default-python3 - jupyter-lab --ServerApp.token='' --ServerApp.ip='0.0.0.0' --ServerApp.allow_root=True --ServerApp.base_url=$BASE_URL --no-browser --ServerApp.root_dir='/workload' + python -m ipykernel install --user --name=default-python3 --display-name="Python 3 (default)" + + jupyter-lab --no-browser \ + --IdentityProvider.token='' \ + --ServerApp.ip='0.0.0.0' \ + --ServerApp.allow_root=True \ + --ServerApp.base_url=$BASE_URL \ + --ServerApp.root_dir='/workload' \ + --MultiKernelManager.default_kernel_name=default-python3 \ + --KernelSpecManager.allowed_kernelspecs=default-python3 \ + --KernelSpecManager.ensure_native_kernel=False # kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload) kaiwo: diff --git a/workloads/download-data-to-bucket/helm/templates/job.yaml b/workloads/download-data-to-bucket/helm/templates/job.yaml index f232a17..2dd6cc7 100644 --- a/workloads/download-data-to-bucket/helm/templates/job.yaml +++ b/workloads/download-data-to-bucket/helm/templates/job.yaml @@ -33,8 +33,8 @@ spec: mkdir -p /downloads/datasets python /scripts/data_script.py ######################## - echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" }}' - mc cp -recursive /downloads/datasets/ minio-host/{{ .Values.bucketDataDir | trimSuffix "/" }}/ + echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}' + mc cp -recursive /downloads/datasets/ minio-host/'{{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'/ ######################## echo 'Done' env: diff --git a/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml b/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml index 7514746..b91d2b9 100644 --- a/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml +++ b/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml @@ -47,13 +47,14 @@ spec: {{- end }} --local-dir 
local_models/downloaded_model
           ###################################
-          echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" }}'
+          echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" | replace "'" "'\\''" }}'
+          {{- $remotePath := printf "minio-host/'%s'/" (.Values.bucketPath | trimSuffix "/" | replace "'" "'\\''") }}
           mc mirror --exclude '.cache/huggingface/*' \
                     --exclude '.gitattributes' \
           {{- if .Values.allowOverwrite }}
                     --overwrite \
           {{- end }}
-                    local_models/downloaded_model/ minio-host/{{ .Values.bucketPath | trimSuffix "/" }}
+                    local_models/downloaded_model/ {{ $remotePath }}
         env:
         {{- if .Values.hfTokenSecret }}
         - name: HF_TOKEN
diff --git a/workloads/download-wandb-model-to-bucket/helm/Chart.yaml b/workloads/download-wandb-model-to-bucket/helm/Chart.yaml
new file mode 100644
index 0000000..557e352
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/Chart.yaml
@@ -0,0 +1,4 @@
+apiVersion: v2
+name: download-wandb-model-to-bucket
+description: A Helm chart for downloading a Weights and Biases model to a bucket
+version: 0.0.1
diff --git a/workloads/download-wandb-model-to-bucket/helm/README.md b/workloads/download-wandb-model-to-bucket/helm/README.md
new file mode 100644
index 0000000..1d6519c
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/README.md
@@ -0,0 +1,14 @@
+# Download a model from Weights and Biases to bucket storage
+
+This is a workload that downloads a model from Weights and Biases and uploads it to bucket storage.
+
+Run example:
+```bash
+helm template "dl-from-wandb" workloads/download-wandb-model-to-bucket/helm \
+  -f workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml \
+  | kubectl create -f -
+```
+
+## User input values
+
+See the `values.yaml` file for the user input values that you can provide, with instructions.
diff --git a/workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml b/workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml
new file mode 100644
index 0000000..337ad4d
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml
@@ -0,0 +1,18 @@
+# Which model to download
+artifactPath: test-proj-1/test-model-2
+
+# Where the resources should be stored:
+bucketPath: default-bucket/models/examples/tiny-random-test-model-2
+bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
+
+# Download & Upload configuration:
+allowOverwrite: false
+
+# Storage configuration:
+storageClass: mlstorage
+storageQuantity: "20Gi"
+
+# Weights and Biases token:
+wandbTokenSecret:
+  name: wandb-token
+  key: wandb-token
diff --git a/workloads/download-wandb-model-to-bucket/helm/templates/job.yaml b/workloads/download-wandb-model-to-bucket/helm/templates/job.yaml
new file mode 100644
index 0000000..52bd8f8
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/templates/job.yaml
@@ -0,0 +1,103 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "{{ .Release.Name }}-job"
+  {{- if .Values.labels }}
+  labels:
+    {{- range $label, $value := .Values.labels }}
+    {{ $label }}: {{ $value | quote }}
+    {{- end }}
+  {{- end }}
+spec:
+  ttlSecondsAfterFinished: 3600
+  backoffLimit: 0
+  template:
+    spec:
+      restartPolicy: Never
+      {{- if .Values.imagePullSecrets }}
+      imagePullSecrets:
+      {{- range .Values.imagePullSecrets }}
+      - name: {{ . }}
+      {{- end }}
+      {{- end }}
+      containers:
+      - name: wandb-to-bucket
+        image: {{ .Values.image }}
+        imagePullPolicy: Always
+        workingDir: /app
+        command:
+        - sh
+        - -e
+        - -u
+        - -c
+        args:
+        - |
+          ###################################
+          echo 'Setting up minio'
+          mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY}
+          ###################################
+          echo 'Downloading the artifact from wandb to the container'
+          {{- $safeArtifactPath := printf "'%s'" (.Values.artifactPath | replace "'" "'\\''") }}
+          wandb artifact get --type {{ .Values.artifactType }} {{ $safeArtifactPath }} --root local_artifact
+          ###################################
+          echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" | replace "'" "'\\''" }}'
+          {{- $remotePath := printf "minio-host/'%s'/" (.Values.bucketPath | trimSuffix "/" | replace "'" "'\\''") }}
+          mc mirror \
+          {{- if .Values.allowOverwrite }}
+            --overwrite \
+          {{- end }}
+            local_artifact/ {{ $remotePath }}
+        env:
+        - name: WANDB_API_KEY
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.wandbTokenSecret.name }}
+              key: {{ .Values.wandbTokenSecret.key }}
+        - name: BUCKET_STORAGE_HOST
+          value: {{ .Values.bucketStorageHost }}
+        - name: BUCKET_STORAGE_ACCESS_KEY
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.bucketCredentialsSecret.name }}
+              key: {{ .Values.bucketCredentialsSecret.accessKeyKey }}
+        - name: BUCKET_STORAGE_SECRET_KEY
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.bucketCredentialsSecret.name }}
+              key: {{ .Values.bucketCredentialsSecret.secretKeyKey }}
+        resources:
+          requests:
+            memory: 1Gi
+            cpu: 1
+          limits:
+            memory: 1Gi
+            cpu: 1
+        volumeMounts:
+        - mountPath: /app
+          name: {{ .Release.Name }}-volume
+        securityContext:
+          allowPrivilegeEscalation: false
+          runAsNonRoot: true
+          runAsUser: 1000
+          runAsGroup: 1000
+          seccompProfile:
+            type: RuntimeDefault
+          capabilities:
+            drop: ["ALL"]
+      securityContext:
+        fsGroup: 1000
+      volumes:
+      - name: {{ .Release.Name }}-volume
+        {{- if .Values.storageClass }}
+        ephemeral:
+          volumeClaimTemplate:
+            spec:
+              accessModes: [ "ReadWriteOnce" ]
+              storageClassName: {{ .Values.storageClass }}
+              resources:
+                requests:
+                  storage: "{{ .Values.storageQuantity }}"
+        {{- else }}
+        emptyDir:
+          sizeLimit: "{{ .Values.storageQuantity }}"
+        {{- end }}
diff --git a/workloads/download-wandb-model-to-bucket/helm/values.yaml b/workloads/download-wandb-model-to-bucket/helm/values.yaml
new file mode 100644
index 0000000..993dd0e
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/values.yaml
@@ -0,0 +1,34 @@
+### General chart values ###
+image: ghcr.io/silogen/logistics:v0.2
+
+# Use to add labels to the metadata of the resources created by this workload.
+labels: {}
+  # Example:
+  # labels:
+  #   kaiwo.silogen.ai/managed: "true"
+
+# Extra settings, such as imagePullSecrets
+imagePullSecrets: []
+  # Example:
+  # imagePullSecrets:
+  #   - "regcred"
+
+# Configure these to match the credentials in your cluster:
+bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
+bucketCredentialsSecret:
+  name: minio-credentials
+  accessKeyKey: minio-access-key
+  secretKeyKey: minio-secret-key
+
+# Secret reference that contains the Weights and Biases token
+wandbTokenSecret:
+  name: wandb-token
+  key: wandb-token
+
+# Inputs:
+artifactPath: "" # wandb artifact path, in the format project/artifact-name
+artifactType: model # wandb artifact type, e.g. model or dataset
+bucketPath: "" # Path in the bucket storage where this model should be stored. In the format bucket-name/path/separated/by/slashes/name-for-resulting-directory
+allowOverwrite: false # Optionally set to true to allow overwriting existing files in the bucket
+storageQuantity: 64Gi # How much space needs to be allocated to store the model in the container (before pushing to bucket storage).
+storageClass: mlstorage # Set this to use a specific storageClass for the storage. If not specified, will simply use an ephemeral_storage request.
diff --git a/workloads/llm-evaluation-judge/helm/overrides/prometheus-Qwen2_5_3B_instruct-cnn_dailymail.yaml b/workloads/llm-evaluation-judge/helm/overrides/Qwen2_5_3B_instruct-llama-3.2-3B.yaml
similarity index 75%
rename from workloads/llm-evaluation-judge/helm/overrides/prometheus-Qwen2_5_3B_instruct-cnn_dailymail.yaml
rename to workloads/llm-evaluation-judge/helm/overrides/Qwen2_5_3B_instruct-llama-3.2-3B.yaml
index 85baf06..b6b91c5 100644
--- a/workloads/llm-evaluation-judge/helm/overrides/prometheus-Qwen2_5_3B_instruct-cnn_dailymail.yaml
+++ b/workloads/llm-evaluation-judge/helm/overrides/Qwen2_5_3B_instruct-llama-3.2-3B.yaml
@@ -1,4 +1,4 @@
-# Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset
+# Overrides file for running judge evaluation, using Llama-3.2-3B-Instruct to judge Qwen2.5-3B-Instruct on the default dataset
 general:
   job_name: judge-job-3container-qwen
 model_inference_container:
diff --git a/workloads/llm-evaluation-judge/helm/overrides/llama-3.2-3B.yaml b/workloads/llm-evaluation-judge/helm/overrides/llama-3.2-3B.yaml
new file mode 100644
index 0000000..3f48ea0
--- /dev/null
+++ b/workloads/llm-evaluation-judge/helm/overrides/llama-3.2-3B.yaml
@@ -0,0 +1,14 @@
+# Overrides file for running judge evaluation, using Llama-3.2-3B-Instruct to judge Llama-3.2-3B-Instruct on the default dataset
+general:
+  job_name: judge-job-s3-llama-3.2-3B
+model_inference_container:
+  image: rocm/vllm-dev:nightly_main_20250430
+  model: Llama-3.2-3B-Instruct
+  model_path: hf://meta-llama/Llama-3.2-3B-Instruct
+judge_inference_container:
+  image: rocm/vllm-dev:nightly_main_20250430
+  model: Llama-3.2-3B-Instruct
+  model_path: s3://default-bucket/models/meta-llama/Llama-3.2-3B-Instruct
+judge_evaluation_container:
+  image: ghcr.io/silogen/evaluation-workloads-metrics:v0.1
+  use_data_subset: 0
diff --git a/workloads/llm-evaluation-judge/helm/overrides/prometheus-llama_3_8b-cnn_dailymail.yaml b/workloads/llm-evaluation-judge/helm/overrides/prometheus-llama_3_8b-cnn_dailymail.yaml
deleted file mode 100644
index 4738fcf..0000000
--- a/workloads/llm-evaluation-judge/helm/overrides/prometheus-llama_3_8b-cnn_dailymail.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-# Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset
-general:
-  job_name: judge-job-minio
-model_inference_container:
-  image: rocm/vllm-dev:20241205-tuned
-  model: llama-3.2-3B
-  model_path: hf://meta-llama/Llama-3.2-3B-Instruct
-judge_inference_container:
-  model: llama-3.2-3B
-  model_path: s3://default-bucket/models/meta-llama/Llama-3.2-3B-Instruct
-judge_evaluation_container:
-  image: ghcr.io/silogen/evaluation-workloads-metrics:v0.1
-  use_data_subset: 5
diff --git a/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml b/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml
index 7d15fec..6c41581 100644
---
a/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml +++ b/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml @@ -144,16 +144,6 @@ spec: requests: memory: "{{ .Values.judge_evaluation_container.memory }}" cpu: "{{ .Values.judge_evaluation_container.cpu_count }}" - startupProbe: - exec: - command: - - sh - - -c - - | - curl -sf http://localhost:8080/health && curl -sf http://localhost:8081/health - initialDelaySeconds: 60 - periodSeconds: 10 - failureThreshold: 30 command: ["sh", "-c"] args: - | diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml index b2d0fc2..52cb1b1 100644 --- a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml +++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml @@ -1,5 +1,5 @@ model_inference_container: - image: rocm/vllm-dev:20241205-tuned + image: rocm/vllm-dev:nightly_main_20250430 evaluation_container: image: ghcr.io/silogen/evaluation-workloads-metrics:v0.1 dataset_path: FiscalNote/billsum diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml index 2f220e0..ab463b3 100644 --- a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml +++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml @@ -1,6 +1,6 @@ # Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset model_inference_container: - image: rocm/vllm-dev:20241205-tuned + image: rocm/vllm-dev:nightly_main_20250430 model: Llama-3.2-3B-Instruct model_path: meta-llama/Llama-3.2-3B-Instruct evaluation_container: diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml index 85de2bd..3e39cd3 100644 --- a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml +++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml @@ -1,6 +1,6 @@ # Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset model_inference_container: - image: rocm/vllm-dev:20241205-tuned + image: rocm/vllm-dev:nightly_main_20250430 model: Llama-3.2-3B-Instruct model_path: meta-llama/Llama-3.2-3B-Instruct evaluation_container: diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml new file mode 100644 index 0000000..97d79b8 --- /dev/null +++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml @@ -0,0 +1,12 @@ +model_inference_container: + image: rocm/vllm-dev:nightly_main_20250430 + model: Llama-3.2-3B-Instruct + model_path: meta-llama/Llama-3.2-3B-Instruct +evaluation_container: + image: ghcr.io/silogen/evaluation-workloads-metrics-debug:v0.1 + use_data_subset: 3 +storage: + mlflow: + server_uri: http://10.242.3.71:8082 + experiment_name: metrics-demo-experiment + run_name: metrics-demo-run 
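+
+# A minimal usage sketch (hypothetical release name; the server_uri above is an
+# environment-specific assumption), mirroring the run examples elsewhere in this repo:
+#
+#   helm template "metrics-mlflow" workloads/llm-evaluation-metrics/helm \
+#     -f workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml \
+#     | kubectl create -f -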
diff --git a/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml b/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml index a4979d8..a4178b6 100644 --- a/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml +++ b/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml @@ -83,7 +83,7 @@ spec: echo "Running evaluation:\nDownloading Dataset, Running inference, Evaluating inferences with bertscore..."; python3 run_inference_and_metrics_evaluation.py \ --llm-base-url="http://localhost" \ - --evaluation-dataset="{{ .Values.evaluation_container.dataset_path }}" \ + --evaluation-dataset-name="{{ .Values.evaluation_container.dataset_path }}" \ --evaluation-dataset-version="{{ .Values.evaluation_container.dataset_version }}" \ --dataset-split="{{ .Values.evaluation_container.dataset_split }}" \ --prompt-template-path="{{ .Values.evaluation_container.prompt_template_path }}" \ @@ -95,7 +95,10 @@ spec: --context-column-name="{{ .Values.evaluation_container.dataset_info.context_column_name}}" \ --id-column-name="{{ .Values.evaluation_container.dataset_info.id_column_name}}" \ --gold-standard-column-name="{{ .Values.evaluation_container.dataset_info.gold_standard_column_name}}" \ - --use-data-subset="{{ .Values.evaluation_container.use_data_subset}}" ; + --use-data-subset="{{ .Values.evaluation_container.use_data_subset}}" \ + --mlflow-server-uri="{{ .Values.storage.mlflow.server_uri }}" \ + --mlflow-experiment-name="{{ .Values.storage.mlflow.experiment_name }}" \ + --mlflow-run-name="{{ .Values.storage.mlflow.run_name }}" ; env: - name: TRANSFORMERS_CACHE value: /HF_HOME diff --git a/workloads/llm-evaluation-metrics/helm/values.yaml b/workloads/llm-evaluation-metrics/helm/values.yaml index 7338287..e146c0e 100644 --- a/workloads/llm-evaluation-metrics/helm/values.yaml +++ b/workloads/llm-evaluation-metrics/helm/values.yaml @@ -33,3 +33,7 @@ storage: - ReadWriteOnce bucket_storage_host: minio.minio-tenant-default.svc.cluster.local:80 bucket_storage_bucket: default-bucket + mlflow: + server_uri: http://10.242.3.198:8082 + experiment_name: mlflow-experiment + run_name: mlflow-run diff --git a/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl b/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl index d1b22f6..e27a123 100644 --- a/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl +++ b/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl @@ -3,9 +3,10 @@ # Setup MinIO mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY} # Sync checkpoints from remote to local +{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} {{- if .Values.checkpointsRemote }} -if mc mirror minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ /workdir/checkpoints 2>/dev/null; then - echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote}} to /workdir/checkpoints' +if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then + echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''"}} to /workdir/checkpoints' ls -lah /workdir/checkpoints else echo 'No checkpoints found yet' @@ -17,12 +18,13 @@ fi {{- define "finetuningAndUploadEntrypoint" -}} # Print GPU Info: rocm-smi +{{- $checkpointsRemotePath := printf "minio-host/'%s'/" 
(.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }}
 {{- if .Values.checkpointsRemote }}
 echo "Starting checkpoint sync process"
 mc mirror \
   --watch \
   /workdir/checkpoints \
-  minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ &
+  {{ $checkpointsRemotePath }} &
 uploadPID=$!
 {{- end }}
 # Run training:
@@ -36,7 +38,7 @@
 wait $uploadPID || true
 echo 'Training done, syncing once more...'
 mc mirror \
   /workdir/checkpoints \
-  minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/
+  {{ $checkpointsRemotePath }}
 {{- end }}
 echo 'All done, exiting'
 {{- end }}
diff --git a/workloads/llm-finetune-axolotl/helm/values.yaml b/workloads/llm-finetune-axolotl/helm/values.yaml
index 3ed333d..9f3234e 100644
--- a/workloads/llm-finetune-axolotl/helm/values.yaml
+++ b/workloads/llm-finetune-axolotl/helm/values.yaml
@@ -27,4 +27,4 @@ finetuningGpus: 1
 configFile: # name of config file to use, include the file in the mount/ directory
 
 ### Model output path ###
-checkpointsRemote: # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket)
+checkpointsRemote: "" # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket
diff --git a/workloads/llm-finetune-llama-factory/helm/README.md b/workloads/llm-finetune-llama-factory/helm/README.md
index d213a5b..a495741 100644
--- a/workloads/llm-finetune-llama-factory/helm/README.md
+++ b/workloads/llm-finetune-llama-factory/helm/README.md
@@ -2,7 +2,6 @@
 
 This is a Helm Chart for running a finetuning job using [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)
 
-Currently the base model and input data are assumed to be from HuggingFace, or some other source directly supported by LLaMA-Factory.
 The output is saved with MinIO in the directory specified by `checkpointsRemote`.
 
 ## Configuration
@@ -22,6 +21,22 @@ helm template workloads/llm-finetune-llama-factory/helm \
   | kubectl create -f -
 ```
 
+## Data specification
+
+Specify the name of the dataset used for training as `dataset`. This can include datasets predefined in LLaMA-Factory or those defined in `datasetInfo`. Use commas to separate multiple datasets.
+
+To use other datasets, create an entry in `datasetInfo` following the [LLaMA-Factory dataset info format](https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md). Note that LLaMA-Factory directly supports loading datasets from HuggingFace, ModelScope, or S3/GCS cloud storage by setting the URLs according to the documentation.
+
+This workload adds a custom way to load data from MinIO. In `datasetInfo`, specify the path to the dataset in the remote bucket as `pathRemote`, and the workload will load the file and update the configuration. See the override file [`overrides/finetune-model_data_from_minio.yaml`](overrides/finetune-model_data_from_minio.yaml) for an example of finetuning where the data and model are loaded from MinIO.
+
+## Model specification
+
+To use a base model from HuggingFace or another source directly supported by LLaMA-Factory, specify the model name in `modelName`.
+
+Alternatively, to use a model from MinIO, specify the path to the model in `modelRemote`.
+
+Either `modelName` or `modelRemote` must be specified. If both are included, the model from `modelRemote` is used.
+
 ## Cleanup
 
 After the jobs are completed, please delete the resources created. In particular for multi-node ray jobs, a `PersistentVolumeClaim` is used as shared storage and persists on the cluster after the job is completed.
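+
+As a sketch, the same rendered manifests can be piped to `kubectl delete` (assuming the chart path and overrides file used at creation time; adjust to your setup):
+
+```bash
+helm template workloads/llm-finetune-llama-factory/helm \
+    -f workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml \
+    | kubectl delete -f -
+```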
@@ -37,7 +52,7 @@ helm template workloads/llm-finetune-llama-factory/helm \
 
 ## Multi-node finetuning with ray
 
-The chart supports multi-node jobs by setting `nodes` to an integer greater than 1. Doing so enables ray and creates a RayJob instead. An example config is provided in [`overrides/finetune-lora-ray.yaml`](overrides/finetune-lora-ray.yaml)
+The chart supports multi-node jobs by setting `nodes` to an integer greater than 1. Doing so enables ray and creates a RayJob instead. An example config is provided in [`overrides/finetune-lora-ray.yaml`](overrides/finetune-lora-ray.yaml). The example also shows how to use [DeepSpeed ZeRO Stage 2](https://deepspeed.readthedocs.io/en/latest/zero3.html) to partition the gradients. To enable DeepSpeed, set the `deepspeed` parameter in the LLaMA-Factory config to point to one of the [deepspeed configs](https://github.com/hiyouga/LLaMA-Factory/tree/main/examples/deepspeed) included in LLaMA-Factory or a dictionary.
 
 When configuring ray jobs, the resources you are requesting (`nodes` and `gpusPerNode`) are automatically specified for LLaMA-Factory, and do not need to be included separately in the `llamaFactoryConfig`.
diff --git a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml
index 7dd542b..c2835c7 100644
--- a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml
@@ -1,12 +1,18 @@
+### Model ###
+modelName: meta-llama/Llama-3.1-8B-Instruct
+
+### Data ###
+dataset: identity,alpaca_en_demo
+
+### Model output path ###
+checkpointsRemote: "default-bucket/experiments/llama3-8b-llama-factory-lora"
+
 # Resources:
 checkpointsReservedSize: 10Gi
 nodes: 2
 gpusPerNode: 1
 memoryPerNode: 32Gi
 
-### Model output path ###
-checkpointsRemote: "default-bucket/experiments/llama3-8b-llama-factory-lora"
-
 hfTokenSecret:
   name: hf-token
   key: hf-token
@@ -15,7 +21,6 @@ hfTokenSecret:
 ### this example adapted from https://github.com/hiyouga/LLaMA-Factory/blob/main/examples/train_lora/llama3_lora_sft_ray.yaml
 llamaFactoryConfig:
   ### model
-  model_name_or_path: meta-llama/Llama-3.1-8B-Instruct # or use local absolute path
   trust_remote_code: true
 
   ### method
@@ -24,10 +29,9 @@ llamaFactoryConfig:
   finetuning_type: lora
   lora_rank: 8
   lora_target: all
+  deepspeed: /workspace/LLaMA-Factory/examples/deepspeed/ds_z2_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
 
   ### dataset
-  dataset: identity,alpaca_en_demo
-  dataset_dir: REMOTE:llamafactory/demo_data # or use local absolute path
   template: llama3
   cutoff_len: 2048
   max_samples: 1000
@@ -62,11 +66,3 @@ llamaFactoryConfig:
   warmup_ratio: 0.1
   bf16: true
   ddp_timeout: 180000000
-  resume_from_checkpoint: null
-
-  ### eval
-  # eval_dataset: alpaca_en_demo
-  # val_size: 0.1
-  # per_device_eval_batch_size: 1
-  # eval_strategy: steps
-  # eval_steps: 500
diff --git a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml
index 1bf8a3f..e7f5d83 100644
--- a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml
@@ -1,9 +1,15 @@
-# Resources:
-checkpointsReservedSize: 10Gi
+### Model ###
+modelName: meta-llama/Llama-3.1-8B-Instruct
+
+### Data ###
+dataset: identity,alpaca_en_demo
 
 ### Model output path ###
 checkpointsRemote:
"default-bucket/experiments/llama3-8b-llama-factory-lora" +# Resources: +checkpointsReservedSize: 10Gi + hfTokenSecret: name: hf-token key: hf-token @@ -12,7 +18,6 @@ hfTokenSecret: ### this example from https://github.com/hiyouga/LLaMA-Factory/blob/main/examples/train_lora/llama3_lora_sft.yaml llamaFactoryConfig: ### model - model_name_or_path: meta-llama/Llama-3.1-8B-Instruct trust_remote_code: true ### method @@ -23,8 +28,6 @@ llamaFactoryConfig: lora_target: all ### dataset - dataset: identity,alpaca_en_demo - dataset_dir: REMOTE:llamafactory/demo_data # or use local absolute path template: llama3 cutoff_len: 2048 max_samples: 1000 @@ -49,11 +52,3 @@ llamaFactoryConfig: warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 - resume_from_checkpoint: null - - ### eval - # eval_dataset: alpaca_en_demo - # val_size: 0.1 - # per_device_eval_batch_size: 1 - # eval_strategy: steps - # eval_steps: 500 diff --git a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-model_data_from_minio.yaml b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-model_data_from_minio.yaml new file mode 100644 index 0000000..9a70ec3 --- /dev/null +++ b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-model_data_from_minio.yaml @@ -0,0 +1,64 @@ +### Model ### +modelRemote: "default-bucket/models/tiny-llama/tinyllama-1.1b-chat-v1.0" + +### Data ### +# list datasets to use, can include datasets predefined in LLaMA-Factory or those defined in datasetInfo +dataset: argilla +# for remote datasets to be loaded from MinIO, specify the path to the dataset in the remote bucket as pathRemote +datasetInfo: + argilla: + pathRemote: "default-bucket/datasets/argilla-mistral-large-human-prompts.jsonl" + formatting: sharegpt + columns: + messages: "messages" + tags: + role_tag: "role" + content_tag: "content" + user_tag: "user" + assistant_tag: "assistant" + system_tag: "system" + +### Model output path ### +checkpointsRemote: "default-bucket/experiments/tinyllama-argilla-llama-factory-lora" +resumeFromCheckpoint: true + +# Resources: +checkpointsReservedSize: 10Gi + +### llama-factory config ### +llamaFactoryConfig: + ### model + trust_remote_code: true + + ### method + stage: sft + do_train: true + finetuning_type: lora + lora_rank: 8 + lora_target: all + + ### dataset + template: llama2 + cutoff_len: 8192 + max_samples: 1000 + overwrite_cache: true + preprocessing_num_workers: 16 + dataloader_num_workers: 4 + + ### output + logging_steps: 10 + save_steps: 500 + plot_loss: true + overwrite_output_dir: true + save_only_model: false + report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + + ### train + per_device_train_batch_size: 1 + gradient_accumulation_steps: 8 + learning_rate: 1.0e-4 + num_train_epochs: 3.0 + lr_scheduler_type: cosine + warmup_ratio: 0.1 + bf16: true + ddp_timeout: 180000000 diff --git a/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl b/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl index 645aceb..69f9319 100644 --- a/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl +++ b/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl @@ -88,4 +88,8 @@ spec: mode: 0777 - key: llama_factory_config.yaml path: llama_factory_config.yaml + {{- if .Values.datasetInfo }} + - key: remote_dataset_info.json + path: remote_dataset_info.json + {{- end }} {{- end }} diff --git a/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml b/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml index 
f75cf3b..7d384ea 100644 --- a/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml +++ b/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml @@ -4,6 +4,13 @@ metadata: name: "{{ .Release.Name }}-configs" data: llama_factory_config.yaml: | + {{- if .Values.modelRemote }} + model_name_or_path: /workdir/basemodel + {{- else }} + model_name_or_path: "{{ .Values.modelName }}" + {{- end }} + dataset: {{ .Values.dataset }} + dataset_dir: /workspace/LLaMA-Factory/data output_dir: /workdir/checkpoints {{- if ne (int $.Values.nodes) 1 }} ray_run_name: "{{ .Release.Name }}" @@ -13,35 +20,74 @@ data: GPU: {{ .Values.gpusPerNode }} {{- end }} {{ toYaml .Values.llamaFactoryConfig | indent 4 }} +{{ if .Values.datasetInfo }} + remote_dataset_info.json: | +{{ toPrettyJson .Values.datasetInfo | indent 4 }} +{{- end }} entrypoint.sh: | #!/bin/bash + set -e # Print GPU Info: rocm-smi mkdir -p /workdir/checkpoints - {{- if .Values.checkpointsRemote }} + mkdir -p /workdir/datasets + cd /workspace/LLaMA-Factory + cp /configs/llama_factory_config.yaml llama_factory_config.yaml + {{- if .Values.datasetInfo }} + cp /configs/remote_dataset_info.json remote_dataset_info.json + {{- end }} # Setup MinIO mc alias set minio-host $BUCKET_STORAGE_HOST $BUCKET_STORAGE_ACCESS_KEY $BUCKET_STORAGE_SECRET_KEY + {{- if .Values.modelRemote }} + # copy model from remote to local + mc cp --recursive \ + minio-host/{{ .Values.modelRemote | trimSuffix "/" }}/ \ + /workdir/basemodel + {{- end }} + {{- range .Values.datasetInfo }} + {{- if .pathRemote }} + # copy dataset from remote to local + mc cp \ + minio-host/{{ .pathRemote }} \ + /workdir/datasets/{{ .pathRemote | replace "/" "_" }} + sed -i 's;"pathRemote": "{{ .pathRemote }}";"file_name": "/workdir/datasets/{{ .pathRemote | replace "/" "_" }}";g' remote_dataset_info.json + {{- end }} + {{- end }} + {{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} + {{- if .Values.checkpointsRemote }} + {{- if .Values.resumeFromCheckpoint }} # Sync checkpoints from remote to local - if mc mirror minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ /workdir/checkpoints 2>/dev/null; then - echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote}} to /workdir/checkpoints' + if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then + echo 'Downloaded checkpoints from' {{ $checkpointsRemotePath }} 'to /workdir/checkpoints' ls -lah /workdir/checkpoints + echo "resume_from_checkpoint: /workdir/checkpoints" >> llama_factory_config.yaml else echo 'No checkpoints found yet' fi + {{- end }} echo "Starting checkpoint sync process" mc mirror \ --watch \ /workdir/checkpoints \ - minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ & + {{ $checkpointsRemotePath }} & uploadPID=$! + # Check if the sync process started successfully + sleep 1 + if ! 
ps -p $uploadPID > /dev/null; then
+        echo "ERROR: Sync process failed to start"
+        exit 1
+      fi
      {{- end }}
      # Run training:
      echo "Starting training process"
-      cd LLaMA-Factory/
+      {{- if .Values.datasetInfo }}
+      jq -s add remote_dataset_info.json /workspace/LLaMA-Factory/data/dataset_info.json > dataset_info.json
+      cp dataset_info.json /workspace/LLaMA-Factory/data/dataset_info.json
+      {{- end }}
      {{- if ne (int $.Values.nodes) 1 }}
      export USE_RAY=1
      {{- end }}
-      llamafactory-cli train /configs/llama_factory_config.yaml
+      llamafactory-cli train llama_factory_config.yaml
      {{- if .Values.checkpointsRemote }}
      echo "Training done, stop the upload process"
      kill $uploadPID
@@ -50,6 +96,6 @@ data:
      echo 'Training done, syncing once more...'
      mc mirror --overwrite \
        /workdir/checkpoints \
-       minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/
+       {{ $checkpointsRemotePath }}
      {{- end }}
      echo 'All done, exiting'
diff --git a/workloads/llm-finetune-llama-factory/helm/values.schema.json b/workloads/llm-finetune-llama-factory/helm/values.schema.json
index 95d8a9d..5c62416 100644
--- a/workloads/llm-finetune-llama-factory/helm/values.schema.json
+++ b/workloads/llm-finetune-llama-factory/helm/values.schema.json
@@ -6,6 +6,22 @@
       "type": "string",
       "description": "Container image for finetuning"
     },
+    "modelName": {
+      "type": "string",
+      "description": "Model path in Hugging Face"
+    },
+    "modelRemote": {
+      "type": "string",
+      "description": "Model path in remote MinIO storage, format: bucketName/path/in/bucket"
+    },
+    "dataset": {
+      "type": "string",
+      "description": "Name of the dataset used for training. Use commas to separate multiple datasets."
+    },
+    "datasetInfo": {
+      "type": "object",
+      "description": "Additional datasets can be specified in datasetInfo, according to the LLaMA-Factory dataset format, see https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md"
+    },
     "kaiwo": {
       "type": "object",
       "properties": {
@@ -109,12 +125,14 @@
       "default": "16Gi"
     },
     "checkpointsRemote": {
-      "type": [
-        "string",
-        "null"
-      ],
+      "type": "string",
       "description": "Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket"
     },
+    "resumeFromCheckpoint": {
+      "type": "boolean",
+      "description": "If true, resume from the last checkpoint in checkpointsRemote (if available)",
+      "default": false
+    },
     "hfTokenSecret": {
       "type": "object",
       "properties": {
diff --git a/workloads/llm-finetune-llama-factory/helm/values.yaml b/workloads/llm-finetune-llama-factory/helm/values.yaml
index 06339b8..50e5117 100644
--- a/workloads/llm-finetune-llama-factory/helm/values.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/values.yaml
@@ -1,6 +1,20 @@
 ### General chart values ###
 finetuningImage: ghcr.io/silogen/llama-factory-rocm-pytorch-training:v0.3
 
+### Model ###
+# either modelRemote OR modelName must be set
+# to use a base model directly from Hugging Face, set modelName to the model identifier (e.g., "meta-llama/Llama-3.1-8B-Instruct")
+modelName: ""
+# for remote models to be loaded from MinIO, specify the path to the model in the remote bucket as modelRemote
+modelRemote: ""
+
+### Data ###
+# list datasets to use, can include datasets predefined in LLaMA-Factory or those defined in datasetInfo
+dataset: ""
+# Additional datasets can be specified in datasetInfo, according to the LLaMA-Factory dataset format, see https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md
+# For remote datasets to be loaded from MinIO, specify the path to the dataset in the remote bucket as pathRemote
+datasetInfo: {}
+
 # kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload)
 kaiwo:
   enabled: false
@@ -9,8 +23,10 @@ kaiwo:
 labels: {}
 
 # Extra annotations such as an imagePullSecrets
-imagePullSecrets:
-  - "regcred"
+imagePullSecrets: []
+  # Example:
+  # imagePullSecrets:
+  #   - "regcred"
 
 # Configure these to match the credentials in your cluster:
 bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
@@ -37,7 +53,8 @@ llamaFactoryConfig:
   stage: sft
 
 ### Model output path ###
-checkpointsRemote: # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket)
+checkpointsRemote: "" # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket
+resumeFromCheckpoint: false # Set to true to resume from the last checkpoint in checkpointsRemote (if available)
 
 hfTokenSecret: {} # Optional secret reference that contains the Huggingface token
 # Example:
diff --git a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml
index 1a3ad6d..784283a 100644
--- a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml
@@ -19,8 +19,6 @@ finetuning_config:
   data_conf:
     training_data:
       type: CONCATENATION
-      datasets:
-        - path: "PLACEHOLDER"
     validation_data:
       type: AUTO_SPLIT
       ratio: 0.1
diff --git a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml
index 50ee0f4..b0d5dac 100644
--- a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml
@@ -13,8 +13,6 @@ finetuning_config:
   data_conf:
     training_data:
       type: CONCATENATION
-      datasets:
-        - path: "PLACEHOLDER"
     validation_data:
       type: AUTO_SPLIT
       ratio: 0.1
diff --git a/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml b/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml
index 8f73618..d8426b8 100644
--- a/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml
@@ -13,8 +13,6 @@ finetuning_config:
   data_conf:
     training_data:
      type: CONCATENATION
-      datasets:
-        - path: "PLACEHOLDER"
     validation_data:
       type: AUTO_SPLIT
       ratio: 0.1
diff --git a/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md b/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md
index d229420..3c980b1 100644
--- a/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md
+++ b/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md
@@ -9,16 +9,17 @@ See the various sub-configs for their options. Additional properties are not all
Additional properties are not all | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| method | `const` | | `sft` | `"sft"` | | | data_conf | `object` | ✅ | [ChatTrainValidConfig](#chattrainvalidconfig) | | The data input config | | training_args | `object` | ✅ | [SilogenTrainingArguments](#silogentrainingarguments) | | Transformer TrainingArguments with some restrictions | -| overrides | `object` | | [Overrides](#overrides) | `{"num_train_epochs": null, "lr_multiplier": 1.0, "lr_batch_size_scaling": "none"}` | Override options to simplify the config interface | | batchsize_conf | `object` | ✅ | [BatchsizeConfig](#batchsizeconfig) | | Batch size configuration | -| peft_conf | `object` | ✅ | [NoPeftConfig](#nopeftconfig) or [PretrainedPeftConfig](#pretrainedpeftconfig) or [GenericPeftConfig](#genericpeftconfig) | | Adapter configuration | +| peft_conf | `object` | ✅ | [GenericPeftConfig](#genericpeftconfig) and/or [NoPeftConfig](#nopeftconfig) and/or [PretrainedPeftConfig](#pretrainedpeftconfig) | | Adapter configuration | | run_conf | `object` | ✅ | [RunConfig](#runconfig) | | Model related configuration | -| tracking | `object` or `null` | | [FinetuningTrackingConfig](#finetuningtrackingconfig) | | MLFlow tracking configuration | -| quant_conf | `object` | | [BnBQuantizationConfig](#bnbquantizationconfig) or [NoQuantizationConfig](#noquantizationconfig) | `{"quantization_type": "no-quantization"}` | Quantization configuration | | sft_args | `object` | ✅ | [SFTArguments](#sftarguments) | | SFT specific arguments | +| method | `const` | | `sft` | `"sft"` | | +| overrides | `object` | | [Overrides](#overrides) | `{"lr_multiplier": 1.0, "lr_batch_size_scaling": "none"}` | Override options to simplify the config interface | +| tracking | `object` or `null` | | [FinetuningTrackingConfig](#finetuningtrackingconfig) | | MLFlow tracking configuration | +| quant_conf | `object` | | [BnBQuantizationConfig](#bnbquantizationconfig) and/or [NoQuantizationConfig](#noquantizationconfig) | `{"quantization_type": "no-quantization"}` | Quantization configuration | + --- @@ -33,7 +34,7 @@ Automatic validation split from the training data | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | | type | `const` | ✅ | `AUTO_SPLIT` | | | -| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method | +| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. | | ratio | `number` | | number | `0.2` | Ratio of the training data to use for validation | | seed | `integer` | | integer | `1289525893` | Seed for the random number generator for splitting | @@ -78,11 +79,20 @@ see: https://huggingface.co/docs/transformers/en/main_classes/quantization#trans | bnb_4bit_use_double_quant | `boolean` | | boolean | `False` | | | bnb_4bit_quant_storage | `string` or `null` | | string | | | +## ChatTemplateName + +Chat template to use. + +#### Type: `string` + +**Possible Values:** `mistral-with-system` or `chat-ml` or `poro` or `keep-original` or `simplified-llama31` + ## ChatTrainValidConfig -Training time data configuration. 
+Training time data configuration -Always defines some DataInput for training data and can include validation DataInput, though a trivial NoneDataInput is also allowed for the validation side. +Always defines some DataInput for training data and can include validation DataInput, though a trivial NoneDataInput +is also allowed for the validation side. Additionally includes chat template and padding configurations, as those are part of the data input pipeline. @@ -90,9 +100,9 @@ Additionally includes chat template and padding configurations, as those are par | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| training_data | `object` | ✅ | [ConcatenationDataInput](#concatenationdatainput) or [WeightedMixDataInput](#weightedmixdatainput) | | | -| validation_data | `object` | ✅ | [AutoSplitDataInput](#autosplitdatainput) or [ConcatenationDataInput](#concatenationdatainput) or [NoneDataInput](#nonedatainput) | | | -| chat_template_name | `string` | | `mistral-with-system` or `chat-ml` or `poro` or `keep-original` or `simplified-llama31` | `"mistral-with-system"` | | +| training_data | `object` | ✅ | [ConcatenationDataInput](#concatenationdatainput) and/or [WeightedMixDataInput](#weightedmixdatainput) | | | +| validation_data | `object` | ✅ | [AutoSplitDataInput](#autosplitdatainput) and/or [ConcatenationDataInput](#concatenationdatainput) and/or [NoneDataInput](#nonedatainput) | | | +| chat_template_name | `string` | | [ChatTemplateName](#chattemplatename) | `"mistral-with-system"` | | | padding_side | `string` | | string | `"right"` | Padding side, right is usually right. | | missing_pad_token_strategy | `string` | | [MissingPadTokenStrategy](#missingpadtokenstrategy) | `"bos-repurpose"` | See the MissingPadTokenStrategys for descriptions of the options | @@ -117,7 +127,7 @@ For DPO this means lines of: | -------- | ---- | -------- | --------------- | ------- | ----------- | | type | `const` | ✅ | `CONCATENATION` | | | | datasets | `array` | ✅ | [DatasetDefinition](#datasetdefinition) | | | -| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method | +| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. | ## DatasetDefinition @@ -137,11 +147,11 @@ Settings that define how run details are logged | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| mlflow_server_uri | `string` | ✅ | string | | MLflow server URI. Can be local path | -| experiment_name | `string` | ✅ | string | | Experiment name that is used for MLFlow tracking | -| run_id | `string` or `null` | | string | | Run id, to resume logging to previousely started run | -| run_name | `string` or `null` | | string | | Run name, to give meaningful name to the run to be displayed in MLFlow UI. Used only when run_id is unspecified | -| hf_mlflow_log_artifacts | `string` | | string | `"False"` | Whether to store model artifacts in MLFlow | +| mlflow_server_uri | `string` | ✅ | string | | MLflow server URI. Can be local path. | +| experiment_name | `string` | ✅ | string | | Experiment name that is used for MLFlow tracking. | +| run_id | `string` or `null` | | string | | Run id, to resume logging to previously started run. 
| +| run_name | `string` or `null` | | string | | Run name, to give meaningful name to the run to be displayed in MLFlow UI. Used only when run_id is unspecified. | +| hf_mlflow_log_artifacts | `string` | | string | `"False"` | Whether to store model artifacts in MLFlow. | ## GenericPeftConfig @@ -150,7 +160,8 @@ Config for any new initialized PEFT Adapter See https://huggingface.co/docs/peft/tutorial/peft_model_config for the possible kwargs and https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/peft_types.py for the types. -### Example +Example: + >>> loaded_data = {'peft_type':'LORA', 'task_type': 'CAUSAL_LM', ... 'peft_kwargs': {'r': 32, 'target_modules': ['v_proj']}} >>> generic_conf = GenericPeftConfig(**loaded_data) @@ -171,8 +182,6 @@ and https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/peft_types.py | task_type | `string` | | [TaskType](#tasktype) | `"CAUSAL_LM"` | | | peft_kwargs | `object` | | object | | | - - ## MissingPadTokenStrategy Specifies the available missing pad token strategies. @@ -207,9 +216,10 @@ See parameter docstrings and help at: https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained See below in "Parameters for big model inference" too, it affects training too. Also note that this link takes you to the transformers main branch version - be sure to compare with the installed version of transformers (that keeps -changing over time, and it is difficult to keep this doctstring up to date, so we wanted to link to the latest here). +changing over time, and it is difficult to keep this docstring up to date, so we wanted to link to the latest here). Some important parameters to consider are: + - device_map : A map that specifies where each submodule should go. It doesn’t need to be refined to each parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the same device. If we only pass @@ -230,26 +240,26 @@ NOTE: | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | | torch_dtype | `const` | | `auto` | `"auto"` | | -| device_map | `object` or `string` or `null` | | object and/or string | | Custom device map so that you can manually override the choices that HuggingFace would make. This can also be a string to specify "auto", "balanced_low_0", or "sequential" | +| device_map | `object` or `string` or `null` | | object and/or string | | Custom device map so that you can manually override the choices that HuggingFace would make. This can also be a string to specify "auto", "balanced_low_0", or "sequential". | | max_memory | `object` or `null` | | object | | | | low_cpu_mem_usage | `boolean` | | boolean | `False` | | -| attn_implementation | `string` or `null` | | string | | Note: this can be set to "sdpa", "flash_attention_2", "eager" | +| attn_implementation | `string` or `null` | | string | | Note: this can be set to "sdpa", "flash_attention_2", "eager". | | offload_folder | `string` or `null` | | string | | | | offload_state_dict | `boolean` or `null` | | boolean | | Default is True if offloading (otherwise no effect) | | offload_buffers | `boolean` or `null` | | boolean | | | -| use_cache | `boolean` | | boolean | `True` | Saves generated hidden states to speed up generation. 
See: https://discuss.huggingface.co/t/what-is-the-purpose-of-use-cache-in-decoder/958 use_cache is mutually exclusive with gradient_checkpointing | +| use_cache | `boolean` | | boolean | `true` | Saves generated hidden states to speed up generation, see: https://discuss.huggingface.co/t/what-is-the-purpose-of-use-cache-in-decoder/958 This is mutually exclusive with gradient_checkpointing. | | cache_dir | `string` or `null` | | string | | | -| force_download | `boolean` | | boolean | `False` | | -| local_files_only | `boolean` | | boolean | `False` | | +| force_download | `boolean` | | boolean | `False` | | +| local_files_only | `boolean` | | boolean | `False` | | | proxies | `object` or `null` | | object | | | -| resume_download | `boolean` | | boolean | `False` | | +| resume_download | `boolean` | | boolean | `False` | | | revision | `string` | | string | `"main"` | | | code_revision | `string` | | string | `"main"` | | | subfolder | `string` or `null` | | string | | | | token | `string` or `null` | | string | | | | use_safetensors | `boolean` or `null` | | boolean | | | | variant | `string` or `null` | | string | | | -| trust_remote_code | `boolean` | | boolean | `False` | Warning: if set to `True`, allows execution of downloaded remote code | +| trust_remote_code | `boolean` | | boolean | `False` | Warning: if set to True, allows execution of downloaded remote code. | ## NoPeftConfig @@ -280,23 +290,20 @@ A special type for not using data e.g. in validation | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | | type | `const` | ✅ | `NONE` | | | -| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method | +| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. | ## Overrides -Override options that allow simple interfaces for charts using these configs +Override options -This is particularly useful for a helm chart interface where we include the finetuning package config -as a part of the values.yaml file. These a more flexible helm interface with certain keys brought to the -top level. +These implement dynamic scaling for the learning rate. #### Type: `object` | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| num_train_epochs | `integer` or `number` or `null` | | number | | Overrides the number of epochs in the training_args | | lr_multiplier | `number` | | number | `1.0` | Multiplier applied to the learning rate in the training_args | -| lr_batch_size_scaling | `string` | | `none` `sqrt` `linear` | `"none"` | Scales the learning rate in the training_args by a factor derived from the total training batch size. `none`: No scaling. `sqrt`: Multiplies learning rate by square root of batch size (a classic scaling rule). `linear`: Multiplies learning rate by the batch size (a more modern scaling rule). | +| lr_batch_size_scaling | `string` | | `none` `sqrt` `linear` | `"none"` | Scales the learning rate in the training_args by a factor derived from the total training batch size. 'none': No scaling. 'sqrt': Multiplies learning rate by square root of batch size (a classic scaling rule). 'linear': Multiplies learning rate by the batch size (a more modern scaling rule). 
| ## PeftType @@ -335,7 +342,7 @@ PEFT adapter uses the config and initialisation from a pretrained adapter | Property | Type | Required | Possible values | Description | | -------- | ---- | -------- | --------------- | ----------- | | peft_type | `const` | ✅ | `PRETRAINED_PEFT` | | -| name_or_path | `string` | ✅ | string | HF ID or path to the pretrained peft | +| name_or_path | `string` | ✅ | string | HF ID or path to the pretrained peft. | ## RunConfig @@ -345,12 +352,13 @@ Experiment running configuration | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| model | `string` | | string | `"/local_resources/basemodel"` | Local path to model to be fine-tuned. Normally this should be `/local_resources/basemodel` | +| model | `string` | | string | `"/local_resources/basemodel"` | Local path to model to be fine-tuned. Normally this should be /local_resources/basemodel | | model_args | `object` | | [ModelArguments](#modelarguments) | `{"torch_dtype": "auto", "device_map": "auto", "max_memory": null, "low_cpu_mem_usage": false, "attn_implementation": null, "offload_folder": null, "offload_state_dict": null, "offload_buffers": null, "use_cache": true, "cache_dir": null, "force_download": false, "local_files_only": false, "proxies": null, "resume_download": false, "revision": "main", "code_revision": "main", "subfolder": null, "token": null, "use_safetensors": null, "variant": null, "trust_remote_code": false}` | | | tokenizer | `string` or `null` | | string | | Model HuggingFace ID, or path, or None to use the one associated with the model | -| use_fast_tokenizer | `boolean` | | boolean | `True` | Use the Fast version of the tokenizer. The 'slow' version may be compatible with more features. | -| resume_from_checkpoint | `boolean` or `string` | | boolean and/or string | `False` | Normally should be set to 'auto' to continue if a checkpoint exists. Can set to `True` to always try to continue, `False` to never try, or a path to load from a specific path. | +| use_fast_tokenizer | `boolean` | | boolean | `true` | Use the Fast version of the tokenizer. The 'slow' version may be compatible with more features. | +| resume_from_checkpoint | `boolean` or `string` | | boolean and/or string | | Normally should be set to 'auto' to continue if a checkpoint exists. Can set to True to always try to continue, False to never try, or a path to load from a specific path. | | final_checkpoint_name | `string` | | string | `"checkpoint-final"` | Name of final checkpoint. Should be left as default | +| determinism | `string` | | `no` `half` `full` | `"no"` | Set the level of determinism in implementations. Deterministic implementations are not always available, and when they are, they are usually slower than their non-deterministic counterparts. Recommended for debugging only. 'no': No determinism. 'half': Prefer deterministic implementations. 'full': Only fully deterministic implementations, error out on operations that only have non-deterministic implementations. 
| ## SFTArguments @@ -425,5 +433,5 @@ For DPO this means lines of: | -------- | ---- | -------- | --------------- | ------- | ----------- | | type | `const` | ✅ | `PRECOMPUTE_WEIGHTED_MIX` | | | | datasets | `array` | ✅ | [WeightedDatasetDefinition](#weighteddatasetdefinition) | | | -| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method | +| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. | | seed | `integer` | | integer | `19851243` | Seed for the random number generator for interleaving draws | diff --git a/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl b/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl index 75c4189..c1be6ca 100644 --- a/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl +++ b/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl @@ -4,29 +4,30 @@ echo 'Copying resources to container...'; mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY} mc cp --recursive \ - minio-host/{{ .Values.basemodel | trimSuffix "/" }}/ \ + minio-host/'{{ .Values.basemodel | trimSuffix "/" }}'/ \ /local_resources/basemodel {{- if $.Values.trainingData }} mc cp \ - minio-host/{{ $.Values.trainingData }} \ - /local_resources/{{ $.Values.trainingData | replace "/" "_" }} + minio-host/'{{ $.Values.trainingData | replace "'" "'\\''" }}' \ + /local_resources/'{{ $.Values.trainingData | replace "'" "'\\''" | replace "/" "_" }}' {{- else }} {{- range .Values.finetuning_config.data_conf.training_data.datasets }} mc cp \ - minio-host/{{ .path }} \ - /local_resources/{{ .path | replace "/" "_" }} + minio-host/'{{ .path | replace "'" "'\\''" }}' \ + /local_resources/'{{ .path | replace "'" "'\\''" | replace "/" "_" }}' {{- end }} {{- if (or (eq .Values.finetuning_config.data_conf.validation_data.type "AUTO_SPLIT" ) (eq .Values.finetuning_config.data_conf.validation_data.type "NONE")) }} {{- range .Values.finetuning_config.data_conf.validation_data.datasets }} mc cp \ - minio-host/{{ .path }} \ - /local_resources/{{ .path | replace "/" "_" }} + minio-host/'{{ .path | replace "'" "'\\''" }}' \ + /local_resources/'{{ .path | replace "'" "'\\''" | replace "/" "_" }}' {{- end }} {{- end }} {{- end }} # Sync checkpoints from remote to local -if mc mirror minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ /workdir/checkpoints 2>/dev/null; then - echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote}} to /workdir/checkpoints' +{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} +if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then + echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''" }} to /workdir/checkpoints' ls -lah /workdir/checkpoints else echo 'No checkpoints found yet' @@ -35,25 +36,38 @@ fi {{/* ####################################################################################################################################################### */}} {{- define "finetuningAndUploadEntrypoint" -}} -{{- $logs_path := (default ( .Values.checkpointsRemote | trimSuffix "/" | printf "%s/logs/" ) .Values.logsRemote ) -}} +# quote paths with single quotes to avoid issues with special characters in paths, and replace any existing single quote with an
escaped single quote +{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} +{{- $logsRemotePath := printf "minio-host/'%s'/" ( (default ( .Values.checkpointsRemote | trimSuffix "/" | printf "%s/logs" ) .Values.logsRemote ) | trimSuffix "/" | replace "'" "'\\''") -}} # Print GPU Info: rocm-smi echo "Starting checkpoint sync process" mc mirror \ --watch \ /workdir/checkpoints \ - minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ & + {{ $checkpointsRemotePath }} & uploadPID=$! +sleep 1 # Give some time for the process to start +# Check if the sync process started successfully +if ! ps -p $uploadPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 +fi # Run training: {{- if .Values.runTensorboard }} tensorboard --logdir /workdir/logs --port 6006 & echo "Serving tensorboard on port 6006. Port-forward to access training logs during the training process lifetime." -echo "Also starting logs upload process, uploading to {{ $logs_path }}" +echo "Also starting logs upload process, uploading to {{ $logsRemotePath }}" mc mirror \ --watch \ /workdir/logs \ - minio-host/{{ $logs_path }} & + {{ $logsRemotePath }} & logsPID=$! +sleep 1 +if ! ps -p $logsPID > /dev/null; then + echo "ERROR: Logs sync process failed to start" + exit 1 +fi {{- end }} echo "Starting training process" accelerate launch \ @@ -81,16 +95,16 @@ merge_adapter $merge_base ./checkpoints/checkpoint-final ./checkpoints/checkpoin echo 'Training done, syncing once more...' mc mirror \ /workdir/checkpoints \ - minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ + {{ $checkpointsRemotePath }} {{- if .Values.runTensorboard }} mc mirror \ /workdir/logs \ - minio-host/{{ $logs_path }} + {{ $logsRemotePath }} {{- end }} # Sync the final checkpoint with overwrite to carry over vLLM-compatibility changes mc mirror \ --overwrite \ /workdir/checkpoints/checkpoint-final \ - minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/checkpoint-final/ + {{ $checkpointsRemotePath }}checkpoint-final/ echo 'All done, exiting' {{- end }} diff --git a/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml b/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml index 739122d..ec33a0b 100644 --- a/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml +++ b/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml @@ -47,7 +47,7 @@ data: main_process_port: null mixed_precision: bf16 num_machines: 1 - num_processes: {{ .Values.finetuningGpus }} + num_processes: 1 use_cpu: false {{- else if (eq .Values.distributedType "auto-ddp") }} compute_environment: LOCAL_MACHINE diff --git a/workloads/llm-finetune-verl/helm/Chart.yaml b/workloads/llm-finetune-verl/helm/Chart.yaml new file mode 100644 index 0000000..ca3ef76 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: llm-finetune-verl-example +description: VeRL finetuning on SiloGen stack +version: 0.0.1 diff --git a/workloads/llm-finetune-verl/helm/README.md b/workloads/llm-finetune-verl/helm/README.md new file mode 100644 index 0000000..6408efa --- /dev/null +++ b/workloads/llm-finetune-verl/helm/README.md @@ -0,0 +1,49 @@ +# Finetuning with VeRL + +This is a Helm Chart for running a finetuning job using [VeRL](https://github.com/volcengine/verl). + +The output is saved to MinIO, in the location specified by `checkpointsRemote`.
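+
+For example, once the job is running you can follow the synced output in bucket storage with the MinIO client. This is a minimal sketch, assuming an `mc` alias named `minio-host` pointing at your cluster's MinIO (the same alias name the job itself uses) and the `checkpointsRemote` value from the PPO override file; adjust both to your setup:
+
+```bash
+# List everything the job has mirrored to the checkpoint path so far
+mc ls --recursive minio-host/default-bucket/experiments/Qwen2_7B_Instruct_PPO_gsm8k_verl/
+```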
+ +## Configuration + +Include any parameters for VeRL in the `verlConfig` parameter. See the override file [`overrides/ppo_qwen_gsm8k.yaml`](overrides/ppo_qwen_gsm8k.yaml) for an example and the [VeRL documentation](https://verl.readthedocs.io/en/latest/examples/config.html) for more details. + +## Running the workload + +The simplest way is to run `helm template` and pipe the result to `kubectl create`. + +Example command using the example override file `overrides/ppo_qwen_gsm8k.yaml`: + +```bash +helm template workloads/llm-finetune-verl/helm \ + --values workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml \ + --name-template ppo-qwen-gsm8k-verl \ + | kubectl create -f - +``` + +## Data specification + +VeRL requires the data to be prepared for policy training in a [particular way](https://verl.readthedocs.io/en/latest/preparation/prepare_data.html). + +Some example data preprocessing scripts are provided; to use one of these, specify the name of the dataset used for training as `dataset`. Available datasets are "full_hh_rlhf", "geo3k", "gsm8k", "hellaswag", "math_dataset". + +To use your own datasets from MinIO, specify the path as `datasetRemote`. It should point to a directory with files that have already been appropriately processed (`train.parquet` and `test.parquet`). + +## Model specification + +To use a base model from HuggingFace or another source directly supported by VeRL, specify the model name in `modelName`. + +Alternatively, to use a model from MinIO, specify the path to the model in `modelRemote`. + +Either `modelName` or `modelRemote` must be specified. If both are included, the model from `modelRemote` is used. + +## Cleanup + +After the jobs are completed, please delete the resources created. To do so, you can run the same `helm template` command, replacing `kubectl create` with `kubectl delete`, e.g.: + +```bash +helm template workloads/llm-finetune-verl/helm \ + --values workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml \ + --name-template ppo-qwen-gsm8k-verl \ + | kubectl delete -f - +``` diff --git a/workloads/llm-finetune-verl/helm/overrides/grpo_qwen_gsm8k.yaml b/workloads/llm-finetune-verl/helm/overrides/grpo_qwen_gsm8k.yaml new file mode 100644 index 0000000..99f6801 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/overrides/grpo_qwen_gsm8k.yaml @@ -0,0 +1,45 @@ +### Model ### +modelName: "Qwen/Qwen2-7B-Instruct" + +### Data ### +dataset: "gsm8k" + +# Resources: +checkpointsReservedSize: 512Gi +storageClass: mlstorage +finetuningGpus: 2 +memoryPerGpu: 64 +cpusPerGpu: 8 + +### Model output path ### +checkpointsRemote: "default-bucket/experiments/Qwen2_7B_Instruct_GRPO_gsm8k_verl" + +verlConfig: + algorithm: + adv_estimator: grpo + kl_ctrl: + kl_coef: 0.001 + data: + train_batch_size: 1024 + max_prompt_length: 512 + max_response_length: 1024 + actor_rollout_ref: + model: + use_remove_padding: True + enable_gradient_checkpointing: True + actor: + ppo_micro_batch_size_per_gpu: 80 + use_kl_loss: True + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + rollout: + n: 5 + log_prob_micro_batch_size_per_gpu: 40 + tensor_model_parallel_size: 2 + gpu_memory_utilization: 0.6 + ref: + log_prob_micro_batch_size_per_gpu: 40 + fsdp_config: + param_offload: True + trainer: + total_epochs: 10 diff --git a/workloads/llm-finetune-verl/helm/overrides/kaiwo/kaiwo-enable.yaml b/workloads/llm-finetune-verl/helm/overrides/kaiwo/kaiwo-enable.yaml new file mode 100644 index 0000000..e6d278a --- /dev/null +++ 
b/workloads/llm-finetune-verl/helm/overrides/kaiwo/kaiwo-enable.yaml @@ -0,0 +1,3 @@ +# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload) +kaiwo: + enabled: true diff --git a/workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml b/workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml new file mode 100644 index 0000000..fac53d8 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml @@ -0,0 +1,50 @@ +### Model ### +modelName: "Qwen/Qwen2-7B-Instruct" + +### Data ### +dataset: "gsm8k" + +# Resources: +checkpointsReservedSize: 512Gi +storageClass: mlstorage +finetuningGpus: 2 +memoryPerGpu: 64 +cpusPerGpu: 8 + +### Model output path ### +checkpointsRemote: "default-bucket/experiments/Qwen2_7B_Instruct_PPO_gsm8k_verl" + +verlConfig: + data: + train_batch_size: 1024 + max_prompt_length: 1024 + max_response_length: 512 + actor_rollout_ref: + model: + use_remove_padding: True + enable_gradient_checkpointing: True + actor: + ppo_micro_batch_size_per_gpu: 16 + rollout: + log_prob_micro_batch_size_per_gpu: 40 + tensor_model_parallel_size: 2 + gpu_memory_utilization: 0.6 + ref: + log_prob_micro_batch_size_per_gpu: 40 + fsdp_config: + param_offload: True + critic: + optim: + lr: 1e-5 + model: + use_remove_padding: True + enable_gradient_checkpointing: True + ppo_micro_batch_size_per_gpu: 32 + fsdp_config: + param_offload: False + optimizer_offload: False + algorithm: + kl_ctrl: + kl_coef: 0.001 + trainer: + total_epochs: 10 diff --git a/workloads/llm-finetune-verl/helm/templates/configmap.yaml b/workloads/llm-finetune-verl/helm/templates/configmap.yaml new file mode 100644 index 0000000..89e0424 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/templates/configmap.yaml @@ -0,0 +1,119 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-configs" +data: + verl_config.yaml: | + # @package _global_ +{{ toYaml .Values.verlConfig | indent 4 }} + entrypoint.sh: | + #!/bin/bash + set -eu + # Print GPU Info: + rocm-smi + mkdir -p /workdir/checkpoints + mkdir -p /workdir/datasets + + echo "Installing MinIO:" + curl https://dl.min.io/client/mc/release/linux-amd64/mc \ + --create-dirs \ + -o /minio-binaries/mc + chmod +x /minio-binaries/mc + export PATH="${PATH}:/minio-binaries/" + # Setup MinIO + mc alias set minio-host $BUCKET_STORAGE_HOST $BUCKET_STORAGE_ACCESS_KEY $BUCKET_STORAGE_SECRET_KEY + {{- if .Values.modelRemote }} + # copy model from remote to local + echo "Downloading model from remote: {{ .Values.modelRemote }}" + mc cp --recursive \ + minio-host/{{ .Values.modelRemote | trimSuffix "/" }}/ \ + /workdir/basemodel + MODEL_PATH=/workdir/basemodel + {{- else if .Values.modelName }} + MODEL_PATH={{ .Values.modelName }} + {{- else }} + {{- fail "either modelName or modelRemote must be set" }} + {{- end }} + python3 -c "import transformers;transformers.pipeline('text-generation', model='$MODEL_PATH')" + + {{- if .Values.datasetRemote }} + echo "Downloading dataset from remote: {{ .Values.datasetRemote }}" + mc cp --recursive \ + minio-host/{{ .Values.datasetRemote | trimSuffix "/" }}/ \ + /workdir/datasets/{{ .Values.datasetRemote | trimSuffix "/" }} + DATASET_PATH=/workdir/datasets/{{ .Values.datasetRemote | trimSuffix "/" }} + {{- else if .Values.dataset }} + {{- if eq .Values.dataset "full_hh_rlhf" }} + python3 /app/examples/data_preprocess/{{ .Values.dataset }}.py --split rm --local_dir /workdir/datasets/{{ .Values.dataset }} + DATASET_PATH=/workdir/datasets/{{ .Values.dataset }}/rm + {{- else }} 
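+ # The other example datasets ("geo3k", "gsm8k", "hellaswag", "math_dataset") are preprocessed with their default split; only --local_dir is passed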
+ python3 /app/examples/data_preprocess/{{ .Values.dataset }}.py --local_dir /workdir/datasets/{{ .Values.dataset }} + DATASET_PATH=/workdir/datasets/{{ .Values.dataset }} + {{- end }} + {{- else }} + {{- fail "either dataset or datasetRemote must be set" }} + {{- end }} + + {{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} + {{- if .Values.checkpointsRemote }} + {{- if .Values.resumeFromCheckpoint }} + # Sync checkpoints from remote to local + if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then + echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote }} to /workdir/checkpoints' + ls -lah /workdir/checkpoints + RESUME_MODE='resume_path' + else + echo 'No checkpoints found yet' + RESUME_MODE='disable' + fi + {{- else }} + RESUME_MODE='disable' + {{- end }} + echo "Starting checkpoint sync process" + mc mirror \ + --watch \ + --overwrite \ + /workdir/checkpoints \ + {{ $checkpointsRemotePath }} & + uploadPID=$! + # Check if the sync process started successfully + sleep 1 + if ! ps -p $uploadPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 + fi + {{- end }} + + export HIP_VISIBLE_DEVICES=$(rocm-smi --showall --csv | grep -P '^card\d+,' | cut -d',' -f1 | sed 's/card//g' | paste -sd ',' -) + export NUM_GPUS=$(echo $HIP_VISIBLE_DEVICES | tr ',' '\n' | wc -l) + export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + + # copy config file into the verl directory, this is necessary to apply it as an override with hydra + mkdir -p /app/verl/trainer/config/override + cp /configs/verl_config.yaml /app/verl/trainer/config/override/helm.yaml + + echo "Starting training process" + python3 -m verl.trainer.main_ppo +override=helm \ + data.train_files=$DATASET_PATH/train.parquet \ + data.val_files=$DATASET_PATH/test.parquet \ + actor_rollout_ref.model.path=$MODEL_PATH \ + critic.model.path=$MODEL_PATH \ + trainer.n_gpus_per_node=$NUM_GPUS \ + trainer.project_name='{{ .Release.Name }}' \ + trainer.experiment_name='{{ .Release.Name }}' \ + trainer.default_local_dir=/workdir/checkpoints \ + trainer.resume_mode=${RESUME_MODE:-disable} \ + trainer.resume_from_path=/workdir/checkpoints + + {{- if .Values.checkpointsRemote }} + echo "Training done, stop the upload process" + kill $uploadPID + wait $uploadPID || true + # Once more to ensure everything gets uploaded + echo 'Training done, syncing once more...' + mc mirror --overwrite \ + /workdir/checkpoints \ + {{ $checkpointsRemotePath }} + {{- end }} + echo 'All done, exiting' diff --git a/workloads/llm-finetune-verl/helm/templates/job.yaml b/workloads/llm-finetune-verl/helm/templates/job.yaml new file mode 100644 index 0000000..3d3db12 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/templates/job.yaml @@ -0,0 +1,113 @@ +{{- define "job" -}} +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ .Release.Name }}-job" + {{- if .Values.labels }} + labels: + {{- range $label, $value := .Values.labels }} + {{ $label }}: {{ $value | quote }} + {{- end }} + {{- end }} +spec: + ttlSecondsAfterFinished: 3600 + backoffLimit: 0 + template: + spec: + restartPolicy: Never + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{- range .Values.imagePullSecrets }} + - name: {{ .
}} + {{- end }} + {{- end }} + containers: + - name: finetuning + image: "{{ .Values.finetuningImage }}" + imagePullPolicy: Always + env: + {{- if .Values.hfTokenSecret }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Values.hfTokenSecret.name }} + key: {{ .Values.hfTokenSecret.key }} + {{- end }} + # storage + - name: BUCKET_STORAGE_HOST + value: {{ .Values.bucketStorageHost }} + - name: BUCKET_STORAGE_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ .Values.bucketCredentialsSecret.name }} + key: {{ .Values.bucketCredentialsSecret.accessKeyKey }} + - name: BUCKET_STORAGE_SECRET_KEY + valueFrom: + secretKeyRef: + name: {{ .Values.bucketCredentialsSecret.name }} + key: {{ .Values.bucketCredentialsSecret.secretKeyKey }} + command: + - /configs/entrypoint.sh + resources: + limits: + memory: "{{ mul .Values.finetuningGpus .Values.memoryPerGpu }}Gi" + cpu: "{{ mul .Values.finetuningGpus .Values.cpusPerGpu }}" + amd.com/gpu: "{{ .Values.finetuningGpus }}" + requests: + memory: "{{ mul .Values.finetuningGpus .Values.memoryPerGpu }}Gi" + cpu: "{{ mul .Values.finetuningGpus .Values.cpusPerGpu }}" + amd.com/gpu: "{{ .Values.finetuningGpus }}" + volumeMounts: + - name: dshm # Increase SHM size for the container by mounting /dev/shm, for Pytorch parallel processing + mountPath: /dev/shm + - name: checkpoints + mountPath: /workdir/checkpoints + readOnly: false + - name: configs + mountPath: /configs + readOnly: true + volumes: + - name: dshm + emptyDir: + medium: Memory # equivalent to `docker run --shm-size=(total_memory/2)` + {{- if .Values.storageClass }} + - name: checkpoints + ephemeral: + volumeClaimTemplate: + spec: + accessModes: [ "ReadWriteOnce" ] + storageClassName: {{ .Values.storageClass }} + resources: + requests: + storage: "{{ .Values.checkpointsReservedSize }}" + {{- else }} + - name: checkpoints + emptyDir: + sizeLimit: "{{ .Values.checkpointsReservedSize }}" + {{- end }} + - name: configs + configMap: + name: "{{ .Release.Name }}-configs" + items: + - key: entrypoint.sh + path: entrypoint.sh + mode: 0777 + - key: verl_config.yaml + path: verl_config.yaml +{{- end -}} + +{{- define "job_wrapped_with_kaiwojob" -}} +apiVersion: kaiwo.silogen.ai/v1alpha1 +kind: KaiwoJob +metadata: + name: "{{ .Release.Name }}-job" +spec: + job: + {{- include "job" . | nindent 4 }} +{{- end -}} + +{{- if .Values.kaiwo.enabled -}} +{{- include "job_wrapped_with_kaiwojob" . }} +{{- else -}} +{{- include "job" . 
}} {{- end -}} diff --git a/workloads/llm-finetune-verl/helm/values.schema.json b/workloads/llm-finetune-verl/helm/values.schema.json new file mode 100644 index 0000000..348874c --- /dev/null +++ b/workloads/llm-finetune-verl/helm/values.schema.json @@ -0,0 +1,138 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "finetuningImage": { + "type": "string", + "description": "Container image for finetuning" + }, + "modelName": { + "type": "string", + "description": "Model path in HuggingFace" + }, + "modelRemote": { + "type": "string", + "description": "Model path in remote MinIO storage, format: bucketName/path/in/bucket" + }, + "dataset": { + "type": "string", + "description": "Name of data set to use for training" + }, + "datasetRemote": { + "type": "string", + "description": "Dataset path in remote MinIO storage, format: bucketName/path/in/bucket" + }, + "kaiwo": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "If true, use Kaiwo CRDs to have Kaiwo operator manage the workload", + "default": false + } + }, + "default": {} + }, + "labels": { + "type": "object", + "description": "Any labels to add for the manifest, recommended: kueue", + "additionalProperties": { + "type": "string" + }, + "default": {} + }, + "imagePullSecrets": { + "type": "array", + "description": "Any imagePullSecrets to use", + "items": { + "type": "string" + }, + "default": [] + }, + "bucketStorageHost": { + "type": "string", + "description": "The cloud storage host URL" + }, + "bucketCredentialsSecret": { + "type": "object", + "description": "Bucket storage credential secret values, required to have the secret already setup in the cluster (e.g. via external secrets)", + "properties": { + "name": { + "type": "string", + "description": "The name of the secret in the cluster that contains the bucket storage credentials", + "default": "minio-credentials" + }, + "accessKeyKey": { + "type": "string", + "description": "The key in the secret that contains the access key", + "default": "minio-access-key" + }, + "secretKeyKey": { + "type": "string", + "description": "The key in the secret that contains the secret key", + "default": "minio-secret-key" + } + } + }, + "checkpointsReservedSize": { + "type": "string", + "description": "How much space to reserve for model and data downloads" + }, + "storageClass": { + "type": [ + "string", + "null" + ], + "description": "Optionally set this to use a specific storageClass for the storage" + }, + "cpusPerGpu": { + "type": "integer", + "description": "How many CPUs to use, per GPU", + "default": 8, + "minimum": 1 + }, + "finetuningGpus": { + "type": "integer", + "description": "How many GPUs to use for finetuning", + "default": 1, + "minimum": 0 + }, + "memoryPerGpu": { + "type": "integer", + "description": "How much memory to use in GB, per GPU", + "default": 64 + }, + "checkpointsRemote": { + "type": "string", + "description": "Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket" + }, + "resumeFromCheckpoint": { + "type": "boolean", + "description": "If true, resume from the last checkpoint in checkpointsRemote (if available)", + "default": false + }, + "hfTokenSecret": { + "type": "object", + "description": "Optional secret reference that contains a HuggingFace token", + "properties": { + "name": { + "type": "string", + "description": "The name of the secret in the cluster that contains the HuggingFace token" + }, + "key": { + "type": "string",
"description": "The key in the secret that contains the HuggingFace token" + } + }, + "default": {} + }, + "verlConfig": { + "type": "object", + "description": "VeRL configurations to use" + } + }, + "required": [ + "finetuningImage", + "verlConfig" + ] +} diff --git a/workloads/llm-finetune-verl/helm/values.yaml b/workloads/llm-finetune-verl/helm/values.yaml new file mode 100644 index 0000000..60e0837 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/values.yaml @@ -0,0 +1,62 @@ +### General chart values ### +finetuningImage: rocm/verl:verl-0.3.0.post0_rocm6.2_vllm0.6.3 + +### Model ### +# either modelRemote OR modelName must be set +# to use a base model directly from Hugging Face, set modelName to the model identifier (e.g., "meta-llama/Llama-3.1-8B-Instruct") +modelName: "" +# for remote models to be loaded from MinIO, specify the path to the model in the remote bucket as modelRemote +modelRemote: "" + +### Data ### +# either dataset OR datasetRemote must be set +# to use one of the pre-existing datasets, set dataset to the dataset identifier (e.g., "gsm8k") +# available datasets: "full_hh_rlhf", "geo3k", "gsm8k", "hellaswag", "math_dataset" +dataset: "" +# for remote datasets to be loaded from MinIO, specify the path to the model in the remote bucket as datasetRemote +# Note: the dataset should be processed and stored in a format compatible with VeRL (train.parquet, test.parquet) +datasetRemote: "" + +# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload) +kaiwo: + enabled: false + +# Use to add labels to the metadata of the resources created by this workload. +labels: {} + +# Extra annotations such as an imagePullSecrets +imagePullSecrets: [] + # Example: + # imagePullSecrets: + # - "regcred" + +# Configure these to match the credentials in your cluster: +bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80 +bucketCredentialsSecret: + name: minio-credentials + accessKeyKey: minio-access-key + secretKeyKey: minio-secret-key + +# Resources: +checkpointsReservedSize: 512Gi +storageClass: mlstorage # set this to use a specific storageClass for the storage. +finetuningGpus: 1 +memoryPerGpu: 64 +cpusPerGpu: 8 + +### Model output path ### +checkpointsRemote: "" # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket +resumeFromCheckpoint: false # Set to true to resume from the last checkpoint in checkpointsRemote (if available) + +hfTokenSecret: {} # Optional secret reference that contains the HuggingFace token +# Example: +# hfTokenSecret: +# name: hf-token +# key: hf-token + +verlConfig: + trainer: + logger: ['console'] + test_freq: 10 + save_freq: 10 + total_epochs: 1 diff --git a/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh b/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh index 272f199..fe9d1a7 100644 --- a/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh +++ b/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh @@ -5,8 +5,14 @@ mkdir -p /workload/output curl https://dl.min.io/client/mc/release/linux-amd64/mc -o /workload/mc chmod +x /workload/mc /workload/mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY} -/workload/mc mirror --watch /workload/output/ minio-host/${BUCKET_RESULT_PATH} & +/workload/mc mirror --watch /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" & MINIOPID=$! 
+sleep 1 # Give some time for the process to start +# Check if the sync process started successfully +if ! ps -p $MINIOPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 +fi OPENAI_API_BASE_URL=${OPENAI_API_BASE_URL%/} MODEL=$(curl -s ${OPENAI_API_BASE_URL}/models | jq -r '.data[0].id') @@ -28,5 +34,5 @@ guidellm benchmark --target $OPENAI_API_BASE_URL \ echo -e "<==========================\nBenchmarking completed" kill $MINIOPID wait $MINIOPID || true -/workload/mc mirror /workload/output/ minio-host/${BUCKET_RESULT_PATH} +/workload/mc mirror /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" echo "All data uploaded successfully" diff --git a/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh b/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh index 6b465c8..24db12e 100644 --- a/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh +++ b/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh @@ -53,4 +53,4 @@ chmod +x /minio-binaries/mc export PATH="${PATH}:/minio-binaries/" mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY} -mc cp --recursive $OUTPATH minio-host/${BUCKET_RESULT_PATH}/ +mc cp --recursive $OUTPATH minio-host/"${BUCKET_RESULT_PATH}"/ diff --git a/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh b/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh index 4518a74..ac7c098 100644 --- a/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh +++ b/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh @@ -15,8 +15,14 @@ chmod +x $WORKPATH/bin/mc mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY} # Start a background process that watches for changes and uploads them -mc mirror --watch $WORKPATH/output/ minio-host/${BUCKET_RESULT_PATH} & +mc mirror --watch $WORKPATH/output/ minio-host/"${BUCKET_RESULT_PATH}" & MINIOPID=$! +sleep 1 # Give some time for the process to start +# Check if the sync process started successfully +if ! ps -p $MINIOPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 +fi bash $WORKPATH/mount/minio_download_models.sh @@ -53,5 +59,5 @@ kill $MINIOPID wait $MINIOPID || true # Run a final mirror command to ensure all data is uploaded -mc mirror $WORKPATH/output/ minio-host/${BUCKET_RESULT_PATH} +mc mirror $WORKPATH/output/ minio-host/"${BUCKET_RESULT_PATH}" echo 'All data uploaded successfully' diff --git a/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh b/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh index 2cfd716..81bf005 100644 --- a/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh +++ b/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh @@ -3,8 +3,14 @@ mkdir -p /workload/output curl https://dl.min.io/client/mc/release/linux-amd64/mc -o /workload/mc chmod +x /workload/mc /workload/mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY} -/workload/mc mirror --watch /workload/output/ minio-host/${BUCKET_RESULT_PATH} & +/workload/mc mirror --watch /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" & MINIOPID=$! # Capture the PID of the mc mirror process +sleep 1 # Give some time for the process to start +# Check if the sync process started successfully +if ! 
ps -p $MINIOPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 +fi echo "vLLM server started with PID: $SERVER_PID" ATTEMPT=0 @@ -62,5 +68,5 @@ done echo "Benchmarking completed" kill $MINIOPID wait $MINIOPID || true -/workload/mc mirror /workload/output/ minio-host/${BUCKET_RESULT_PATH} +/workload/mc mirror /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" echo "All data uploaded successfully" diff --git a/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml b/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml index ad7a277..b6ca364 100644 --- a/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml +++ b/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml @@ -30,10 +30,10 @@ spec: mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY}; echo "Listing contents of the model path:"; - mc ls minio-host/{{ .Values.remoteSourceModelPath | trimSuffix "/" }}/ || echo "Model path not found!"; + mc ls minio-host/'{{ .Values.remoteSourceModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/ || echo "Model path not found!"; echo "Copying model checkpoint to container..."; - mc cp -r minio-host/{{ .Values.remoteSourceModelPath | trimSuffix "/" }}/ /local-resources/sourcemodel || echo "Failed to copy model!"; + mc cp -r minio-host/'{{ .Values.remoteSourceModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/ /local-resources/sourcemodel || echo "Failed to copy model!"; echo "Listing contents of /local-resources/:"; ls -la /local-resources/ || echo "Local resources directory not found!"; @@ -72,7 +72,7 @@ spec: echo "Conversion done, syncing checkpoint artifacts to remote storage..."; mc mirror --overwrite \ - /local-resources/checkpoints/ minio-host/{{ .Values.remoteDestinationModelPath | trimSuffix "/" }}/; + /local-resources/checkpoints/ minio-host/'{{ .Values.remoteDestinationModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/; echo "Done uploading. 
Signal to the main container that it can exit."; diff --git a/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml b/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml index b64d515..217e3fe 100644 --- a/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml +++ b/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml @@ -26,18 +26,19 @@ spec: # Setup MinIO, Download resources: mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY}; echo "Copying data to container..."; - mc cp -r minio-host/{{ .Values.remoteDataDirPath | trimSuffix "/" }}/{{ .Values.remoteDataNamePrefix }} /local-resources/data; + mc cp -r minio-host/'{{ .Values.remoteDataDirPath | trimSuffix "/" | replace "'" "'\\''" }}'/'{{ .Values.remoteDataNamePrefix }}' /local-resources/data; echo "Copying tokenizer to container..."; - mc cp -r minio-host/{{ .Values.remoteTokenizerPath | trimSuffix "/" }}/ /local-resources/tokenizer; + mc cp -r minio-host/'{{ .Values.remoteTokenizerPath | trimSuffix "/" | replace "'" "'\\''" }}'/ /local-resources/tokenizer; echo "Copying model checkpoint to container..."; - if last_ckpt=$(mc cat minio-host/{{ .Values.remoteCheckpointsPath | trimSuffix "/" }}/latest_checkpointed_iteration.txt); then + {{- $remotePath := printf "minio-host/'%s'/" (.Values.remoteCheckpointsPath | trimSuffix "/" | replace "'" "'\\''") }} + if last_ckpt=$(mc cat {{ $remotePath }}latest_checkpointed_iteration.txt); then last_ckpt=$(printf 'iter_%07d' "$last_ckpt") echo "Found checkpoint at iteration $last_ckpt. Downloading ..." - mc mirror minio-host/{{ .Values.remoteCheckpointsPath | trimSuffix "/" }}/$last_ckpt/ /local-resources/basemodel/$last_ckpt - mc cp minio-host/{{ .Values.remoteCheckpointsPath | trimSuffix "/" }}/latest_checkpointed_iteration.txt /local-resources/basemodel/latest_checkpointed_iteration.txt + mc mirror {{ $remotePath }}$last_ckpt/ /local-resources/basemodel/$last_ckpt + mc cp {{ $remotePath }}latest_checkpointed_iteration.txt /local-resources/basemodel/latest_checkpointed_iteration.txt else echo "No checkpoints found yet. Downloading basemodel ..."
- mc cp -r minio-host/{{ .Values.remoteBaseModelPath | trimSuffix "/" }}/ /local-resources/basemodel; + mc cp -r minio-host/'{{ .Values.remoteBaseModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/ /local-resources/basemodel; fi resources: limits: diff --git a/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml b/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml index a6da3e6..7e1f0f7 100644 --- a/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml +++ b/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml @@ -86,10 +86,10 @@ spec: sleep 60 done - echo "Preprocessing done, syncing data to remote storage {{ .Values.bucketDataDir | trimSuffix "/" }}..."; - mc cp --recursive /downloads/datasets/ minio-host/{{ .Values.bucketDataDir | trimSuffix "/" }}/; + echo "Preprocessing done, syncing data to remote storage {{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}..."; + mc cp --recursive /downloads/datasets/ minio-host/'{{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'/; mc mirror --overwrite --exclude "**/.cache/*" \ - /downloads/tokenizer/ minio-host/{{ .Values.bucketTokenizersDir | trimSuffix "/" }}/; + /downloads/tokenizer/ minio-host/'{{ .Values.bucketTokenizersDir | trimSuffix "/" | replace "'" "'\\''" }}'/; echo "Done uploading. Signal to the main container that it can exit."; touch /downloads/done_uploading; resources: