diff --git a/docker/llm-evaluation/requirements.txt b/docker/llm-evaluation/requirements.txt
index ad6196a..f7ad356 100644
--- a/docker/llm-evaluation/requirements.txt
+++ b/docker/llm-evaluation/requirements.txt
@@ -3,5 +3,6 @@ dataclasses-json==0.6.7
evaluate==0.4.3
jsonlines==4.0.0
minio==7.2.15
+mlflow==3.1.0
openai==1.64.0
sentencepiece==0.2.0
diff --git a/docker/llm-evaluation/run_inference_and_judge_evaluation.py b/docker/llm-evaluation/run_inference_and_judge_evaluation.py
index e24dcf9..86a62e0 100644
--- a/docker/llm-evaluation/run_inference_and_judge_evaluation.py
+++ b/docker/llm-evaluation/run_inference_and_judge_evaluation.py
@@ -89,10 +89,8 @@ async def main(args: Namespace):
saved_results = []
parameters: dict = {}
- llm_url_no_protocol = args.llm_base_url.removeprefix("http://").removeprefix(
- "https://"
- ) # the Minio python client handles protocol itself
- client = get_llm_client(base_url=llm_url_no_protocol, port=args.llm_port, endpoint=args.llm_endpoint)
+
+ client = get_llm_client(base_url=args.llm_base_url, port=args.llm_port, endpoint=args.llm_endpoint)
async for inference_result in run_call_inference_container(
dataset=ds,
@@ -123,10 +121,7 @@ async def main(args: Namespace):
logger.info(inferences_data)
logger.info("Inference ran.")
- judge_url_no_protocol = args.judge_base_url.removeprefix("http://").removeprefix(
- "https://"
- ) # the Minio python client handles protocol itself
- judge_client = get_llm_client(base_url=judge_url_no_protocol, port=args.judge_port, endpoint=args.judge_endpoint)
+ judge_client = get_llm_client(base_url=args.judge_base_url, port=args.judge_port, endpoint=args.judge_endpoint)
aggregated_judge_results = AggregatedJudgeResults(
judge_results={},
diff --git a/docker/llm-evaluation/run_inference_and_metrics_evaluation.py b/docker/llm-evaluation/run_inference_and_metrics_evaluation.py
index 6973182..57e4f28 100644
--- a/docker/llm-evaluation/run_inference_and_metrics_evaluation.py
+++ b/docker/llm-evaluation/run_inference_and_metrics_evaluation.py
@@ -14,9 +14,9 @@
from llm_evaluation.call_inference_container.call_inference_container import (
save_inference_results,
)
-from llm_evaluation.metrics.run_metrics_evaluation import read_inference_data
+from llm_evaluation.metrics.run_metrics_evaluation import get_bert_score_distribution_graphs, read_inference_data
from llm_evaluation.metrics.run_metrics_evaluation import run as run_metrics_evaluation
-from llm_evaluation.metrics.utils import save_results
+from llm_evaluation.metrics.utils import log_metrics_in_mlflow, save_results
async def main(args: Namespace):
@@ -115,6 +115,20 @@ async def main(args: Namespace):
eval_results = run_metrics_evaluation(data)
+ distribution_graphs = get_bert_score_distribution_graphs(
+ scores=eval_results.scores,
+ )
+
+ if args.mlflow_server_uri:
+ logger.info("Logging results to MLFlow...")
+ log_metrics_in_mlflow(
+ distribution_graphs,
+ eval_results.scores,
+ mlflow_server_uri=args.mlflow_server_uri,
+ mlflow_experiment_name=args.mlflow_experiment_name,
+ mlflow_run_name=args.mlflow_run_name,
+ )
+
logger.info("Evaluation results:")
logger.info(eval_results)
diff --git a/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py b/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py
index e78c078..6cc0b9d 100644
--- a/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py
+++ b/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py
@@ -12,7 +12,11 @@ def get_inference_parser() -> ArgumentParser:
parser.add_argument("-p", "--llm-port", type=str, default="8080", help="Port number of the LLM service.")
parser.add_argument("-e", "--llm-endpoint", type=str, default="v1", help="Endpoint of the LLM service.")
parser.add_argument(
- "-d", "--evaluation-dataset", type=str, default="abisee/cnn_dailymail", help="Name of the evaluation dataset."
+ "-d",
+ "--evaluation-dataset-name",
+ type=str,
+ default="abisee/cnn_dailymail",
+ help="Name of the evaluation dataset.",
)
parser.add_argument(
"-v", "--evaluation-dataset-version", type=str, default="3.0.0", help="Version of the evaluation dataset."
@@ -65,6 +69,24 @@ def get_inference_parser() -> ArgumentParser:
default="/home/evaluation/example_prompts/example_summary_prompt.txt",
help="Path to the prompt template file.",
)
+ parser.add_argument(
+ "--mlflow-server-uri",
+ type=str,
+ default="", # leave this argument empty to disable MLFlow tracking
+ help="MLFlow server URI for tracking.",
+ )
+ parser.add_argument(
+ "--mlflow-experiment-name",
+ type=str,
+ default="llm-evaluation-experiment",
+ help="MLFlow experiment name for tracking.",
+ )
+ parser.add_argument(
+ "--mlflow-run-name",
+ type=str,
+ default="llm-evaluation-run",
+ help="MLFlow run name for tracking.",
+ )
return parser
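Because `--mlflow-server-uri` defaults to an empty string, MLflow logging stays opt-in. A minimal sketch of the gating, assuming the parser defines no required arguments beyond those shown (the URI below is hypothetical):

```python
from llm_evaluation.argument_parsers import get_inference_parser

args = get_inference_parser().parse_args([])  # no MLflow flags passed
assert not args.mlflow_server_uri             # falsy -> logging is skipped

args = get_inference_parser().parse_args(
    ["--mlflow-server-uri", "http://mlflow.example:8082"]  # hypothetical URI
)
assert args.mlflow_server_uri                 # truthy -> metrics get logged
```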
diff --git a/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py b/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py
index b5d5e19..ebdba31 100644
--- a/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py
+++ b/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py
@@ -10,10 +10,12 @@
@dataclass_json
@dataclass
class EvaluationScores:
- precision_bert: float
- recall_bert: float
- f1_bert: float
- f1_list: List[float]
+ precision_avg_bert: float
+ recall_avg_bert: float
+ f1_avg_bert: float
+ precision_list_bert: List[float]
+ recall_list_bert: List[float]
+ f1_list_bert: List[float]
bleu_score: float
accuracy: float
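To illustrate the reshaped dataclass, a hedged sketch populating it from made-up per-sample BERTScore lists, mirroring the rounding now done in `compute_scores`:

```python
import numpy as np
from llm_evaluation.data.data_classes import EvaluationScores

# Made-up per-sample values purely for illustration.
precision = [0.92, 0.88]
recall = [0.81, 0.85]
f1 = [0.86, 0.87]

scores = EvaluationScores(
    precision_avg_bert=round(float(np.average(precision)), 4),
    recall_avg_bert=round(float(np.average(recall)), 4),
    f1_avg_bert=round(float(np.average(f1)), 4),
    precision_list_bert=precision,
    recall_list_bert=recall,
    f1_list_bert=f1,
    bleu_score=0.31,  # made up
    accuracy=0.0,     # made up
)
```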
diff --git a/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py b/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py
index 3b8ba5a..76797e4 100644
--- a/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py
+++ b/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py
@@ -7,7 +7,7 @@
def compute_bertscore(
predictions: List[str], references: List[str], language: str = "en"
-) -> Tuple[float, float, float, List[float]]:
+) -> Tuple[List[float], List[float], List[float]]:
"""
Computes the BERTScore for a set of predictions and references.
@@ -32,13 +32,7 @@ def compute_bertscore(
recall_list = convert_negatives_to_zero(array=np.array(results["recall"]))
f1_list = convert_negatives_to_zero(array=np.array(results["f1"]))
- precision_bert = round(np.average(precision_list), 4)
- recall_bert = round(np.average(recall_list), 4)
- f1_bert = round(np.average(f1_list), 4)
-
- f1_list = [round(f1, 4) for f1 in f1_list]
-
- return precision_bert, recall_bert, f1_bert, f1_list
+ return precision_list, recall_list, f1_list
def compute_exact_match(
diff --git a/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py b/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
index 7d0985d..f2070b9 100644
--- a/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
+++ b/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
@@ -6,6 +6,9 @@
from typing import Any, Dict, List
import jsonlines
+import matplotlib.pyplot as plt
+import mlflow
+import numpy as np
from llm_evaluation import logger
from llm_evaluation.argument_parsers import get_metrics_parser
from llm_evaluation.data.data_classes import EvaluationResults, EvaluationScores
@@ -28,7 +31,13 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS
bert_score_start_time = time.time()
- precision_bert, recall_bert, f1_bert, f1_list = compute_bertscore(predictions=predictions, references=references)
+ precision_list_bert, recall_list_bert, f1_list_bert = compute_bertscore(
+ predictions=predictions, references=references
+ )
+
+ precision_avg_bert = round(np.average(precision_list_bert), 4)
+ recall_avg_bert = round(np.average(recall_list_bert), 4)
+ f1_avg_bert = round(np.average(f1_list_bert), 4)
logger.info(f"BERT-score computation took {time.time() - bert_score_start_time:.2f} seconds")
@@ -45,15 +54,53 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS
logger.info(f"Exact match computation took {time.time() - exact_match_start_time:.2f} seconds")
return EvaluationScores(
- precision_bert=precision_bert,
- recall_bert=recall_bert,
- f1_bert=f1_bert,
- f1_list=f1_list,
+ precision_avg_bert=precision_avg_bert,
+ recall_avg_bert=recall_avg_bert,
+ f1_avg_bert=f1_avg_bert,
+ precision_list_bert=precision_list_bert,
+ recall_list_bert=recall_list_bert,
+ f1_list_bert=f1_list_bert,
bleu_score=bleu_score,
accuracy=accuracy,
)
+def get_bert_score_distribution_graphs(scores: EvaluationScores) -> Dict[str, str]:
+ """
+ Generate PNG images of the distributions of BERTScore precision, recall, and F1,
+ each with the mean value marked.
+
+    Args:
+        scores (EvaluationScores): Evaluation scores holding the per-sample
+            BERTScore precision, recall, and F1 lists.
+
+    Returns:
+        dict: Dictionary with keys 'precision', 'recall', 'f1', each mapping
+            to the path of the saved PNG image.
+ """
+ results = {}
+ metrics = [
+ ("precision", scores.precision_list_bert),
+ ("recall", scores.recall_list_bert),
+ ("f1", scores.f1_list_bert),
+ ]
+ for name, values in metrics:
+ fig, ax = plt.subplots()
+ values = np.array(values)
+ mean_val = np.mean(values)
+ ax.hist(values, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
+ ax.axvline(mean_val, color="red", linestyle="dashed", linewidth=2, label=f"Mean: {mean_val:.4f}")
+ ax.set_title(f"BERTScore {name.capitalize()} Distribution")
+ ax.set_xlabel(name.capitalize())
+ ax.set_ylabel("Frequency")
+ ax.legend()
+ plt.tight_layout()
+ plt.savefig(f"{name}_distribution.png", format="png")
+ plt.close(fig)
+ results[name] = f"{name}_distribution.png"
+ return results
+
+
def read_inference_data(input_path: str) -> List[Dict[str, Any]]:
"""
Reads inference data from a file or directory containing JSON/JSONL files.
diff --git a/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py b/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
index 23fa6f2..84c917f 100644
--- a/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
+++ b/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
@@ -4,13 +4,14 @@
from typing import Any, Dict, List
import jsonlines
+import mlflow
+import numpy as np
from llm_evaluation import logger
from llm_evaluation.data.data_classes import AggregatedJudgeResults, EvaluationResults
from minio import Minio, S3Error
-from numpy import ndarray
-def convert_negatives_to_zero(array: ndarray) -> ndarray:
+def convert_negatives_to_zero(array: np.ndarray) -> np.ndarray:
"""Converts all negative values in an array to zero.
Args:
@@ -129,3 +130,40 @@ def read_jsonl_data(input_file_path: str) -> List[Dict[str, Any]]:
for line in reader.iter(type=dict, skip_invalid=True):
generations.append(line)
return generations
+
+
+def log_metrics_in_mlflow(distribution_graphs, scores, mlflow_server_uri, mlflow_experiment_name, mlflow_run_name):
+    """Log the averaged BERTScore metrics and the distribution graph PNGs to an
+    MLflow tracking server, creating the experiment if it does not exist."""
+    logger.info(f"Using MLflow tracking URI: {mlflow_server_uri}")
+
+ experiment_description = "Evaluation of LLM using BERTScore metric."
+
+ experiment_tags = {
+ "project_name": mlflow_experiment_name,
+ "mlflow.note.content": experiment_description,
+ }
+
+ client = mlflow.MlflowClient(tracking_uri=mlflow_server_uri)
+
+ # Create the Experiment, providing a unique name
+ try:
+ test_experiment = client.create_experiment(name=mlflow_experiment_name, tags=experiment_tags)
+ logger.info(f"Created experiment with ID: {test_experiment}")
+    except mlflow.exceptions.MlflowException:
+ # If the experiment already exists, retrieve its ID
+ logger.warning(f"Experiment '{mlflow_experiment_name}' already exists. Using existing experiment.")
+ test_experiment = client.get_experiment_by_name(mlflow_experiment_name).experiment_id
+ logger.info(f"Using existing experiment with ID: {test_experiment}")
+
+ mlflow.set_tracking_uri(mlflow_server_uri)
+ mlflow.set_experiment(experiment_name=mlflow_experiment_name)
+ with mlflow.start_run(run_name=mlflow_run_name, experiment_id=test_experiment) as run:
+        # The averaged scores are run-level scalars; log them once rather than
+        # once per graph (the old loop also concatenated the graph name onto
+        # the metric key, producing names like "bert_score_mean_f1precision").
+        mlflow.log_metric("bert_score_mean_precision", scores.precision_avg_bert)
+        mlflow.log_metric("bert_score_mean_recall", scores.recall_avg_bert)
+        mlflow.log_metric("bert_score_mean_f1", scores.f1_avg_bert)
+        for file in distribution_graphs.values():
+            logger.info(
+                f"Saving artifact {file} (abs path: {os.path.abspath(file)}) to MLflow run {run.info.run_id}..."
+            )
+            mlflow.log_artifact(os.path.abspath(file), artifact_path="metrics_distributions")
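To sanity-check what a run logged, MLflow's search API can be queried afterwards; a sketch using the chart's example server URI and the parser's default experiment name (both assumptions, not fixed endpoints):

```python
import mlflow

mlflow.set_tracking_uri("http://10.242.3.198:8082")  # example value from values.yaml
runs = mlflow.search_runs(experiment_names=["llm-evaluation-experiment"])
# Each scalar logged with mlflow.log_metric appears as a metrics.* column.
print(runs[["run_id", "metrics.bert_score_mean_f1"]])
```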
diff --git a/docker/logistics/requirements.txt b/docker/logistics/requirements.txt
index 7405971..46ecaaa 100644
--- a/docker/logistics/requirements.txt
+++ b/docker/logistics/requirements.txt
@@ -4,3 +4,4 @@ google-cloud-storage
hf_transfer
huggingface_hub[cli]
minio
+wandb
diff --git a/docs/contributing.md b/docs/contributing.md
index 870eaff..94e1f5e 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -18,6 +18,56 @@ Thank you for considering contributing to the SiloGen AI Workloads development!
# install packages you need
```
+### Pre-commit setup
+
+We use [pre-commit](https://pre-commit.com/) for consistent formatting and cleaner code. Hooks are specified in `ai-workloads-dev/.pre-commit-config.yaml`.
+
+To install:
+`cd ai-workloads-dev` (this is necessary because `pre-commit install` operates on the current git repository)
+`source your_venv`
+`pip install pre-commit`
+`pre-commit install --config .pre-commit-config.yaml`
+`git commit -m "test commit"`
+
+With the final command, pre-commit should run automatically, with output something like the following:
+
+    check json...........................................(no files to check)Skipped
+ check yaml...........................................(no files to check)Skipped
+ fix end of files.....................................(no files to check)Skipped
+ fix requirements.txt.................................(no files to check)Skipped
+ trim trailing whitespace.............................(no files to check)Skipped
+ black................................................(no files to check)Skipped
+ flake8...............................................(no files to check)Skipped
+ isort (python).......................................(no files to check)Skipped
+ mypy.................................................(no files to check)Skipped
+ helmlint.............................................(no files to check)Skipped
+
+It's also possible to run pre-commit manually with:
+
+`pre-commit run --all-files`
+
+#### Troubleshooting pre-commit
+
+Many pre-commit problems come from having the wrong pre-commit installation active. Stale copies can linger as a system-wide install, inside Python venvs, or in the pre-commit cache.
+
+It's easiest to use pre-commit from a Python virtual environment. To check that the right pre-commit is being found, run `which pre-commit` and confirm that it points inside your venv, for example `/../../venvs/your_venv/bin/pre-commit`. A different path indicates that your system is picking up the wrong pre-commit install.
+
+
+To remove a stale system-wide install:
+`brew uninstall pre-commit` (macOS)
+`sudo apt remove pre-commit` (Linux)
+
+To remove it from a venv:
+`pip uninstall pre-commit`
+
+To uninstall only the pre-commit hooks and clear the cache:
+`pre-commit uninstall`
+`pre-commit clean`
+
+Then reinstall pre-commit from scratch as described above.
+
## Development Workflow
1. Create a branch for your feature or bugfix:
diff --git a/workloads/dev-workspace-jupyterlab/helm/values.yaml b/workloads/dev-workspace-jupyterlab/helm/values.yaml
index 2ac6519..9c238b6 100644
--- a/workloads/dev-workspace-jupyterlab/helm/values.yaml
+++ b/workloads/dev-workspace-jupyterlab/helm/values.yaml
@@ -47,8 +47,17 @@ entrypoint: |
pip install pipx ipykernel
pipx install --include-deps jupyter
pipx inject --include-deps jupyter jupyterlab-lsp 'python-lsp-server[all]' ipywidgets jupyterlab-git jupyterlab_code_formatter
- python -m ipykernel install --user --name=default-python3
- jupyter-lab --ServerApp.token='' --ServerApp.ip='0.0.0.0' --ServerApp.allow_root=True --ServerApp.base_url=$BASE_URL --no-browser --ServerApp.root_dir='/workload'
+ python -m ipykernel install --user --name=default-python3 --display-name="Python 3 (default)"
+
+ jupyter-lab --no-browser \
+ --IdentityProvider.token='' \
+ --ServerApp.ip='0.0.0.0' \
+ --ServerApp.allow_root=True \
+ --ServerApp.base_url=$BASE_URL \
+ --ServerApp.root_dir='/workload' \
+ --MultiKernelManager.default_kernel_name=default-python3 \
+ --KernelSpecManager.allowed_kernelspecs=default-python3 \
+ --KernelSpecManager.ensure_native_kernel=False
# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload)
kaiwo:
diff --git a/workloads/download-data-to-bucket/helm/templates/job.yaml b/workloads/download-data-to-bucket/helm/templates/job.yaml
index f232a17..2dd6cc7 100644
--- a/workloads/download-data-to-bucket/helm/templates/job.yaml
+++ b/workloads/download-data-to-bucket/helm/templates/job.yaml
@@ -33,8 +33,8 @@ spec:
mkdir -p /downloads/datasets
python /scripts/data_script.py
########################
- echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" }}'
- mc cp -recursive /downloads/datasets/ minio-host/{{ .Values.bucketDataDir | trimSuffix "/" }}/
+ echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'
+ mc cp -recursive /downloads/datasets/ minio-host/'{{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'/
########################
echo 'Done'
env:
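The added `replace "'" "'\\''"` filter is the standard POSIX idiom for embedding a single quote inside a single-quoted shell string (close the quote, emit an escaped quote, reopen). A minimal Python sketch of the same transformation:

```python
def posix_single_quote(value: str) -> str:
    # it's-a-path  ->  'it'\''s-a-path'
    return "'" + value.replace("'", "'\\''") + "'"

print(posix_single_quote("bucket/it's-a-path"))  # 'bucket/it'\''s-a-path'
```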
diff --git a/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml b/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml
index 7514746..b91d2b9 100644
--- a/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml
+++ b/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml
@@ -47,13 +47,14 @@ spec:
{{- end }}
--local-dir local_models/downloaded_model
###################################
- echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" }}'
+ echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" | replace "'" "'\\''" }}'
+ {{- $remotePath := printf "minio-host/'%s'/" (.Values.bucketPath | trimSuffix "/" | replace "'" "'\\''") }}
mc mirror --exclude '.cache/huggingface/*' \
--exclude '.gitattributes' \
{{- if .Values.allowOverwrite }}
--overwrite \
{{- end }}
- local_models/downloaded_model/ minio-host/{{ .Values.bucketPath | trimSuffix "/" }}
+ local_models/downloaded_model/ {{ $remotePath }}
env:
{{- if .Values.hfTokenSecret }}
- name: HF_TOKEN
diff --git a/workloads/download-wandb-model-to-bucket/helm/Chart.yaml b/workloads/download-wandb-model-to-bucket/helm/Chart.yaml
new file mode 100644
index 0000000..557e352
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/Chart.yaml
@@ -0,0 +1,4 @@
+apiVersion: v2
+name: download-wandb-model-to-bucket
+description: A Helm chart for downloading a Weights and Biases model to a bucket
+version: 0.0.1
diff --git a/workloads/download-wandb-model-to-bucket/helm/README.md b/workloads/download-wandb-model-to-bucket/helm/README.md
new file mode 100644
index 0000000..1d6519c
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/README.md
@@ -0,0 +1,14 @@
+# Download a model from Weights and Biases to bucket storage
+
+This is a workload that downloads a model from Weights and Biases and uploads it to bucket storage.
+
+Run example:
+```bash
+helm template "dl-from-wandb" workloads/download-wandb-model-to-bucket/helm \
+ -f workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml \
+ | kubectl create -f -
+```
+
+## User input values
+
+See the `values.yaml` file for the user input values that you can provide, with instructions.
diff --git a/workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml b/workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml
new file mode 100644
index 0000000..337ad4d
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml
@@ -0,0 +1,18 @@
+# Which model to download
+artifactPath: test-proj-1/test-model-2
+
+# Where the resources should be stored:
+bucketPath: default-bucket/models/examples/tiny-random-test-model-2
+bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
+
+# Download & Upload configuration:
+allowOverwrite: false
+
+# Storage configuration:
+storageClass: mlstorage
+storageQuantity: "20Gi"
+
+# Weights and Biases token:
+wandbTokenSecret:
+ name: wandb-token
+ key: wandb-token
diff --git a/workloads/download-wandb-model-to-bucket/helm/templates/job.yaml b/workloads/download-wandb-model-to-bucket/helm/templates/job.yaml
new file mode 100644
index 0000000..52bd8f8
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/templates/job.yaml
@@ -0,0 +1,103 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: "{{ .Release.Name }}-job"
+ {{- if .Values.labels }}
+ labels:
+ {{- range $label, $value := .Values.labels }}
+ {{ $label }}: {{ $value | quote }}
+ {{- end }}
+ {{- end }}
+spec:
+ ttlSecondsAfterFinished: 3600
+ backoffLimit: 0
+ template:
+ spec:
+ restartPolicy: Never
+ {{- if .Values.imagePullSecrets }}
+ imagePullSecrets:
+ {{- range .Values.imagePullSecrets }}
+ - name: {{ . }}
+ {{- end }}
+ {{- end }}
+ containers:
+      - name: wandb-to-bucket
+ image: {{ .Values.image }}
+ imagePullPolicy: Always
+ workingDir: /app
+ command:
+ - sh
+ - -e
+ - -u
+ - -c
+ args:
+ - |
+ ###################################
+ echo 'Setting up minio'
+ mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY}
+ ###################################
+ echo 'Downloading the artifact from wandb to the container'
+ {{- $safeArtifactPath := printf "'%s'" (.Values.artifactPath | replace "'" "'\\''") }}
+ wandb artifact get --type {{ .Values.artifactType }} {{ $safeArtifactPath }} --root local_artifact
+ ###################################
+ echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" | replace "'" "'\\''" }}'
+ {{- $remotePath := printf "minio-host/'%s'/" (.Values.bucketPath | trimSuffix "/" | replace "'" "'\\''") }}
+ mc mirror \
+ {{- if .Values.allowOverwrite }}
+ --overwrite \
+ {{- end }}
+ local_artifact/ {{ $remotePath }}
+ env:
+ - name: WANDB_API_KEY
+ valueFrom:
+ secretKeyRef:
+ name: {{ .Values.wandbTokenSecret.name }}
+ key: {{ .Values.wandbTokenSecret.key }}
+ - name: BUCKET_STORAGE_HOST
+ value: {{ .Values.bucketStorageHost }}
+ - name: BUCKET_STORAGE_ACCESS_KEY
+ valueFrom:
+ secretKeyRef:
+ name: {{ .Values.bucketCredentialsSecret.name }}
+ key: {{ .Values.bucketCredentialsSecret.accessKeyKey }}
+ - name: BUCKET_STORAGE_SECRET_KEY
+ valueFrom:
+ secretKeyRef:
+ name: {{ .Values.bucketCredentialsSecret.name }}
+ key: {{ .Values.bucketCredentialsSecret.secretKeyKey }}
+ resources:
+ requests:
+ memory: 1Gi
+ cpu: 1
+ limits:
+ memory: 1Gi
+ cpu: 1
+ volumeMounts:
+ - mountPath: /app
+ name: {{ .Release.Name }}-volume
+ securityContext:
+ allowPrivilegeEscalation: false
+ runAsNonRoot: true
+ runAsUser: 1000
+ runAsGroup: 1000
+ seccompProfile:
+ type: RuntimeDefault
+ capabilities:
+ drop: ["ALL"]
+ securityContext:
+ fsGroup: 1000
+ volumes:
+ - name: {{ .Release.Name }}-volume
+ {{- if .Values.storageClass }}
+ ephemeral:
+ volumeClaimTemplate:
+ spec:
+ accessModes: [ "ReadWriteOnce" ]
+ storageClassName: {{ .Values.storageClass }}
+ resources:
+ requests:
+ storage: "{{ .Values.storageQuantity }}"
+ {{- else }}
+ emptyDir:
+ sizeLimit: "{{ .Values.storageQuantity }}"
+ {{- end }}
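For reference, the `wandb artifact get` step in the entrypoint corresponds roughly to this use of the wandb Python API (the artifact name and type are the chart's example values, and the `:latest` alias is an assumption):

```python
import wandb

api = wandb.Api()  # reads WANDB_API_KEY from the environment, as the job does
artifact = api.artifact("test-proj-1/test-model-2:latest", type="model")
artifact.download(root="local_artifact")
```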
diff --git a/workloads/download-wandb-model-to-bucket/helm/values.yaml b/workloads/download-wandb-model-to-bucket/helm/values.yaml
new file mode 100644
index 0000000..993dd0e
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/values.yaml
@@ -0,0 +1,34 @@
+### General chart values ###
+image: ghcr.io/silogen/logistics:v0.2
+
+# Use to add labels to the metadata of the resources created by this workload.
+labels: {}
+ # Example:
+ # labels:
+ # kaiwo.silogen.ai/managed: "true"
+
+# Extra annotations such as an imagePullSecrets
+imagePullSecrets: []
+ # Example:
+ # imagePullSecrets:
+ # - "regcred"
+
+# Configure these to match the credentials in your cluster:
+bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
+bucketCredentialsSecret:
+ name: minio-credentials
+ accessKeyKey: minio-access-key
+ secretKeyKey: minio-secret-key
+
+# Secret reference that contains the Weights and Biases token
+wandbTokenSecret:
+ name: wandb-token
+ key: wandb-token
+
+# Inputs:
+artifactPath: "" # wandb artifact path which is in the format of project/artifact-name
+artifactType: model # wandb artifact type, e.g. model or dataset
+bucketPath: "" # Path in the bucket storage where this model should be stored. In the format bucket-name/path/separated/by/slashes/name-for-resulting-directory
+allowOverwrite: false # Optionally set to true to allow overwriting existing files in the bucket
+storageQuantity: 64Gi # How much space needs to be allocated to store the model in the container (before pushing to bucket storage).
+storageClass: mlstorage # Set this to use a specific storageClass for the storage. If not specified, an emptyDir with the given size limit is used instead.
diff --git a/workloads/llm-evaluation-judge/helm/overrides/prometheus-Qwen2_5_3B_instruct-cnn_dailymail.yaml b/workloads/llm-evaluation-judge/helm/overrides/Qwen2_5_3B_instruct-llama-3.2-3B.yaml
similarity index 75%
rename from workloads/llm-evaluation-judge/helm/overrides/prometheus-Qwen2_5_3B_instruct-cnn_dailymail.yaml
rename to workloads/llm-evaluation-judge/helm/overrides/Qwen2_5_3B_instruct-llama-3.2-3B.yaml
index 85baf06..b6b91c5 100644
--- a/workloads/llm-evaluation-judge/helm/overrides/prometheus-Qwen2_5_3B_instruct-cnn_dailymail.yaml
+++ b/workloads/llm-evaluation-judge/helm/overrides/Qwen2_5_3B_instruct-llama-3.2-3B.yaml
@@ -1,4 +1,4 @@
-# Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset
+# Overrides file for running judge evaluation, using Llama-3.2-3B-Instruct to judge Qwen2.5-3B-Instruct on the default dataset
general:
job_name: judge-job-3container-qwen
model_inference_container:
diff --git a/workloads/llm-evaluation-judge/helm/overrides/llama-3.2-3B.yaml b/workloads/llm-evaluation-judge/helm/overrides/llama-3.2-3B.yaml
new file mode 100644
index 0000000..3f48ea0
--- /dev/null
+++ b/workloads/llm-evaluation-judge/helm/overrides/llama-3.2-3B.yaml
@@ -0,0 +1,14 @@
+# Overrides file for running judge evaluation, using Llama-3.2-3B-Instruct to judge Llama-3.2-3B-Instruct on the default dataset
+general:
+ job_name: judge-job-s3-llama-3.2-3B
+model_inference_container:
+ image: rocm/vllm-dev:nightly_main_20250430
+ model: Llama-3.2-3B-Instruct
+ model_path: hf://meta-llama/Llama-3.2-3B-Instruct
+judge_inference_container:
+ image: rocm/vllm-dev:nightly_main_20250430
+ model: Llama-3.2-3B-Instruct
+ model_path: s3://default-bucket/models/meta-llama/Llama-3.2-3B-Instruct
+judge_evaluation_container:
+ image: ghcr.io/silogen/evaluation-workloads-metrics:v0.1
+ use_data_subset: 0
diff --git a/workloads/llm-evaluation-judge/helm/overrides/prometheus-llama_3_8b-cnn_dailymail.yaml b/workloads/llm-evaluation-judge/helm/overrides/prometheus-llama_3_8b-cnn_dailymail.yaml
deleted file mode 100644
index 4738fcf..0000000
--- a/workloads/llm-evaluation-judge/helm/overrides/prometheus-llama_3_8b-cnn_dailymail.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-# Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset
-general:
- job_name: judge-job-minio
-model_inference_container:
- image: rocm/vllm-dev:20241205-tuned
- model: llama-3.2-3B
- model_path: hf://meta-llama/Llama-3.2-3B-Instruct
-judge_inference_container:
- model: llama-3.2-3B
- model_path: s3://default-bucket/models/meta-llama/Llama-3.2-3B-Instruct
-judge_evaluation_container:
- image: ghcr.io/silogen/evaluation-workloads-metrics:v0.1
- use_data_subset: 5
diff --git a/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml b/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml
index 7d15fec..6c41581 100644
--- a/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml
+++ b/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml
@@ -144,16 +144,6 @@ spec:
requests:
memory: "{{ .Values.judge_evaluation_container.memory }}"
cpu: "{{ .Values.judge_evaluation_container.cpu_count }}"
- startupProbe:
- exec:
- command:
- - sh
- - -c
- - |
- curl -sf http://localhost:8080/health && curl -sf http://localhost:8081/health
- initialDelaySeconds: 60
- periodSeconds: 10
- failureThreshold: 30
command: ["sh", "-c"]
args:
- |
diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml
index b2d0fc2..52cb1b1 100644
--- a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml
+++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml
@@ -1,5 +1,5 @@
model_inference_container:
- image: rocm/vllm-dev:20241205-tuned
+ image: rocm/vllm-dev:nightly_main_20250430
evaluation_container:
image: ghcr.io/silogen/evaluation-workloads-metrics:v0.1
dataset_path: FiscalNote/billsum
diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml
index 2f220e0..ab463b3 100644
--- a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml
+++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml
@@ -1,6 +1,6 @@
# Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset
model_inference_container:
- image: rocm/vllm-dev:20241205-tuned
+ image: rocm/vllm-dev:nightly_main_20250430
model: Llama-3.2-3B-Instruct
model_path: meta-llama/Llama-3.2-3B-Instruct
evaluation_container:
diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml
index 85de2bd..3e39cd3 100644
--- a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml
+++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml
@@ -1,6 +1,6 @@
# Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset
model_inference_container:
- image: rocm/vllm-dev:20241205-tuned
+ image: rocm/vllm-dev:nightly_main_20250430
model: Llama-3.2-3B-Instruct
model_path: meta-llama/Llama-3.2-3B-Instruct
evaluation_container:
diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml
new file mode 100644
index 0000000..97d79b8
--- /dev/null
+++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml
@@ -0,0 +1,12 @@
+model_inference_container:
+ image: rocm/vllm-dev:nightly_main_20250430
+ model: Llama-3.2-3B-Instruct
+ model_path: meta-llama/Llama-3.2-3B-Instruct
+evaluation_container:
+ image: ghcr.io/silogen/evaluation-workloads-metrics-debug:v0.1
+ use_data_subset: 3
+storage:
+ mlflow:
+ server_uri: http://10.242.3.71:8082
+ experiment_name: metrics-demo-experiment
+ run_name: metrics-demo-run
diff --git a/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml b/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml
index a4979d8..a4178b6 100644
--- a/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml
+++ b/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml
@@ -83,7 +83,7 @@ spec:
echo "Running evaluation:\nDownloading Dataset, Running inference, Evaluating inferences with bertscore...";
python3 run_inference_and_metrics_evaluation.py \
--llm-base-url="http://localhost" \
- --evaluation-dataset="{{ .Values.evaluation_container.dataset_path }}" \
+ --evaluation-dataset-name="{{ .Values.evaluation_container.dataset_path }}" \
--evaluation-dataset-version="{{ .Values.evaluation_container.dataset_version }}" \
--dataset-split="{{ .Values.evaluation_container.dataset_split }}" \
--prompt-template-path="{{ .Values.evaluation_container.prompt_template_path }}" \
@@ -95,7 +95,10 @@ spec:
--context-column-name="{{ .Values.evaluation_container.dataset_info.context_column_name}}" \
--id-column-name="{{ .Values.evaluation_container.dataset_info.id_column_name}}" \
--gold-standard-column-name="{{ .Values.evaluation_container.dataset_info.gold_standard_column_name}}" \
- --use-data-subset="{{ .Values.evaluation_container.use_data_subset}}" ;
+ --use-data-subset="{{ .Values.evaluation_container.use_data_subset}}" \
+ --mlflow-server-uri="{{ .Values.storage.mlflow.server_uri }}" \
+ --mlflow-experiment-name="{{ .Values.storage.mlflow.experiment_name }}" \
+ --mlflow-run-name="{{ .Values.storage.mlflow.run_name }}" ;
env:
- name: TRANSFORMERS_CACHE
value: /HF_HOME
diff --git a/workloads/llm-evaluation-metrics/helm/values.yaml b/workloads/llm-evaluation-metrics/helm/values.yaml
index 7338287..e146c0e 100644
--- a/workloads/llm-evaluation-metrics/helm/values.yaml
+++ b/workloads/llm-evaluation-metrics/helm/values.yaml
@@ -33,3 +33,7 @@ storage:
- ReadWriteOnce
bucket_storage_host: minio.minio-tenant-default.svc.cluster.local:80
bucket_storage_bucket: default-bucket
+ mlflow:
+ server_uri: http://10.242.3.198:8082
+ experiment_name: mlflow-experiment
+ run_name: mlflow-run
diff --git a/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl b/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl
index d1b22f6..e27a123 100644
--- a/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl
+++ b/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl
@@ -3,9 +3,10 @@
# Setup MinIO
mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY}
# Sync checkpoints from remote to local
+{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }}
{{- if .Values.checkpointsRemote }}
-if mc mirror minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ /workdir/checkpoints 2>/dev/null; then
- echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote}} to /workdir/checkpoints'
+if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then
+ echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''"}} to /workdir/checkpoints'
ls -lah /workdir/checkpoints
else
echo 'No checkpoints found yet'
@@ -17,12 +18,13 @@ fi
{{- define "finetuningAndUploadEntrypoint" -}}
# Print GPU Info:
rocm-smi
+{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }}
{{- if .Values.checkpointsRemote }}
echo "Starting checkpoint sync process"
mc mirror \
--watch \
/workdir/checkpoints \
- minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ &
+ {{ $checkpointsRemotePath }} &
uploadPID=$!
{{- end }}
# Run training:
@@ -36,7 +38,7 @@ wait $uploadPID || true
echo 'Training done, syncing once more...'
mc mirror \
/workdir/checkpoints \
- minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/
+ {{ $checkpointsRemotePath }}
{{- end }}
echo 'All done, exiting'
{{- end }}
diff --git a/workloads/llm-finetune-axolotl/helm/values.yaml b/workloads/llm-finetune-axolotl/helm/values.yaml
index 3ed333d..9f3234e 100644
--- a/workloads/llm-finetune-axolotl/helm/values.yaml
+++ b/workloads/llm-finetune-axolotl/helm/values.yaml
@@ -27,4 +27,4 @@ finetuningGpus: 1
configFile: # name of config file to use, include the file in the mount/ directory
### Model output path ###
-checkpointsRemote: # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket)
+checkpointsRemote: "" # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket)
diff --git a/workloads/llm-finetune-llama-factory/helm/README.md b/workloads/llm-finetune-llama-factory/helm/README.md
index d213a5b..a495741 100644
--- a/workloads/llm-finetune-llama-factory/helm/README.md
+++ b/workloads/llm-finetune-llama-factory/helm/README.md
@@ -2,7 +2,6 @@
This is a Helm Chart for running a finetuning job using [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)
-Currently the base model and input data are assumed to be from HuggingFace, or some other source directly supported by LLaMA-Factory.
The output is saved with MinIO in the directory specified by `checkpointsRemote`.
## Configuration
@@ -22,6 +21,22 @@ helm template workloads/llm-finetune-llama-factory/helm \
| kubectl create -f -
```
+## Data specification
+
+Specify the name of the dataset used for training as `dataset`. This can include datasets predefined in LLaMA-Factory or those defined in `datasetInfo`. Use commas to separate multiple datasets.
+
+To use other datasets, create an entry in `datasetInfo` following the [LLaMA-Factory dataset info format](https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md). Note that LLaMA-Factory directly supports loading datasets from HuggingFace, ModelScope, or s3/gcs cloud storage by setting the URLs according to the documentation.
+
+This workload adds a custom way to load data from MinIO. In `datasetInfo`, specify the path to the dataset in the remote bucket as `pathRemote`, and the workload will download the file and update the configuration accordingly. See the override file [`overrides/finetune-model_data_from_minio.yaml`](overrides/finetune-model_data_from_minio.yaml) for an example of finetuning where the data and model are loaded from MinIO.
+
+## Model specification
+
+To use a base model from HuggingFace or other source directly supported by LLaMA-Factory, specify the model name in `modelName`.
+
+Alternatively, to use a model from MinIO, specify the path to the model in `modelRemote`.
+
+Either `modelName` or `modelRemote` must be specified. If both are included, the model from `modelRemote` is used.
+
## Cleanup
After the jobs are completed, please delete the resources created. In particular for multi-node ray jobs, a `PersistentVolumeClaim` is used as shared storage and persists on the cluster after the job is completed.
@@ -37,7 +52,7 @@ helm template workloads/llm-finetune-llama-factory/helm \
## Multi-node finetuning with ray
-The chart supports multi-node jobs by setting `nodes` to an integer greater than 1. Doing so enables ray and creates a RayJob instead. An example config is provided in [`overrides/finetune-lora-ray.yaml`](overrides/finetune-lora-ray.yaml)
+The chart supports multi-node jobs by setting `nodes` to an integer greater than 1. Doing so enables Ray and creates a RayJob instead. An example config is provided in [`overrides/finetune-lora-ray.yaml`](overrides/finetune-lora-ray.yaml). The example also shows how to use [DeepSpeed ZeRO Stage 2](https://deepspeed.readthedocs.io/en/latest/zero3.html) to partition the gradients. To enable DeepSpeed, set the `deepspeed` parameter in the LLaMA-Factory config to point to one of the [DeepSpeed configs](https://github.com/hiyouga/LLaMA-Factory/tree/main/examples/deepspeed) included in LLaMA-Factory, or set it to an inline dictionary.
When configuring ray jobs, the resources you are requesting (`nodes` and `gpusPerNode`) are automatically specified for LLaMA-Factory, and do not need to be included separately in the `llamaFactoryConfig`.
diff --git a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml
index 7dd542b..c2835c7 100644
--- a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml
@@ -1,12 +1,18 @@
+### Model ###
+modelName: meta-llama/Llama-3.1-8B-Instruct
+
+### Data ###
+dataset: identity,alpaca_en_demo
+
+### Model output path ###
+checkpointsRemote: "default-bucket/experiments/llama3-8b-llama-factory-lora"
+
# Resources:
checkpointsReservedSize: 10Gi
nodes: 2
gpusPerNode: 1
memoryPerNode: 32Gi
-### Model output path ###
-checkpointsRemote: "default-bucket/experiments/llama3-8b-llama-factory-lora"
-
hfTokenSecret:
name: hf-token
key: hf-token
@@ -15,7 +21,6 @@ hfTokenSecret:
### this example adapted from https://github.com/hiyouga/LLaMA-Factory/blob/main/examples/train_lora/llama3_lora_sft_ray.yaml
llamaFactoryConfig:
### model
- model_name_or_path: meta-llama/Llama-3.1-8B-Instruct # or use local absolute path
trust_remote_code: true
### method
@@ -24,10 +29,9 @@ llamaFactoryConfig:
finetuning_type: lora
lora_rank: 8
lora_target: all
+  deepspeed: /workspace/LLaMA-Factory/examples/deepspeed/ds_z2_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
### dataset
- dataset: identity,alpaca_en_demo
- dataset_dir: REMOTE:llamafactory/demo_data # or use local absolute path
template: llama3
cutoff_len: 2048
max_samples: 1000
@@ -62,11 +66,3 @@ llamaFactoryConfig:
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
- resume_from_checkpoint: null
-
- ### eval
- # eval_dataset: alpaca_en_demo
- # val_size: 0.1
- # per_device_eval_batch_size: 1
- # eval_strategy: steps
- # eval_steps: 500
diff --git a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml
index 1bf8a3f..e7f5d83 100644
--- a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml
@@ -1,9 +1,15 @@
-# Resources:
-checkpointsReservedSize: 10Gi
+### Model ###
+modelName: meta-llama/Llama-3.1-8B-Instruct
+
+### Data ###
+dataset: identity,alpaca_en_demo
### Model output path ###
checkpointsRemote: "default-bucket/experiments/llama3-8b-llama-factory-lora"
+# Resources:
+checkpointsReservedSize: 10Gi
+
hfTokenSecret:
name: hf-token
key: hf-token
@@ -12,7 +18,6 @@ hfTokenSecret:
### this example from https://github.com/hiyouga/LLaMA-Factory/blob/main/examples/train_lora/llama3_lora_sft.yaml
llamaFactoryConfig:
### model
- model_name_or_path: meta-llama/Llama-3.1-8B-Instruct
trust_remote_code: true
### method
@@ -23,8 +28,6 @@ llamaFactoryConfig:
lora_target: all
### dataset
- dataset: identity,alpaca_en_demo
- dataset_dir: REMOTE:llamafactory/demo_data # or use local absolute path
template: llama3
cutoff_len: 2048
max_samples: 1000
@@ -49,11 +52,3 @@ llamaFactoryConfig:
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
- resume_from_checkpoint: null
-
- ### eval
- # eval_dataset: alpaca_en_demo
- # val_size: 0.1
- # per_device_eval_batch_size: 1
- # eval_strategy: steps
- # eval_steps: 500
diff --git a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-model_data_from_minio.yaml b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-model_data_from_minio.yaml
new file mode 100644
index 0000000..9a70ec3
--- /dev/null
+++ b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-model_data_from_minio.yaml
@@ -0,0 +1,64 @@
+### Model ###
+modelRemote: "default-bucket/models/tiny-llama/tinyllama-1.1b-chat-v1.0"
+
+### Data ###
+# list datasets to use, can include datasets predefined in LLaMA-Factory or those defined in datasetInfo
+dataset: argilla
+# for remote datasets to be loaded from MinIO, specify the path to the dataset in the remote bucket as pathRemote
+datasetInfo:
+ argilla:
+ pathRemote: "default-bucket/datasets/argilla-mistral-large-human-prompts.jsonl"
+ formatting: sharegpt
+ columns:
+ messages: "messages"
+ tags:
+ role_tag: "role"
+ content_tag: "content"
+ user_tag: "user"
+ assistant_tag: "assistant"
+ system_tag: "system"
+
+### Model output path ###
+checkpointsRemote: "default-bucket/experiments/tinyllama-argilla-llama-factory-lora"
+resumeFromCheckpoint: true
+
+# Resources:
+checkpointsReservedSize: 10Gi
+
+### llama-factory config ###
+llamaFactoryConfig:
+ ### model
+ trust_remote_code: true
+
+ ### method
+ stage: sft
+ do_train: true
+ finetuning_type: lora
+ lora_rank: 8
+ lora_target: all
+
+ ### dataset
+ template: llama2
+ cutoff_len: 8192
+ max_samples: 1000
+ overwrite_cache: true
+ preprocessing_num_workers: 16
+ dataloader_num_workers: 4
+
+ ### output
+ logging_steps: 10
+ save_steps: 500
+ plot_loss: true
+ overwrite_output_dir: true
+ save_only_model: false
+ report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow]
+
+ ### train
+ per_device_train_batch_size: 1
+ gradient_accumulation_steps: 8
+ learning_rate: 1.0e-4
+ num_train_epochs: 3.0
+ lr_scheduler_type: cosine
+ warmup_ratio: 0.1
+ bf16: true
+ ddp_timeout: 180000000
diff --git a/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl b/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl
index 645aceb..69f9319 100644
--- a/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl
+++ b/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl
@@ -88,4 +88,8 @@ spec:
mode: 0777
- key: llama_factory_config.yaml
path: llama_factory_config.yaml
+ {{- if .Values.datasetInfo }}
+ - key: remote_dataset_info.json
+ path: remote_dataset_info.json
+ {{- end }}
{{- end }}
diff --git a/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml b/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml
index f75cf3b..7d384ea 100644
--- a/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml
@@ -4,6 +4,13 @@ metadata:
name: "{{ .Release.Name }}-configs"
data:
llama_factory_config.yaml: |
+ {{- if .Values.modelRemote }}
+ model_name_or_path: /workdir/basemodel
+ {{- else }}
+ model_name_or_path: "{{ .Values.modelName }}"
+ {{- end }}
+ dataset: {{ .Values.dataset }}
+ dataset_dir: /workspace/LLaMA-Factory/data
output_dir: /workdir/checkpoints
{{- if ne (int $.Values.nodes) 1 }}
ray_run_name: "{{ .Release.Name }}"
@@ -13,35 +20,74 @@ data:
GPU: {{ .Values.gpusPerNode }}
{{- end }}
{{ toYaml .Values.llamaFactoryConfig | indent 4 }}
+{{ if .Values.datasetInfo }}
+ remote_dataset_info.json: |
+{{ toPrettyJson .Values.datasetInfo | indent 4 }}
+{{- end }}
entrypoint.sh: |
#!/bin/bash
+ set -e
# Print GPU Info:
rocm-smi
mkdir -p /workdir/checkpoints
- {{- if .Values.checkpointsRemote }}
+ mkdir -p /workdir/datasets
+ cd /workspace/LLaMA-Factory
+ cp /configs/llama_factory_config.yaml llama_factory_config.yaml
+ {{- if .Values.datasetInfo }}
+ cp /configs/remote_dataset_info.json remote_dataset_info.json
+ {{- end }}
# Setup MinIO
mc alias set minio-host $BUCKET_STORAGE_HOST $BUCKET_STORAGE_ACCESS_KEY $BUCKET_STORAGE_SECRET_KEY
+ {{- if .Values.modelRemote }}
+ # copy model from remote to local
+ mc cp --recursive \
+ minio-host/{{ .Values.modelRemote | trimSuffix "/" }}/ \
+ /workdir/basemodel
+ {{- end }}
+ {{- range .Values.datasetInfo }}
+ {{- if .pathRemote }}
+ # copy dataset from remote to local
+ mc cp \
+ minio-host/{{ .pathRemote }} \
+ /workdir/datasets/{{ .pathRemote | replace "/" "_" }}
+ sed -i 's;"pathRemote": "{{ .pathRemote }}";"file_name": "/workdir/datasets/{{ .pathRemote | replace "/" "_" }}";g' remote_dataset_info.json
+ {{- end }}
+ {{- end }}
+ {{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }}
+ {{- if .Values.checkpointsRemote }}
+ {{- if .Values.resumeFromCheckpoint }}
# Sync checkpoints from remote to local
- if mc mirror minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ /workdir/checkpoints 2>/dev/null; then
- echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote}} to /workdir/checkpoints'
+ if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then
+ echo 'Downloaded checkpoints from' {{ $checkpointsRemotePath }} 'to /workdir/checkpoints'
ls -lah /workdir/checkpoints
+ echo "resume_from_checkpoint: /workdir/checkpoints" >> llama_factory_config.yaml
else
echo 'No checkpoints found yet'
fi
+ {{- end }}
echo "Starting checkpoint sync process"
mc mirror \
--watch \
/workdir/checkpoints \
- minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ &
+ {{ $checkpointsRemotePath }} &
uploadPID=$!
+ # Check if the sync process started successfully
+ sleep 1
+ if ! ps -p $uploadPID > /dev/null; then
+ echo "ERROR: Sync process failed to start"
+ exit 1
+ fi
{{- end }}
# Run training:
echo "Starting training process"
- cd LLaMA-Factory/
+ {{- if .Values.datasetInfo }}
+ jq -s add remote_dataset_info.json /workspace/LLaMA-Factory/data/dataset_info.json > dataset_info.json
+ cp dataset_info.json /workspace/LLaMA-Factory/data/dataset_info.json
+ {{- end }}
{{- if ne (int $.Values.nodes) 1 }}
export USE_RAY=1
{{- end }}
- llamafactory-cli train /configs/llama_factory_config.yaml
+ llamafactory-cli train llama_factory_config.yaml
{{- if .Values.checkpointsRemote }}
echo "Training done, stop the upload process"
kill $uploadPID
@@ -50,6 +96,6 @@ data:
echo 'Training done, syncing once more...'
mc mirror --overwrite \
/workdir/checkpoints \
- minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/
+ {{ $checkpointsRemotePath }}
{{- end }}
echo 'All done, exiting'
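The `jq -s add` step merges the chart's `remote_dataset_info.json` into LLaMA-Factory's built-in `dataset_info.json`; with jq object addition, entries from the second file win on duplicate keys. A rough Python equivalent of that merge:

```python
import json

with open("remote_dataset_info.json") as remote, \
        open("/workspace/LLaMA-Factory/data/dataset_info.json") as builtin:
    # Right-hand entries win on duplicate keys, matching `jq -s add`.
    merged = {**json.load(remote), **json.load(builtin)}

with open("/workspace/LLaMA-Factory/data/dataset_info.json", "w") as out:
    json.dump(merged, out, indent=2)
```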
diff --git a/workloads/llm-finetune-llama-factory/helm/values.schema.json b/workloads/llm-finetune-llama-factory/helm/values.schema.json
index 95d8a9d..5c62416 100644
--- a/workloads/llm-finetune-llama-factory/helm/values.schema.json
+++ b/workloads/llm-finetune-llama-factory/helm/values.schema.json
@@ -6,6 +6,22 @@
"type": "string",
"description": "Container image for finetuning"
},
+ "modelName": {
+ "type": "string",
+ "description": "Model path in Huggginface"
+ },
+ "modelRemote": {
+ "type": "string",
+ "description": "Model path in remote MinIO storage, format: bucketName/path/in/bucket"
+ },
+ "dataset": {
+ "type": "string",
+ "description": "Name of data set used for training. Use commas to separate multiple data sets."
+ },
+ "datasetInfo": {
+ "type": "object",
+ "description": "Additional datasets can be specified in datasetInfo, according to the LLaMA-Factory dataset format, see https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md"
+ },
"kaiwo": {
"type": "object",
"properties": {
@@ -109,12 +125,14 @@
"default": "16Gi"
},
"checkpointsRemote": {
- "type": [
- "string",
- "null"
- ],
+ "type": "string",
"description": "Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket"
},
+ "resumeFromCheckpoint": {
+ "type": "boolean",
+ "description": "If true, resume from the last checkpoint in checkpointsRemote (if available)",
+ "default": false
+ },
"hfTokenSecret": {
"type": "object",
"properties": {
diff --git a/workloads/llm-finetune-llama-factory/helm/values.yaml b/workloads/llm-finetune-llama-factory/helm/values.yaml
index 06339b8..50e5117 100644
--- a/workloads/llm-finetune-llama-factory/helm/values.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/values.yaml
@@ -1,6 +1,20 @@
### General chart values ###
finetuningImage: ghcr.io/silogen/llama-factory-rocm-pytorch-training:v0.3
+### Model ###
+# either modelRemote OR modelName must be set
+# to use a base model directly from Hugging Face, set modelName to the model identifier (e.g., "meta-llama/Llama-3.1-8B-Instruct")
+modelName: ""
+# for remote models to be loaded from MinIO, specify the path to the model in the remote bucket as modelRemote
+modelRemote: ""
+
+### Data ###
+# list datasets to use, can include datasets predefined in LLaMA-Factory or those defined in datasetInfo
+dataset: ""
+# Additional datasets can be specified in datasetInfo, according to the LLaMA-Factory dataset format, see https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md
+# For remote datasets to be loaded from MinIO, specify the path to the dataset in the remote bucket as pathRemote
+datasetInfo: {}
+
# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload)
kaiwo:
enabled: false
@@ -9,8 +23,10 @@ kaiwo:
labels: {}
# Extra annotations such as an imagePullSecrets
-imagePullSecrets:
- - "regcred"
+imagePullSecrets: []
+ # Example:
+ # imagePullSecrets:
+ # - "regcred"
# Configure these to match the credentials in your cluster:
bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
@@ -37,7 +53,8 @@ llamaFactoryConfig:
stage: sft
### Model output path ###
-checkpointsRemote: # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket)
+checkpointsRemote: "" # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket)
+resumeFromCheckpoint: false # Set to true to resume from the last checkpoint in checkpointsRemote (if available)
hfTokenSecret: {} # Optional secret reference that contains the Huggingface token
# Example:
diff --git a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml
index 1a3ad6d..784283a 100644
--- a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml
@@ -19,8 +19,6 @@ finetuning_config:
data_conf:
training_data:
type: CONCATENATION
- datasets:
- - path: "PLACEHOLDER"
validation_data:
type: AUTO_SPLIT
ratio: 0.1
diff --git a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml
index 50ee0f4..b0d5dac 100644
--- a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml
@@ -13,8 +13,6 @@ finetuning_config:
data_conf:
training_data:
type: CONCATENATION
- datasets:
- - path: "PLACEHOLDER"
validation_data:
type: AUTO_SPLIT
ratio: 0.1
diff --git a/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml b/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml
index 8f73618..d8426b8 100644
--- a/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml
@@ -13,8 +13,6 @@ finetuning_config:
data_conf:
training_data:
type: CONCATENATION
- datasets:
- - path: "PLACEHOLDER"
validation_data:
type: AUTO_SPLIT
ratio: 0.1
diff --git a/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md b/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md
index d229420..3c980b1 100644
--- a/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md
+++ b/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md
@@ -9,16 +9,17 @@ See the various sub-configs for their options. Additional properties are not all
| Property | Type | Required | Possible values | Default | Description |
| -------- | ---- | -------- | --------------- | ------- | ----------- |
-| method | `const` | | `sft` | `"sft"` | |
| data_conf | `object` | ✅ | [ChatTrainValidConfig](#chattrainvalidconfig) | | The data input config |
| training_args | `object` | ✅ | [SilogenTrainingArguments](#silogentrainingarguments) | | Transformer TrainingArguments with some restrictions |
-| overrides | `object` | | [Overrides](#overrides) | `{"num_train_epochs": null, "lr_multiplier": 1.0, "lr_batch_size_scaling": "none"}` | Override options to simplify the config interface |
| batchsize_conf | `object` | ✅ | [BatchsizeConfig](#batchsizeconfig) | | Batch size configuration |
-| peft_conf | `object` | ✅ | [NoPeftConfig](#nopeftconfig) or [PretrainedPeftConfig](#pretrainedpeftconfig) or [GenericPeftConfig](#genericpeftconfig) | | Adapter configuration |
+| peft_conf | `object` | ✅ | [GenericPeftConfig](#genericpeftconfig) and/or [NoPeftConfig](#nopeftconfig) and/or [PretrainedPeftConfig](#pretrainedpeftconfig) | | Adapter configuration |
| run_conf | `object` | ✅ | [RunConfig](#runconfig) | | Model related configuration |
-| tracking | `object` or `null` | | [FinetuningTrackingConfig](#finetuningtrackingconfig) | | MLFlow tracking configuration |
-| quant_conf | `object` | | [BnBQuantizationConfig](#bnbquantizationconfig) or [NoQuantizationConfig](#noquantizationconfig) | `{"quantization_type": "no-quantization"}` | Quantization configuration |
| sft_args | `object` | ✅ | [SFTArguments](#sftarguments) | | SFT specific arguments |
+| method | `const` | | `sft` | `"sft"` | |
+| overrides | `object` | | [Overrides](#overrides) | `{"lr_multiplier": 1.0, "lr_batch_size_scaling": "none"}` | Override options to simplify the config interface |
+| tracking | `object` or `null` | | [FinetuningTrackingConfig](#finetuningtrackingconfig) | | MLFlow tracking configuration |
+| quant_conf | `object` | | [BnBQuantizationConfig](#bnbquantizationconfig) and/or [NoQuantizationConfig](#noquantizationconfig) | `{"quantization_type": "no-quantization"}` | Quantization configuration |
+
---
@@ -33,7 +34,7 @@ Automatic validation split from the training data
| Property | Type | Required | Possible values | Default | Description |
| -------- | ---- | -------- | --------------- | ------- | ----------- |
| type | `const` | ✅ | `AUTO_SPLIT` | | |
-| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method |
+| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. |
| ratio | `number` | | number | `0.2` | Ratio of the training data to use for validation |
| seed | `integer` | | integer | `1289525893` | Seed for the random number generator for splitting |
@@ -78,11 +79,20 @@ see: https://huggingface.co/docs/transformers/en/main_classes/quantization#trans
| bnb_4bit_use_double_quant | `boolean` | | boolean | `False` | |
| bnb_4bit_quant_storage | `string` or `null` | | string | | |
+## ChatTemplateName
+
+Chat template to use.
+
+#### Type: `string`
+
+**Possible Values:** `mistral-with-system` or `chat-ml` or `poro` or `keep-original` or `simplified-llama31`
+
## ChatTrainValidConfig
-Training time data configuration.
+Training time data configuration
-Always defines some DataInput for training data and can include validation DataInput, though a trivial NoneDataInput is also allowed for the validation side.
+Always defines some DataInput for training data and can include validation DataInput, though a trivial NoneDataInput
+is also allowed for the validation side.
Additionally includes chat template and padding configurations, as those are part of the data input pipeline.
@@ -90,9 +100,9 @@ Additionally includes chat template and padding configurations, as those are par
| Property | Type | Required | Possible values | Default | Description |
| -------- | ---- | -------- | --------------- | ------- | ----------- |
-| training_data | `object` | ✅ | [ConcatenationDataInput](#concatenationdatainput) or [WeightedMixDataInput](#weightedmixdatainput) | | |
-| validation_data | `object` | ✅ | [AutoSplitDataInput](#autosplitdatainput) or [ConcatenationDataInput](#concatenationdatainput) or [NoneDataInput](#nonedatainput) | | |
-| chat_template_name | `string` | | `mistral-with-system` or `chat-ml` or `poro` or `keep-original` or `simplified-llama31` | `"mistral-with-system"` | |
+| training_data | `object` | ✅ | [ConcatenationDataInput](#concatenationdatainput) and/or [WeightedMixDataInput](#weightedmixdatainput) | | |
+| validation_data | `object` | ✅ | [AutoSplitDataInput](#autosplitdatainput) and/or [ConcatenationDataInput](#concatenationdatainput) and/or [NoneDataInput](#nonedatainput) | | |
+| chat_template_name | `string` | | [ChatTemplateName](#chattemplatename) | `"mistral-with-system"` | |
| padding_side | `string` | | string | `"right"` | Padding side, right is usually right. |
| missing_pad_token_strategy | `string` | | [MissingPadTokenStrategy](#missingpadtokenstrategy) | `"bos-repurpose"` | See the MissingPadTokenStrategys for descriptions of the options |
@@ -117,7 +127,7 @@ For DPO this means lines of:
| -------- | ---- | -------- | --------------- | ------- | ----------- |
| type | `const` | ✅ | `CONCATENATION` | | |
| datasets | `array` | ✅ | [DatasetDefinition](#datasetdefinition) | | |
-| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method |
+| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. |
## DatasetDefinition
@@ -137,11 +147,11 @@ Settings that define how run details are logged
| Property | Type | Required | Possible values | Default | Description |
| -------- | ---- | -------- | --------------- | ------- | ----------- |
-| mlflow_server_uri | `string` | ✅ | string | | MLflow server URI. Can be local path |
-| experiment_name | `string` | ✅ | string | | Experiment name that is used for MLFlow tracking |
-| run_id | `string` or `null` | | string | | Run id, to resume logging to previousely started run |
-| run_name | `string` or `null` | | string | | Run name, to give meaningful name to the run to be displayed in MLFlow UI. Used only when run_id is unspecified |
-| hf_mlflow_log_artifacts | `string` | | string | `"False"` | Whether to store model artifacts in MLFlow |
+| mlflow_server_uri | `string` | ✅ | string | | MLflow server URI. Can be local path. |
+| experiment_name | `string` | ✅ | string | | Experiment name that is used for MLFlow tracking. |
+| run_id | `string` or `null` | | string | | Run id, to resume logging to previously started run. |
+| run_name | `string` or `null` | | string | | Run name, to give meaningful name to the run to be displayed in MLFlow UI. Used only when run_id is unspecified. |
+| hf_mlflow_log_artifacts | `string` | | string | `"False"` | Whether to store model artifacts in MLFlow. |
## GenericPeftConfig
@@ -150,7 +160,8 @@ Config for any new initialized PEFT Adapter
See https://huggingface.co/docs/peft/tutorial/peft_model_config for the possible kwargs
and https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/peft_types.py for the types.
-### Example
+Example:
+
>>> loaded_data = {'peft_type':'LORA', 'task_type': 'CAUSAL_LM',
... 'peft_kwargs': {'r': 32, 'target_modules': ['v_proj']}}
>>> generic_conf = GenericPeftConfig(**loaded_data)
@@ -171,8 +182,6 @@ and https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/peft_types.py
| task_type | `string` | | [TaskType](#tasktype) | `"CAUSAL_LM"` | |
| peft_kwargs | `object` | | object | | |
-
-
## MissingPadTokenStrategy
Specifies the available missing pad token strategies.
@@ -207,9 +216,10 @@ See parameter docstrings and help at:
https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained
See below in "Parameters for big model inference" too, it affects training too. Also note that this link takes you
to the transformers main branch version - be sure to compare with the installed version of transformers (that keeps
-changing over time, and it is difficult to keep this doctstring up to date, so we wanted to link to the latest here).
+changing over time, and it is difficult to keep this docstring up to date, so we wanted to link to the latest here).
Some important parameters to consider are:
+
- device_map :
A map that specifies where each submodule should go. It doesn’t need to be refined to each parameter/buffer
name, once a given module name is inside, every submodule of it will be sent to the same device. If we only pass
@@ -230,26 +240,26 @@ NOTE:
| Property | Type | Required | Possible values | Default | Description |
| -------- | ---- | -------- | --------------- | ------- | ----------- |
| torch_dtype | `const` | | `auto` | `"auto"` | |
-| device_map | `object` or `string` or `null` | | object and/or string | | Custom device map so that you can manually override the choices that HuggingFace would make. This can also be a string to specify "auto", "balanced_low_0", or "sequential" |
+| device_map | `object` or `string` or `null` | | object and/or string | | Custom device map so that you can manually override the choices that HuggingFace would make. This can also be a string to specify "auto", "balanced_low_0", or "sequential". |
| max_memory | `object` or `null` | | object | | |
| low_cpu_mem_usage | `boolean` | | boolean | `False` | |
-| attn_implementation | `string` or `null` | | string | | Note: this can be set to "sdpa", "flash_attention_2", "eager" |
+| attn_implementation | `string` or `null` | | string | | Note: this can be set to "sdpa", "flash_attention_2", "eager". |
| offload_folder | `string` or `null` | | string | | |
| offload_state_dict | `boolean` or `null` | | boolean | | Default is True if offloading (otherwise no effect) |
| offload_buffers | `boolean` or `null` | | boolean | | |
-| use_cache | `boolean` | | boolean | `True` | Saves generated hidden states to speed up generation. See: https://discuss.huggingface.co/t/what-is-the-purpose-of-use-cache-in-decoder/958 use_cache is mutually exclusive with gradient_checkpointing |
+| use_cache | `boolean` | | boolean | `true` | Saves generated hidden states to speed up generation, see: https://discuss.huggingface.co/t/what-is-the-purpose-of-use-cache-in-decoder/958 This is mutually exclusive with gradient_checkpointing. |
| cache_dir | `string` or `null` | | string | | |
-| force_download | `boolean` | | boolean | `False` | |
-| local_files_only | `boolean` | | boolean | `False` | |
+| force_download | `boolean` | | boolean | `False` | |
+| local_files_only | `boolean` | | boolean | `False` | |
| proxies | `object` or `null` | | object | | |
-| resume_download | `boolean` | | boolean | `False` | |
+| resume_download | `boolean` | | boolean | `False` | |
| revision | `string` | | string | `"main"` | |
| code_revision | `string` | | string | `"main"` | |
| subfolder | `string` or `null` | | string | | |
| token | `string` or `null` | | string | | |
| use_safetensors | `boolean` or `null` | | boolean | | |
| variant | `string` or `null` | | string | | |
-| trust_remote_code | `boolean` | | boolean | `False` | Warning: if set to `True`, allows execution of downloaded remote code |
+| trust_remote_code | `boolean` | | boolean | `False` | Warning: if set to True, allows execution of downloaded remote code. |
## NoPeftConfig
@@ -280,23 +290,20 @@ A special type for not using data e.g. in validation
| Property | Type | Required | Possible values | Default | Description |
| -------- | ---- | -------- | --------------- | ------- | ----------- |
| type | `const` | ✅ | `NONE` | | |
-| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method |
+| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. |
## Overrides
-Override options that allow simple interfaces for charts using these configs
+Override options
-This is particularly useful for a helm chart interface where we include the finetuning package config
-as a part of the values.yaml file. These a more flexible helm interface with certain keys brought to the
-top level.
+These implement dynamic scaling for the learning rate.
#### Type: `object`
| Property | Type | Required | Possible values | Default | Description |
| -------- | ---- | -------- | --------------- | ------- | ----------- |
-| num_train_epochs | `integer` or `number` or `null` | | number | | Overrides the number of epochs in the training_args |
| lr_multiplier | `number` | | number | `1.0` | Multiplier applied to the learning rate in the training_args |
-| lr_batch_size_scaling | `string` | | `none` `sqrt` `linear` | `"none"` | Scales the learning rate in the training_args by a factor derived from the total training batch size. `none`: No scaling. `sqrt`: Multiplies learning rate by square root of batch size (a classic scaling rule). `linear`: Multiplies learning rate by the batch size (a more modern scaling rule). |
+| lr_batch_size_scaling | `string` | | `none` `sqrt` `linear` | `"none"` | Scales the learning rate in the training_args by a factor derived from the total training batch size. 'none': No scaling. 'sqrt': Multiplies learning rate by square root of batch size (a classic scaling rule). 'linear': Multiplies learning rate by the batch size (a more modern scaling rule). |
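+
+For example (an illustrative calculation): with a base learning rate of 2e-5 and a total training batch size of 64, `sqrt` scaling gives an effective learning rate of 2e-5 * sqrt(64) = 1.6e-4, while `linear` gives 2e-5 * 64 = 1.28e-3.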
## PeftType
@@ -335,7 +342,7 @@ PEFT adapter uses the config and initialisation from a pretrained adapter
| Property | Type | Required | Possible values | Description |
| -------- | ---- | -------- | --------------- | ----------- |
| peft_type | `const` | ✅ | `PRETRAINED_PEFT` | |
-| name_or_path | `string` | ✅ | string | HF ID or path to the pretrained peft |
+| name_or_path | `string` | ✅ | string | HF ID or path to the pretrained peft. |
## RunConfig
@@ -345,12 +352,13 @@ Experiment running configuration
| Property | Type | Required | Possible values | Default | Description |
| -------- | ---- | -------- | --------------- | ------- | ----------- |
-| model | `string` | | string | `"/local_resources/basemodel"` | Local path to model to be fine-tuned. Normally this should be `/local_resources/basemodel` |
+| model | `string` | | string | `"/local_resources/basemodel"` | Local path to model to be fine-tuned. Normally this should be /local_resources/basemodel |
| model_args | `object` | | [ModelArguments](#modelarguments) | `{"torch_dtype": "auto", "device_map": "auto", "max_memory": null, "low_cpu_mem_usage": false, "attn_implementation": null, "offload_folder": null, "offload_state_dict": null, "offload_buffers": null, "use_cache": true, "cache_dir": null, "force_download": false, "local_files_only": false, "proxies": null, "resume_download": false, "revision": "main", "code_revision": "main", "subfolder": null, "token": null, "use_safetensors": null, "variant": null, "trust_remote_code": false}` | |
| tokenizer | `string` or `null` | | string | | Model HuggingFace ID, or path, or None to use the one associated with the model |
-| use_fast_tokenizer | `boolean` | | boolean | `True` | Use the Fast version of the tokenizer. The 'slow' version may be compatible with more features. |
-| resume_from_checkpoint | `boolean` or `string` | | boolean and/or string | `False` | Normally should be set to 'auto' to continue if a checkpoint exists. Can set to `True` to always try to continue, `False` to never try, or a path to load from a specific path. |
+| use_fast_tokenizer | `boolean` | | boolean | `true` | Use the Fast version of the tokenizer. The 'slow' version may be compatible with more features. |
+| resume_from_checkpoint | `boolean` or `string` | | boolean and/or string | | Normally should be set to 'auto' to continue if a checkpoint exists. Can set to True to always try to continue, False to never try, or a path to load a specific checkpoint. |
| final_checkpoint_name | `string` | | string | `"checkpoint-final"` | Name of final checkpoint. Should be left as default |
+| determinism | `string` | | `no` `half` `full` | `"no"` | Set the level of determinism in implementations. Deterministic implementations are not always available, and when they are, they are usually slower than their non-deterministic counterparts. Recommended for debugging only. 'no': No determinism. 'half': Prefer deterministic implementations. 'full': Only fully deterministic implementations, error out on operations that only have non-deterministic implementations. |
## SFTArguments
@@ -425,5 +433,5 @@ For DPO this means lines of:
| -------- | ---- | -------- | --------------- | ------- | ----------- |
| type | `const` | ✅ | `PRECOMPUTE_WEIGHTED_MIX` | | |
| datasets | `array` | ✅ | [WeightedDatasetDefinition](#weighteddatasetdefinition) | | |
-| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method |
+| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. |
| seed | `integer` | | integer | `19851243` | Seed for the random number generator for interleaving draws |
diff --git a/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl b/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl
index 75c4189..c1be6ca 100644
--- a/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl
+++ b/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl
@@ -4,29 +4,30 @@
echo 'Copying resources to container...';
mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY}
mc cp --recursive \
- minio-host/{{ .Values.basemodel | trimSuffix "/" }}/ \
+    minio-host/'{{ .Values.basemodel | trimSuffix "/" | replace "'" "'\\''" }}'/ \
/local_resources/basemodel
{{- if $.Values.trainingData }}
mc cp \
- minio-host/{{ $.Values.trainingData }} \
- /local_resources/{{ $.Values.trainingData | replace "/" "_" }}
+ minio-host/'{{ $.Values.trainingData | replace "'" "'\\''" }}' \
+ /local_resources/'{{ $.Values.trainingData | replace "'" "'\\''" | replace "/" "_" }}'
{{- else }}
{{- range .Values.finetuning_config.data_conf.training_data.datasets }}
mc cp \
- minio-host/{{ .path }} \
- /local_resources/{{ .path | replace "/" "_" }}
+ minio-host/'{{ .path | replace "'" "'\\''" }}' \
+ /local_resources/'{{ .path | replace "'" "'\\''" | replace "/" "_" }}'
{{- end }}
{{- if (or (eq .Values.finetuning_config.data_conf.validation_data.type "AUTO_SPLIT" ) (eq .Values.finetuning_config.data_conf.validation_data.type "NONE")) }}
{{- range .Values.finetuning_config.data_conf.validation_data.datasets }}
mc cp \
- minio-host/{{ .path }} \
- /local_resources/{{ .path | replace "/" "_" }}
+ minio-host/'{{ .path | replace "'" "'\\''" }}' \
+ /local_resources/'{{ .path | replace "'" "'\\''" | replace "/" "_" }}'
{{- end }}
{{- end }}
{{- end }}
# Sync checkpoints from remote to local
-if mc mirror minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ /workdir/checkpoints 2>/dev/null; then
- echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote}} to /workdir/checkpoints'
+{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }}
+if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then
+ echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''" }} to /workdir/checkpoints'
ls -lah /workdir/checkpoints
else
echo 'No checkpoints found yet'
@@ -35,25 +36,38 @@ fi
{/* ####################################################################################################################################################### */}}
{{- define "finetuningAndUploadEntrypoint" -}}
-{{- $logs_path := (default ( .Values.checkpointsRemote | trimSuffix "/" | printf "%s/logs/" ) .Values.logsRemote ) -}}
+# Quote paths with single quotes to avoid issues with special characters, escaping any single quotes already present in the path
+{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }}
+{{- $logsRemotePath := printf "minio-host/'%s'/" ( (default ( .Values.checkpointsRemote | trimSuffix "/" | printf "%s/logs" ) .Values.logsRemote ) | trimSuffix "/" | replace "'" "'\\''") -}}
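+# e.g. a (hypothetical) checkpointsRemote value like bucket/my'dir renders as minio-host/'bucket/my'\''dir'/ which the shell reads back as the original path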
# Print GPU Info:
rocm-smi
echo "Starting checkpoint sync process"
mc mirror \
--watch \
/workdir/checkpoints \
- minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ &
+ {{ $checkpointsRemotePath }} &
uploadPID=$!
+sleep 1 # Give some time for the process to start
+# Check if the sync process started successfully
+if ! ps -p $uploadPID > /dev/null; then
+ echo "ERROR: Sync process failed to start"
+ exit 1
+fi
# Run training:
{{- if .Values.runTensorboard }}
tensorboard --logdir /workdir/logs --port 6006 &
echo "Serving tensorboard on port 6006. Port-forward to access training logs during the training process lifetime."
-echo "Also starting logs upload process, uploading to {{ $logs_path }}"
+echo "Also starting logs upload process, uploading to {{ $logsRemotePath }}"
mc mirror \
--watch \
/workdir/logs \
- minio-host/{{ $logs_path }} &
+ {{ $logsRemotePath }} &
logsPID=$!
+sleep 1
+if ! ps -p $logsPID > /dev/null; then
+ echo "ERROR: Logs sync process failed to start"
+ exit 1
+fi
{{- end }}
echo "Starting training process"
accelerate launch \
@@ -81,16 +95,16 @@ merge_adapter $merge_base ./checkpoints/checkpoint-final ./checkpoints/checkpoin
echo 'Training done, syncing once more...'
mc mirror \
/workdir/checkpoints \
- minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/
+ {{ $checkpointsRemotePath }}
{{- if .Values.runTensorboard }}
mc mirror \
/workdir/logs \
- minio-host/{{ $logs_path }}
+ {{ $logsRemotePath }}
{{- end }}
# Sync the final checkpoint with overwrite to carry over vLLM-compatibility changes
mc mirror \
--overwrite \
/workdir/checkpoints/checkpoint-final \
- minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/checkpoint-final/
+ {{ $checkpointsRemotePath }}checkpoint-final/
echo 'All done, exiting'
{{- end }}
diff --git a/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml b/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml
index 739122d..ec33a0b 100644
--- a/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml
@@ -47,7 +47,7 @@ data:
main_process_port: null
mixed_precision: bf16
num_machines: 1
- num_processes: {{ .Values.finetuningGpus }}
+ num_processes: 1
use_cpu: false
{{- else if (eq .Values.distributedType "auto-ddp") }}
compute_environment: LOCAL_MACHINE
diff --git a/workloads/llm-finetune-verl/helm/Chart.yaml b/workloads/llm-finetune-verl/helm/Chart.yaml
new file mode 100644
index 0000000..ca3ef76
--- /dev/null
+++ b/workloads/llm-finetune-verl/helm/Chart.yaml
@@ -0,0 +1,4 @@
+apiVersion: v2
+name: llm-finetune-verl-example
+description: VeRL finetuning on SiloGen stack
+version: 0.0.1
diff --git a/workloads/llm-finetune-verl/helm/README.md b/workloads/llm-finetune-verl/helm/README.md
new file mode 100644
index 0000000..6408efa
--- /dev/null
+++ b/workloads/llm-finetune-verl/helm/README.md
@@ -0,0 +1,49 @@
+# Finetuning with VeRL
+
+This is a Helm Chart for running a finetuning job using [VeRL](https://github.com/volcengine/verl).
+
+The output is saved to MinIO in the directory specified by `checkpointsRemote`.
+
+## Configuration
+
+Include any parameters for VeRL in the `verlConfig` parameter. See the override file [`overrides/ppo_qwen_gsm8k.yaml`](overrides/ppo_qwen_gsm8k.yaml) for an example and the [VeRL documentation](https://verl.readthedocs.io/en/latest/examples/config.html) for more details.
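+
+As a minimal illustration (the values below are borrowed from the example override, not defaults; any VeRL option can go here):
+
+```yaml
+verlConfig:
+  data:
+    train_batch_size: 1024
+  trainer:
+    total_epochs: 10
+```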
+
+## Running the workload
+
+The simplest way to run the workload is to run `helm template` and pipe the result to `kubectl create`.
+
+Example command using the example override file `overrides/ppo_qwen_gsm8k.yaml`:
+
+```bash
+helm template workloads/llm-finetune-verl/helm \
+ --values workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml \
+ --name-template ppo-qwen-gsm8k-verl \
+ | kubectl create -f -
+```
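+
+The value passed to `--name-template` becomes the Helm release name, which the chart uses to name the resources it creates (the Job and its ConfigMap).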
+
+## Data specification
+
+VeRL requires that the data is prepared for the policy training in a [particular way](https://verl.readthedocs.io/en/latest/preparation/prepare_data.html).
+
+Some example data preprocessing scripts are provided. To use one of them, specify the name of the dataset to use for training as `dataset`. The available datasets are "full_hh_rlhf", "geo3k", "gsm8k", "hellaswag", and "math_dataset".
+
+To use your own datasets from MinIO, specify the path as `datasetRemote`. It should point to a directory with files that have already been appropriately processed (`train.parquet` and `test.parquet`).
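+
+For example, an already-processed dataset could be uploaded with the MinIO client (assuming an `mc` alias named `minio-host` for your MinIO instance; the bucket and path are hypothetical):
+
+```bash
+mc cp my-data/train.parquet my-data/test.parquet \
+  minio-host/default-bucket/datasets/my-dataset/
+```
+
+and then referenced by setting `datasetRemote: "default-bucket/datasets/my-dataset"`.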
+
+## Model specification
+
+To use a base model from HuggingFace or another source directly supported by VeRL, specify the model name in `modelName`.
+
+Alternatively, to use a model from MinIO, specify the path to the model in `modelRemote`.
+
+Either `modelName` or `modelRemote` must be specified. If both are included, the model from `modelRemote` is used.
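+
+For instance (the MinIO path is hypothetical):
+
+```yaml
+# use a model straight from HuggingFace...
+modelName: "Qwen/Qwen2-7B-Instruct"
+# ...or point to a model stored in MinIO; this takes precedence if both are set
+modelRemote: "default-bucket/models/my-model"
+```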
+
+## Cleanup
+
+After the jobs have completed, please delete the resources that were created. You can run the same `helm template` command as before, replacing `kubectl create` with `kubectl delete`, e.g.:
+
+```bash
+helm template workloads/llm-finetune-verl/helm \
+ --values workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml \
+ --name-template ppo-qwen-gsm8k-verl \
+ | kubectl delete -f -
+```
diff --git a/workloads/llm-finetune-verl/helm/overrides/grpo_qwen_gsm8k.yaml b/workloads/llm-finetune-verl/helm/overrides/grpo_qwen_gsm8k.yaml
new file mode 100644
index 0000000..99f6801
--- /dev/null
+++ b/workloads/llm-finetune-verl/helm/overrides/grpo_qwen_gsm8k.yaml
@@ -0,0 +1,45 @@
+### Model ###
+modelName: "Qwen/Qwen2-7B-Instruct"
+
+### Data ###
+dataset: "gsm8k"
+
+# Resources:
+checkpointsReservedSize: 512Gi
+storageClass: mlstorage
+finetuningGpus: 2
+memoryPerGpu: 64
+cpusPerGpu: 8
+
+### Model output path ###
+checkpointsRemote: "default-bucket/experiments/Qwen2_7B_Instruct_GRPO_gsm8k_verl"
+
+verlConfig:
+ algorithm:
+ adv_estimator: grpo
+ kl_ctrl:
+ kl_coef: 0.001
+ data:
+ train_batch_size: 1024
+ max_prompt_length: 512
+ max_response_length: 1024
+ actor_rollout_ref:
+ model:
+ use_remove_padding: True
+ enable_gradient_checkpointing: True
+ actor:
+ ppo_micro_batch_size_per_gpu: 80
+ use_kl_loss: True
+ kl_loss_coef: 0.001
+ kl_loss_type: low_var_kl
+ rollout:
+ n: 5
+ log_prob_micro_batch_size_per_gpu: 40
+ tensor_model_parallel_size: 2
+ gpu_memory_utilization: 0.6
+ ref:
+ log_prob_micro_batch_size_per_gpu: 40
+ fsdp_config:
+ param_offload: True
+ trainer:
+ total_epochs: 10
diff --git a/workloads/llm-finetune-verl/helm/overrides/kaiwo/kaiwo-enable.yaml b/workloads/llm-finetune-verl/helm/overrides/kaiwo/kaiwo-enable.yaml
new file mode 100644
index 0000000..e6d278a
--- /dev/null
+++ b/workloads/llm-finetune-verl/helm/overrides/kaiwo/kaiwo-enable.yaml
@@ -0,0 +1,3 @@
+# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload)
+kaiwo:
+ enabled: true
diff --git a/workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml b/workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml
new file mode 100644
index 0000000..fac53d8
--- /dev/null
+++ b/workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml
@@ -0,0 +1,50 @@
+### Model ###
+modelName: "Qwen/Qwen2-7B-Instruct"
+
+### Data ###
+dataset: "gsm8k"
+
+# Resources:
+checkpointsReservedSize: 512Gi
+storageClass: mlstorage
+finetuningGpus: 2
+memoryPerGpu: 64
+cpusPerGpu: 8
+
+### Model output path ###
+checkpointsRemote: "default-bucket/experiments/Qwen2_7B_Instruct_PPO_gsm8k_verl"
+
+verlConfig:
+ data:
+ train_batch_size: 1024
+ max_prompt_length: 1024
+ max_response_length: 512
+ actor_rollout_ref:
+ model:
+ use_remove_padding: True
+ enable_gradient_checkpointing: True
+ actor:
+ ppo_micro_batch_size_per_gpu: 16
+ rollout:
+ log_prob_micro_batch_size_per_gpu: 40
+ tensor_model_parallel_size: 2
+ gpu_memory_utilization: 0.6
+ ref:
+ log_prob_micro_batch_size_per_gpu: 40
+ fsdp_config:
+ param_offload: True
+ critic:
+ optim:
+ lr: 1e-5
+ model:
+ use_remove_padding: True
+ enable_gradient_checkpointing: True
+ ppo_micro_batch_size_per_gpu: 32
+ fsdp_config:
+ param_offload: False
+ optimizer_offload: False
+ algorithm:
+ kl_ctrl:
+ kl_coef: 0.001
+ trainer:
+ total_epochs: 10
diff --git a/workloads/llm-finetune-verl/helm/templates/configmap.yaml b/workloads/llm-finetune-verl/helm/templates/configmap.yaml
new file mode 100644
index 0000000..89e0424
--- /dev/null
+++ b/workloads/llm-finetune-verl/helm/templates/configmap.yaml
@@ -0,0 +1,119 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: "{{ .Release.Name }}-configs"
+data:
+ verl_config.yaml: |
+ # @package _global_
+{{ toYaml .Values.verlConfig | indent 4 }}
+ entrypoint.sh: |
+ #!/bin/bash
+ set -eu
+ # Print GPU Info:
+ rocm-smi
+ mkdir -p /workdir/checkpoints
+ mkdir -p /workdir/datasets
+
+ echo "Installing MinIO:"
+ curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+ --create-dirs \
+ -o /minio-binaries/mc
+ chmod +x /minio-binaries/mc
+ export PATH="${PATH}:/minio-binaries/"
+ # Setup MinIO
+ mc alias set minio-host $BUCKET_STORAGE_HOST $BUCKET_STORAGE_ACCESS_KEY $BUCKET_STORAGE_SECRET_KEY
+ {{- if .Values.modelRemote }}
+ # copy model from remote to local
+ echo "Downloading model from remote: {{ .Values.modelRemote }}"
+ mc cp --recursive \
+ minio-host/{{ .Values.modelRemote | trimSuffix "/" }}/ \
+ /workdir/basemodel
+ MODEL_PATH=/workdir/basemodel
+ {{- else if .Values.modelName }}
+ MODEL_PATH={{ .Values.modelName }}
+ {{- else }}
+ {{- fail "either modelName or modelRemote must be set" }}
+ {{- end }}
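+    # Sanity-check that the model loads (and pre-download it to the HF cache when MODEL_PATH is a hub ID) by instantiating a text-generation pipeline once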
+ python3 -c "import transformers;transformers.pipeline('text-generation', model='$MODEL_PATH')"
+
+ {{- if .Values.datasetRemote }}
+ echo "Downloading dataset from remote: {{ .Values.datasetRemote }}"
+ mc cp --recursive \
+ minio-host/{{ .Values.datasetRemote | trimSuffix "/" }}/ \
+ /workdir/datasets/{{ .Values.datasetRemote | trimSuffix "/" }}
+ DATASET_PATH=/workdir/datasets/{{ .Values.datasetRemote | trimSuffix "/" }}
+ {{- else if .Values.dataset }}
+ {{- if eq .Values.dataset "full_hh_rlhf" }}
+ python3 /app/examples/data_preprocess/{{ .Values.dataset }}.py --split rm --local_dir /workdir/datasets/{{ .Values.dataset }}
+ DATASET_PATH=/workdir/datasets/{{ .Values.dataset }}/rm
+ {{- else }}
+ python3 /app/examples/data_preprocess/{{ .Values.dataset }}.py --local_dir /workdir/datasets/{{ .Values.dataset }}
+ DATASET_PATH=/workdir/datasets/{{ .Values.dataset }}
+ {{- end }}
+ {{- else }}
+ {{- fail "either dataset or datasetRemote must be set" }}
+ {{- end }}
+
+ {{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }}
+ {{- if .Values.checkpointsRemote }}
+ {{- if .Values.resumeFromCheckpoint }}
+ # Sync checkpoints from remote to local
+ if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then
+      echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote }} to /workdir/checkpoints'
+ ls -lah /workdir/checkpoints
+ RESUME_MODE='resume_path'
+ else
+ echo 'No checkpoints found yet'
+ RESUME_MODE='disable'
+ fi
+ {{- else }}
+ RESUME_MODE='disable'
+ {{- end }}
+ echo "Starting checkpoint sync process"
+ mc mirror \
+ --watch \
+ --overwrite \
+ /workdir/checkpoints \
+ {{ $checkpointsRemotePath }} &
+ uploadPID=$!
+ # Check if the sync process started successfully
+ sleep 1
+ if ! ps -p $uploadPID > /dev/null; then
+ echo "ERROR: Sync process failed to start"
+ exit 1
+ fi
+    {{- end }}
+    # Default for when checkpointsRemote is unset; with `set -eu`, $RESUME_MODE would otherwise be unbound below
+    RESUME_MODE="${RESUME_MODE:-disable}"
+
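+    # Enumerate the GPU indices that rocm-smi reports (cardN rows in its CSV output) and expose the same device list to the HIP, CUDA and ROCr runtimes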
+ export HIP_VISIBLE_DEVICES=$(rocm-smi --showall --csv | grep -P '^card\d+,' | cut -d',' -f1 | sed 's/card//g' | paste -sd ',' -)
+ export NUM_GPUS=$(echo $HIP_VISIBLE_DEVICES | tr ',' '\n' | wc -l)
+ export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
+ export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
+
+    # Copy the config file into the VeRL config directory; this is necessary for Hydra to apply it as an override
+ mkdir -p /app/verl/trainer/config/override
+ cp /configs/verl_config.yaml /app/verl/trainer/config/override/helm.yaml
+
+ echo "Starting training process"
+ python3 -m verl.trainer.main_ppo +override=helm \
+ data.train_files=$DATASET_PATH/train.parquet \
+ data.val_files=$DATASET_PATH/test.parquet \
+ actor_rollout_ref.model.path=$MODEL_PATH \
+ critic.model.path=$MODEL_PATH \
+ trainer.n_gpus_per_node=$NUM_GPUS \
+ trainer.project_name='{{ .Release.Name }}' \
+ trainer.experiment_name='{{ .Release.Name }}' \
+ trainer.default_local_dir=/workdir/checkpoints \
+ trainer.resume_mode=$RESUME_MODE \
+ trainer.resume_from_path=/workdir/checkpoints
+
+ {{- if .Values.checkpointsRemote }}
+ echo "Training done, stop the upload process"
+ kill $uploadPID
+ wait $uploadPID || true
+ # Once more to ensure everything gets uploaded
+ echo 'Training done, syncing once more...'
+ mc mirror --overwrite \
+ /workdir/checkpoints \
+ {{ $checkpointsRemotePath }}
+ {{- end }}
+ echo 'All done, exiting'
diff --git a/workloads/llm-finetune-verl/helm/templates/job.yaml b/workloads/llm-finetune-verl/helm/templates/job.yaml
new file mode 100644
index 0000000..3d3db12
--- /dev/null
+++ b/workloads/llm-finetune-verl/helm/templates/job.yaml
@@ -0,0 +1,113 @@
+{{- define "job" -}}
+apiVersion: batch/v1
+kind: Job
+metadata:
+ name: "{{ .Release.Name }}-job"
+ {{- if .Values.labels }}
+ labels:
+ {{- range $label, $value := .Values.labels }}
+ {{ $label }}: {{ $value | quote }}
+ {{- end }}
+ {{- end }}
+spec:
+ ttlSecondsAfterFinished: 3600
+ backoffLimit: 0
+ template:
+ spec:
+ restartPolicy: Never
+ {{- if .Values.imagePullSecrets }}
+ imagePullSecrets:
+ {{- range .Values.imagePullSecrets }}
+ - name: {{ . }}
+ {{- end }}
+ {{- end }}
+ containers:
+ - name: finetuning
+ image: "{{ .Values.finetuningImage }}"
+ imagePullPolicy: Always
+ env:
+ {{- if .Values.hfTokenSecret }}
+ - name: HF_TOKEN
+ valueFrom:
+ secretKeyRef:
+ name: {{ .Values.hfTokenSecret.name }}
+ key: {{ .Values.hfTokenSecret.key }}
+ {{- end }}
+ # storage
+ - name: BUCKET_STORAGE_HOST
+ value: {{ .Values.bucketStorageHost }}
+ - name: BUCKET_STORAGE_ACCESS_KEY
+ valueFrom:
+ secretKeyRef:
+ name: {{ .Values.bucketCredentialsSecret.name }}
+ key: {{ .Values.bucketCredentialsSecret.accessKeyKey }}
+ - name: BUCKET_STORAGE_SECRET_KEY
+ valueFrom:
+ secretKeyRef:
+ name: {{ .Values.bucketCredentialsSecret.name }}
+ key: {{ .Values.bucketCredentialsSecret.secretKeyKey }}
+ command:
+ - /configs/entrypoint.sh
+ resources:
+ limits:
+ memory: "{{ mul .Values.finetuningGpus .Values.memoryPerGpu }}Gi"
+ cpu: "{{ mul .Values.finetuningGpus .Values.cpusPerGpu }}"
+ amd.com/gpu: "{{ .Values.finetuningGpus }}"
+ requests:
+ memory: "{{ mul .Values.finetuningGpus .Values.memoryPerGpu }}Gi"
+ cpu: "{{ mul .Values.finetuningGpus .Values.cpusPerGpu }}"
+ amd.com/gpu: "{{ .Values.finetuningGpus }}"
+ volumeMounts:
+ - name: dshm # Increase SHM size for the container by mounting /dev/shm, for Pytorch parallel processing
+ mountPath: /dev/shm
+ - name: checkpoints
+ mountPath: /workdir/checkpoints
+ readOnly: false
+ - name: configs
+ mountPath: /configs
+ readOnly: true
+ volumes:
+ - name: dshm
+ emptyDir:
+ medium: Memory # equivalent to `docker run --shm-size=(total_memory/2)`
+ {{- if .Values.storageClass }}
+ - name: checkpoints
+ ephemeral:
+ volumeClaimTemplate:
+ spec:
+ accessModes: [ "ReadWriteOnce" ]
+ storageClassName: {{ .Values.storageClass }}
+ resources:
+ requests:
+ storage: "{{ .Values.checkpointsReservedSize }}"
+ {{- else }}
+ - name: checkpoints
+ emptyDir:
+ sizeLimit: "{{ .Values.checkpointsReservedSize }}"
+ {{- end }}
+ - name: configs
+ configMap:
+ name: "{{ .Release.Name }}-configs"
+ items:
+ - key: entrypoint.sh
+ path: entrypoint.sh
+ mode: 0777
+ - key: verl_config.yaml
+ path: verl_config.yaml
+{{- end -}}
+
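+{{/* Wrap the plain Job manifest in a KaiwoJob CRD by nesting it (indented) under spec.job, so the Kaiwo operator manages the workload */}}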
+{{- define "job_wrapped_with_kaiwojob" -}}
+apiVersion: kaiwo.silogen.ai/v1alpha1
+kind: KaiwoJob
+metadata:
+ name: "{{ .Release.Name }}-job"
+spec:
+ job:
+ {{- include "job" . | nindent 4 }}
+{{- end -}}
+
+{{- if .Values.kaiwo.enabled -}}
+{{- include "job_wrapped_with_kaiwojob" . }}
+{{- else -}}
+{{- include "job" . }}
+{{- end -}}
diff --git a/workloads/llm-finetune-verl/helm/values.schema.json b/workloads/llm-finetune-verl/helm/values.schema.json
new file mode 100644
index 0000000..348874c
--- /dev/null
+++ b/workloads/llm-finetune-verl/helm/values.schema.json
@@ -0,0 +1,138 @@
+{
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "type": "object",
+ "properties": {
+ "finetuningImage": {
+ "type": "string",
+ "description": "Container image for finetuning"
+ },
+ "modelName": {
+ "type": "string",
+ "description": "Model path in HuggingFace"
+ },
+ "modelRemote": {
+ "type": "string",
+ "description": "Model path in remote MinIO storage, format: bucketName/path/in/bucket"
+ },
+ "dataset": {
+ "type": "string",
+ "description": "Name of data set to use for training"
+ },
+ "datasetRemote": {
+ "type": "string",
+ "description": "Dataset path in remote MinIO storage, format: bucketName/path/in/bucket"
+ },
+ "kaiwo": {
+ "type": "object",
+ "properties": {
+ "enabled": {
+ "type": "boolean",
+ "description": "If true, use Kaiwo CRDs to have Kaiwo operator manage the workload",
+ "default": false
+ }
+ },
+ "default": {}
+ },
+ "labels": {
+ "type": "object",
+      "description": "Any labels to add to the manifest; a Kueue queue label is recommended",
+ "additionalProperties": {
+ "type": "string"
+ },
+ "default": {}
+ },
+ "imagePullSecrets": {
+ "type": "array",
+ "description": "Any imagePullSecrets to use",
+ "items": {
+ "type": "string"
+ },
+ "default": []
+ },
+ "bucketStorageHost": {
+ "type": "string",
+ "description": "The cloud storage host URL"
+ },
+ "bucketCredentialsSecret": {
+ "type": "object",
+      "description": "Bucket storage credential secret values; the secret must already be set up in the cluster (e.g. via external secrets)",
+ "properties": {
+ "name": {
+ "type": "string",
+ "description": "The name of the secret in the cluster that contains the bucket storage credentials",
+ "default": "minio-credentials"
+ },
+ "accessKeyKey": {
+ "type": "string",
+ "description": "The key in the secret that contains the access key",
+ "default": "minio-access-key"
+ },
+ "secretKeyKey": {
+ "type": "string",
+          "description": "The key in the secret that contains the secret key",
+ "default": "minio-secret-key"
+ }
+ }
+ },
+ "checkpointsReservedSize": {
+ "type": "string",
+      "description": "How much space to reserve for the checkpoints volume"
+ },
+ "storageClass": {
+ "type": [
+ "string",
+ "null"
+ ],
+ "description": "Optionally set this to use a specific storageClass for the storage"
+ },
+ "cpusPerGpu": {
+ "type": "integer",
+ "description": "How many CPUs to use, per GPU",
+ "default": 8,
+ "minimum": 1
+ },
+ "finetuningGpus": {
+ "type": "integer",
+ "description": "How many GPUs to use for finetuning",
+ "default": 1,
+ "minimum": 0
+ },
+ "memoryPerGpu": {
+ "type": "integer",
+      "description": "How much memory to use in GiB, per GPU",
+ "default": 64
+ },
+ "checkpointsRemote": {
+ "type": "string",
+ "description": "Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket"
+ },
+ "resumeFromCheckpoint": {
+ "type": "boolean",
+ "description": "If true, resume from the last checkpoint in checkpointsRemote (if available)",
+ "default": false
+ },
+ "hfTokenSecret": {
+ "type": "object",
+ "description": "Optional secret reference that contains a HuggingFace token",
+ "properties": {
+ "name": {
+ "type": "string",
+ "description": "The name of the secret in the cluster that contains the HuggingFace token"
+ },
+ "key": {
+ "type": "string",
+ "description": "The key in the secret that contains the HuggingFace token"
+ }
+ },
+ "default": {}
+ },
+ "verlConfig": {
+ "type": "object",
+ "description": "VeRL configurations to use"
+ }
+ },
+ "required": [
+ "finetuningImage",
+ "verlConfig"
+ ]
+}
diff --git a/workloads/llm-finetune-verl/helm/values.yaml b/workloads/llm-finetune-verl/helm/values.yaml
new file mode 100644
index 0000000..60e0837
--- /dev/null
+++ b/workloads/llm-finetune-verl/helm/values.yaml
@@ -0,0 +1,62 @@
+### General chart values ###
+finetuningImage: rocm/verl:verl-0.3.0.post0_rocm6.2_vllm0.6.3
+
+### Model ###
+# either modelRemote OR modelName must be set
+# to use a base model directly from Hugging Face, set modelName to the model identifier (e.g., "meta-llama/Llama-3.1-8B-Instruct")
+modelName: ""
+# for remote models to be loaded from MinIO, specify the path to the model in the remote bucket as modelRemote
+modelRemote: ""
+
+### Data ###
+# either dataset OR datasetRemote must be set
+# to use one of the pre-existing datasets, set dataset to the dataset identifier (e.g., "gsm8k")
+# available datasets: "full_hh_rlhf", "geo3k", "gsm8k", "hellaswag", "math_dataset"
+dataset: ""
+# for remote datasets to be loaded from MinIO, specify the path to the dataset in the remote bucket as datasetRemote
+# Note: the dataset should be processed and stored in a format compatible with VeRL (train.parquet, test.parquet)
+datasetRemote: ""
+
+# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload)
+kaiwo:
+ enabled: false
+
+# Use to add labels to the metadata of the resources created by this workload.
+labels: {}
+
+# Any imagePullSecrets to use, e.g. for pulling from private registries
+imagePullSecrets: []
+ # Example:
+ # imagePullSecrets:
+ # - "regcred"
+
+# Configure these to match the credentials in your cluster:
+bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
+bucketCredentialsSecret:
+ name: minio-credentials
+ accessKeyKey: minio-access-key
+ secretKeyKey: minio-secret-key
+
+# Resources:
+checkpointsReservedSize: 512Gi
+storageClass: mlstorage # set this to use a specific storageClass for the storage.
+finetuningGpus: 1
+memoryPerGpu: 64
+cpusPerGpu: 8
+
+### Model output path ###
+checkpointsRemote: "" # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket
+resumeFromCheckpoint: false # Set to true to resume from the last checkpoint in checkpointsRemote (if available)
+
+hfTokenSecret: {} # Optional secret reference that contains the HuggingFace token
+# Example:
+# hfTokenSecret:
+# name: hf-token
+# key: hf-token
+
+verlConfig:
+ trainer:
+ logger: ['console']
+ test_freq: 10
+ save_freq: 10
+ total_epochs: 1
diff --git a/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh b/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh
index 272f199..fe9d1a7 100644
--- a/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh
+++ b/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh
@@ -5,8 +5,14 @@ mkdir -p /workload/output
curl https://dl.min.io/client/mc/release/linux-amd64/mc -o /workload/mc
chmod +x /workload/mc
/workload/mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY}
-/workload/mc mirror --watch /workload/output/ minio-host/${BUCKET_RESULT_PATH} &
+/workload/mc mirror --watch /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" &
MINIOPID=$!
+sleep 1 # Give some time for the process to start
+# Check if the sync process started successfully
+if ! ps -p $MINIOPID > /dev/null; then
+ echo "ERROR: Sync process failed to start"
+ exit 1
+fi
OPENAI_API_BASE_URL=${OPENAI_API_BASE_URL%/}
MODEL=$(curl -s ${OPENAI_API_BASE_URL}/models | jq -r '.data[0].id')
@@ -28,5 +34,5 @@ guidellm benchmark --target $OPENAI_API_BASE_URL \
echo -e "<==========================\nBenchmarking completed"
kill $MINIOPID
wait $MINIOPID || true
-/workload/mc mirror /workload/output/ minio-host/${BUCKET_RESULT_PATH}
+/workload/mc mirror /workload/output/ minio-host/"${BUCKET_RESULT_PATH}"
echo "All data uploaded successfully"
diff --git a/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh b/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh
index 6b465c8..24db12e 100644
--- a/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh
+++ b/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh
@@ -53,4 +53,4 @@ chmod +x /minio-binaries/mc
export PATH="${PATH}:/minio-binaries/"
mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY}
-mc cp --recursive $OUTPATH minio-host/${BUCKET_RESULT_PATH}/
+mc cp --recursive $OUTPATH minio-host/"${BUCKET_RESULT_PATH}"/
diff --git a/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh b/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh
index 4518a74..ac7c098 100644
--- a/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh
+++ b/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh
@@ -15,8 +15,14 @@ chmod +x $WORKPATH/bin/mc
mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY}
# Start a background process that watches for changes and uploads them
-mc mirror --watch $WORKPATH/output/ minio-host/${BUCKET_RESULT_PATH} &
+mc mirror --watch $WORKPATH/output/ minio-host/"${BUCKET_RESULT_PATH}" &
MINIOPID=$!
+sleep 1 # Give some time for the process to start
+# Check if the sync process started successfully
+if ! ps -p $MINIOPID > /dev/null; then
+ echo "ERROR: Sync process failed to start"
+ exit 1
+fi
bash $WORKPATH/mount/minio_download_models.sh
@@ -53,5 +59,5 @@ kill $MINIOPID
wait $MINIOPID || true
# Run a final mirror command to ensure all data is uploaded
-mc mirror $WORKPATH/output/ minio-host/${BUCKET_RESULT_PATH}
+mc mirror $WORKPATH/output/ minio-host/"${BUCKET_RESULT_PATH}"
echo 'All data uploaded successfully'
diff --git a/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh b/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh
index 2cfd716..81bf005 100644
--- a/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh
+++ b/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh
@@ -3,8 +3,14 @@ mkdir -p /workload/output
curl https://dl.min.io/client/mc/release/linux-amd64/mc -o /workload/mc
chmod +x /workload/mc
/workload/mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY}
-/workload/mc mirror --watch /workload/output/ minio-host/${BUCKET_RESULT_PATH} &
+/workload/mc mirror --watch /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" &
MINIOPID=$! # Capture the PID of the mc mirror process
+sleep 1 # Give some time for the process to start
+# Check if the sync process started successfully
+if ! ps -p $MINIOPID > /dev/null; then
+ echo "ERROR: Sync process failed to start"
+ exit 1
+fi
echo "vLLM server started with PID: $SERVER_PID"
ATTEMPT=0
@@ -62,5 +68,5 @@ done
echo "Benchmarking completed"
kill $MINIOPID
wait $MINIOPID || true
-/workload/mc mirror /workload/output/ minio-host/${BUCKET_RESULT_PATH}
+/workload/mc mirror /workload/output/ minio-host/"${BUCKET_RESULT_PATH}"
echo "All data uploaded successfully"
diff --git a/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml b/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml
index ad7a277..b6ca364 100644
--- a/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml
+++ b/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml
@@ -30,10 +30,10 @@ spec:
mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY};
echo "Listing contents of the model path:";
- mc ls minio-host/{{ .Values.remoteSourceModelPath | trimSuffix "/" }}/ || echo "Model path not found!";
+ mc ls minio-host/'{{ .Values.remoteSourceModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/ || echo "Model path not found!";
echo "Copying model checkpoint to container...";
- mc cp -r minio-host/{{ .Values.remoteSourceModelPath | trimSuffix "/" }}/ /local-resources/sourcemodel || echo "Failed to copy model!";
+ mc cp -r minio-host/'{{ .Values.remoteSourceModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/ /local-resources/sourcemodel || echo "Failed to copy model!";
echo "Listing contents of /local-resources/:";
ls -la /local-resources/ || echo "Local resources directory not found!";
@@ -72,7 +72,7 @@ spec:
echo "Conversion done, syncing checkpoint artifacts to remote storage...";
mc mirror --overwrite \
- /local-resources/checkpoints/ minio-host/{{ .Values.remoteDestinationModelPath | trimSuffix "/" }}/;
+ /local-resources/checkpoints/ minio-host/'{{ .Values.remoteDestinationModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/;
echo "Done uploading. Signal to the main container that it can exit.";
diff --git a/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml b/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml
index b64d515..217e3fe 100644
--- a/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml
+++ b/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml
@@ -26,18 +26,19 @@ spec:
# Setup MinIO, Download resources:
mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY};
echo "Copying data to container...";
- mc cp -r minio-host/{{ .Values.remoteDataDirPath | trimSuffix "/" }}/{{ .Values.remoteDataNamePrefix }} /local-resources/data;
+ mc cp -r minio-host/'{{ .Values.remoteDataDirPath | trimSuffix "/" | replace "'" "'\\''" }}'/'{{ .Values.remoteDataNamePrefix }}' /local-resources/data;
echo "Copying tokenizer to container...";
- mc cp -r minio-host/{{ .Values.remoteTokenizerPath | trimSuffix "/" }}/ /local-resources/tokenizer;
+ mc cp -r minio-host/'{{ .Values.remoteTokenizerPath | trimSuffix "/" | replace "'" "'\\''" }}'/ /local-resources/tokenizer;
echo "Copying model checkpoint to container...";
- if last_ckpt=$(mc cat minio-host/{{ .Values.remoteCheckpointsPath | trimSuffix "/" }}/latest_checkpointed_iteration.txt); then
+ {{- $remotePath := printf "minio-host/'%s'/" (.Values.remoteCheckpointsPath | trimSuffix "/" | replace "'" "'\\''") }}
+          if last_ckpt=$(mc cat {{ $remotePath }}latest_checkpointed_iteration.txt); then
last_ckpt=$(printf 'iter_%07d' "$last_ckpt")
echo "Found checkpoint at iteration $last_ckpt. Downloading ..."
- mc mirror minio-host/{{ .Values.remoteCheckpointsPath | trimSuffix "/" }}/$last_ckpt/ /local-resources/basemodel/$last_ckpt
- mc cp minio-host/{{ .Values.remoteCheckpointsPath | trimSuffix "/" }}/latest_checkpointed_iteration.txt /local-resources/basemodel/latest_checkpointed_iteration.txt
+          mc mirror {{ $remotePath }}$last_ckpt/ /local-resources/basemodel/$last_ckpt
+          mc cp {{ $remotePath }}latest_checkpointed_iteration.txt /local-resources/basemodel/latest_checkpointed_iteration.txt
else
echo "No checkpoints found yet. Downloading basemodel ..."
- mc cp -r minio-host/{{ .Values.remoteBaseModelPath | trimSuffix "/" }}/ /local-resources/basemodel;
+ mc cp -r minio-host/'{{ .Values.remoteBaseModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/ /local-resources/basemodel;
fi
resources:
limits:
diff --git a/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml b/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml
index a6da3e6..7e1f0f7 100644
--- a/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml
+++ b/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml
@@ -86,10 +86,10 @@ spec:
sleep 60
done
- echo "Preprocessing done, syncing data to remote storage {{ .Values.bucketDataDir | trimSuffix "/" }}...";
- mc cp --recursive /downloads/datasets/ minio-host/{{ .Values.bucketDataDir | trimSuffix "/" }}/;
+ echo "Preprocessing done, syncing data to remote storage {{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}...";
+ mc cp --recursive /downloads/datasets/ minio-host/'{{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'/;
mc mirror --overwrite --exclude "**/.cache/*" \
- /downloads/tokenizer/ minio-host/{{ .Values.bucketTokenizersDir | trimSuffix "/" }}/;
+ /downloads/tokenizer/ minio-host/'{{ .Values.bucketTokenizersDir | trimSuffix "/" | replace "'" "'\\''" }}'/;
echo "Done uploading. Signal to the main container that it can exit.";
touch /downloads/done_uploading;
resources: