diff --git a/docker/llm-evaluation/requirements.txt b/docker/llm-evaluation/requirements.txt index ad6196a..f7ad356 100644 --- a/docker/llm-evaluation/requirements.txt +++ b/docker/llm-evaluation/requirements.txt @@ -3,5 +3,6 @@ dataclasses-json==0.6.7 evaluate==0.4.3 jsonlines==4.0.0 minio==7.2.15 +mlflow==3.1.0 openai==1.64.0 sentencepiece==0.2.0 diff --git a/docker/llm-evaluation/run_inference_and_judge_evaluation.py b/docker/llm-evaluation/run_inference_and_judge_evaluation.py index e24dcf9..86a62e0 100644 --- a/docker/llm-evaluation/run_inference_and_judge_evaluation.py +++ b/docker/llm-evaluation/run_inference_and_judge_evaluation.py @@ -89,10 +89,8 @@ async def main(args: Namespace): saved_results = [] parameters: dict = {} - llm_url_no_protocol = args.llm_base_url.removeprefix("http://").removeprefix( - "https://" - ) # the Minio python client handles protocol itself - client = get_llm_client(base_url=llm_url_no_protocol, port=args.llm_port, endpoint=args.llm_endpoint) + + client = get_llm_client(base_url=args.llm_base_url, port=args.llm_port, endpoint=args.llm_endpoint) async for inference_result in run_call_inference_container( dataset=ds, @@ -123,10 +121,7 @@ async def main(args: Namespace): logger.info(inferences_data) logger.info("Inference ran.") - judge_url_no_protocol = args.judge_base_url.removeprefix("http://").removeprefix( - "https://" - ) # the Minio python client handles protocol itself - judge_client = get_llm_client(base_url=judge_url_no_protocol, port=args.judge_port, endpoint=args.judge_endpoint) + judge_client = get_llm_client(base_url=args.judge_base_url, port=args.judge_port, endpoint=args.judge_endpoint) aggregated_judge_results = AggregatedJudgeResults( judge_results={}, diff --git a/docker/llm-evaluation/run_inference_and_metrics_evaluation.py b/docker/llm-evaluation/run_inference_and_metrics_evaluation.py index 6973182..57e4f28 100644 --- a/docker/llm-evaluation/run_inference_and_metrics_evaluation.py +++ b/docker/llm-evaluation/run_inference_and_metrics_evaluation.py @@ -14,9 +14,9 @@ from llm_evaluation.call_inference_container.call_inference_container import ( save_inference_results, ) -from llm_evaluation.metrics.run_metrics_evaluation import read_inference_data +from llm_evaluation.metrics.run_metrics_evaluation import get_bert_score_distribution_graphs, read_inference_data from llm_evaluation.metrics.run_metrics_evaluation import run as run_metrics_evaluation -from llm_evaluation.metrics.utils import save_results +from llm_evaluation.metrics.utils import log_metrics_in_mlflow, save_results async def main(args: Namespace): @@ -115,6 +115,20 @@ async def main(args: Namespace): eval_results = run_metrics_evaluation(data) + distribution_graphs = get_bert_score_distribution_graphs( + scores=eval_results.scores, + ) + + if args.mlflow_server_uri: + logger.info("Logging results to MLFlow...") + log_metrics_in_mlflow( + distribution_graphs, + eval_results.scores, + mlflow_server_uri=args.mlflow_server_uri, + mlflow_experiment_name=args.mlflow_experiment_name, + mlflow_run_name=args.mlflow_run_name, + ) + logger.info("Evaluation results:") logger.info(eval_results) diff --git a/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py b/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py index e78c078..6cc0b9d 100644 --- a/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py +++ b/docker/llm-evaluation/src/llm_evaluation/argument_parsers.py @@ -12,7 +12,11 @@ def get_inference_parser() -> ArgumentParser: parser.add_argument("-p", 
"--llm-port", type=str, default="8080", help="Port number of the LLM service.") parser.add_argument("-e", "--llm-endpoint", type=str, default="v1", help="Endpoint of the LLM service.") parser.add_argument( - "-d", "--evaluation-dataset", type=str, default="abisee/cnn_dailymail", help="Name of the evaluation dataset." + "-d", + "--evaluation-dataset-name", + type=str, + default="abisee/cnn_dailymail", + help="Name of the evaluation dataset.", ) parser.add_argument( "-v", "--evaluation-dataset-version", type=str, default="3.0.0", help="Version of the evaluation dataset." @@ -65,6 +69,24 @@ def get_inference_parser() -> ArgumentParser: default="/home/evaluation/example_prompts/example_summary_prompt.txt", help="Path to the prompt template file.", ) + parser.add_argument( + "--mlflow-server-uri", + type=str, + default="", # leave this argument empty to disable MLFlow tracking + help="MLFlow server URI for tracking.", + ) + parser.add_argument( + "--mlflow-experiment-name", + type=str, + default="llm-evaluation-experiment", + help="MLFlow experiment name for tracking.", + ) + parser.add_argument( + "--mlflow-run-name", + type=str, + default="llm-evaluation-run", + help="MLFlow run name for tracking.", + ) return parser diff --git a/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py b/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py index b5d5e19..ebdba31 100644 --- a/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py +++ b/docker/llm-evaluation/src/llm_evaluation/data/data_classes.py @@ -10,10 +10,12 @@ @dataclass_json @dataclass class EvaluationScores: - precision_bert: float - recall_bert: float - f1_bert: float - f1_list: List[float] + precision_avg_bert: float + recall_avg_bert: float + f1_avg_bert: float + precision_list_bert: List[float] + recall_list_bert: List[float] + f1_list_bert: List[float] bleu_score: float accuracy: float diff --git a/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py b/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py index 3b8ba5a..76797e4 100644 --- a/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py +++ b/docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py @@ -7,7 +7,7 @@ def compute_bertscore( predictions: List[str], references: List[str], language: str = "en" -) -> Tuple[float, float, float, List[float]]: +) -> Tuple[List[float], List[float], List[float]]: """ Computes the BERTScore for a set of predictions and references. 
@@ -32,13 +32,7 @@ def compute_bertscore(
     recall_list = convert_negatives_to_zero(array=np.array(results["recall"]))
     f1_list = convert_negatives_to_zero(array=np.array(results["f1"]))
 
-    precision_bert = round(np.average(precision_list), 4)
-    recall_bert = round(np.average(recall_list), 4)
-    f1_bert = round(np.average(f1_list), 4)
-
-    f1_list = [round(f1, 4) for f1 in f1_list]
-
-    return precision_bert, recall_bert, f1_bert, f1_list
+    return precision_list, recall_list, f1_list
 
 
 def compute_exact_match(
diff --git a/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py b/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
index 7d0985d..f2070b9 100644
--- a/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
+++ b/docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
@@ -6,6 +6,9 @@ from typing import Any, Dict, List
 
 import jsonlines
+import matplotlib.pyplot as plt
+import mlflow
+import numpy as np
 from llm_evaluation import logger
 from llm_evaluation.argument_parsers import get_metrics_parser
 from llm_evaluation.data.data_classes import EvaluationResults, EvaluationScores
@@ -28,7 +31,13 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS
 
     bert_score_start_time = time.time()
 
-    precision_bert, recall_bert, f1_bert, f1_list = compute_bertscore(predictions=predictions, references=references)
+    precision_list_bert, recall_list_bert, f1_list_bert = compute_bertscore(
+        predictions=predictions, references=references
+    )
+
+    precision_avg_bert = round(np.average(precision_list_bert), 4)
+    recall_avg_bert = round(np.average(recall_list_bert), 4)
+    f1_avg_bert = round(np.average(f1_list_bert), 4)
 
     logger.info(f"BERT-score computation took {time.time() - bert_score_start_time:.2f} seconds")
 
@@ -45,15 +54,53 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS
     logger.info(f"Exact match computation took {time.time() - exact_match_start_time:.2f} seconds")
 
     return EvaluationScores(
-        precision_bert=precision_bert,
-        recall_bert=recall_bert,
-        f1_bert=f1_bert,
-        f1_list=f1_list,
+        precision_avg_bert=precision_avg_bert,
+        recall_avg_bert=recall_avg_bert,
+        f1_avg_bert=f1_avg_bert,
+        precision_list_bert=precision_list_bert,
+        recall_list_bert=recall_list_bert,
+        f1_list_bert=f1_list_bert,
         bleu_score=bleu_score,
         accuracy=accuracy,
     )
 
 
+def get_bert_score_distribution_graphs(scores: EvaluationScores) -> Dict[str, str]:
+    """
+    Generate PNG images of the distributions of BERTScore precision, recall, and F1,
+    each with the mean value marked.
+
+    Args:
+        scores (EvaluationScores): Evaluation scores holding the per-example BERTScore
+            precision, recall, and F1 lists.
+
+    Returns:
+        dict: Dictionary with keys 'precision', 'recall', 'f1', each mapping to the
+            path of the saved PNG file.
+    """
+    results = {}
+    metrics = [
+        ("precision", scores.precision_list_bert),
+        ("recall", scores.recall_list_bert),
+        ("f1", scores.f1_list_bert),
+    ]
+    for name, values in metrics:
+        fig, ax = plt.subplots()
+        values = np.array(values)
+        mean_val = np.mean(values)
+        ax.hist(values, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
+        ax.axvline(mean_val, color="red", linestyle="dashed", linewidth=2, label=f"Mean: {mean_val:.4f}")
+        ax.set_title(f"BERTScore {name.capitalize()} Distribution")
+        ax.set_xlabel(name.capitalize())
+        ax.set_ylabel("Frequency")
+        ax.legend()
+        plt.tight_layout()
+        plt.savefig(f"{name}_distribution.png", format="png")
+        plt.close(fig)
+        results[name] = f"{name}_distribution.png"
+    return results
+
+
 def read_inference_data(input_path: str) -> List[Dict[str, Any]]:
     """
     Reads inference data from a file or directory containing JSON/JSONL files.
diff --git a/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py b/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
index 23fa6f2..84c917f 100644
--- a/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
+++ b/docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
@@ -4,13 +4,14 @@
 from typing import Any, Dict, List
 
 import jsonlines
+import mlflow
+import numpy as np
 from llm_evaluation import logger
 from llm_evaluation.data.data_classes import AggregatedJudgeResults, EvaluationResults
 from minio import Minio, S3Error
-from numpy import ndarray
 
 
-def convert_negatives_to_zero(array: ndarray) -> ndarray:
+def convert_negatives_to_zero(array: np.ndarray) -> np.ndarray:
     """Converts all negative values in an array to zero.
 
     Args:
@@ -129,3 +130,40 @@ def read_jsonl_data(input_file_path: str) -> List[Dict[str, Any]]:
         for line in reader.iter(type=dict, skip_invalid=True):
             generations.append(line)
     return generations
+
+
+def log_metrics_in_mlflow(distribution_graphs, scores, mlflow_server_uri, mlflow_experiment_name, mlflow_run_name):
+
+    logger.info(f"Using MLflow tracking URI: {mlflow_server_uri}")
+
+    experiment_description = "Evaluation of LLM using BERTScore metric."
+
+    experiment_tags = {
+        "project_name": mlflow_experiment_name,
+        "mlflow.note.content": experiment_description,
+    }
+
+    client = mlflow.MlflowClient(tracking_uri=mlflow_server_uri)
+
+    # Create the Experiment, providing a unique name
+    try:
+        test_experiment = client.create_experiment(name=mlflow_experiment_name, tags=experiment_tags)
+        logger.info(f"Created experiment with ID: {test_experiment}")
+    except mlflow.exceptions.MlflowException:
+        # If the experiment already exists, retrieve its ID
+        logger.warning(f"Experiment '{mlflow_experiment_name}' already exists. Using existing experiment.")
+        test_experiment = client.get_experiment_by_name(mlflow_experiment_name).experiment_id
+        logger.info(f"Using existing experiment with ID: {test_experiment}")
+
+    mlflow.set_tracking_uri(mlflow_server_uri)
+    mlflow.set_experiment(experiment_name=mlflow_experiment_name)
+    with mlflow.start_run(run_name=mlflow_run_name, experiment_id=test_experiment) as run:
+        # Log each average score once, under its own metric name
+        mlflow.log_metric("bert_score_mean_precision", scores.precision_avg_bert)
+        mlflow.log_metric("bert_score_mean_recall", scores.recall_avg_bert)
+        mlflow.log_metric("bert_score_mean_f1", scores.f1_avg_bert)
+        for name, file in distribution_graphs.items():
+            logger.info(
+                f"Saving artifact {file} (abs path: {os.path.abspath(file)}) to MLflow run {run.info.run_id}..."
+ ) + mlflow.log_artifact(os.path.abspath(file), artifact_path="metrics_distributions") diff --git a/docker/logistics/requirements.txt b/docker/logistics/requirements.txt index 7405971..46ecaaa 100644 --- a/docker/logistics/requirements.txt +++ b/docker/logistics/requirements.txt @@ -4,3 +4,4 @@ google-cloud-storage hf_transfer huggingface_hub[cli] minio +wandb diff --git a/docs/contributing.md b/docs/contributing.md index 870eaff..94e1f5e 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -18,6 +18,56 @@ Thank you for considering contributing to the SiloGen AI Workloads development! # install packages you need ``` +### Pre-commit setup + +We use [pre-commit](https://pre-commit.com/) for consistent formatting and cleaner code. Hooks are specified in `ai-workloads-dev/.pre-commit-config.yaml`. + +To install:
+```bash
+cd ai-workloads-dev  # pre-commit install must be run inside the target git repository
+source your_venv/bin/activate
+pip install pre-commit
+pre-commit install --config .pre-commit-config.yaml
+git commit -m "test commit"
+```
+ +With the final command, pre-commit should run automatically, with output something like the following: + + >check json...........................................(no files to check)Skipped
+ check yaml...........................................(no files to check)Skipped
+ fix end of files.....................................(no files to check)Skipped
+ fix requirements.txt.................................(no files to check)Skipped
+ trim trailing whitespace.............................(no files to check)Skipped
+ black................................................(no files to check)Skipped
+ flake8...............................................(no files to check)Skipped
+ isort (python).......................................(no files to check)Skipped
+ mypy.................................................(no files to check)Skipped
+ helmlint.............................................(no files to check)Skipped
+ +It's also possible to manually run pre-commit using + +`pre-commit run --all-files` + +#### Troubleshooting pre-commit + +Many pre-commit bugs come from having an incorrect version of pre-commit active. Pre-commit can hang around as a system-wide version, in python venvs, or in your pre-commit cache. + + It's easiest to use pre-commit as part of a python virtual environment. To check that the right pre-commit is being found, run `which pre-commit` and confirm that the binaries inside your venv are shown. For example: `/../../venvs/your_venv/bin/pre-commit`. A different path could indicate that your system is choosing the wrong pre-commit install. + + +From system: +`brew uninstall pre-commit` (mac) +`sudo apt remove pre-commit` (linux) + +From venv: +`pip uninstall pre-commit` + +Just the pre-commit hooks uninstall: +`pre-commit uninstall` +`pre-commit clean` + + +Then reinstall pre-commit from scratch as described above. + + ## Development Workflow 1. Create a branch for your feature or bugfix: diff --git a/workloads/dev-workspace-jupyterlab/helm/values.yaml b/workloads/dev-workspace-jupyterlab/helm/values.yaml index 2ac6519..9c238b6 100644 --- a/workloads/dev-workspace-jupyterlab/helm/values.yaml +++ b/workloads/dev-workspace-jupyterlab/helm/values.yaml @@ -47,8 +47,17 @@ entrypoint: | pip install pipx ipykernel pipx install --include-deps jupyter pipx inject --include-deps jupyter jupyterlab-lsp 'python-lsp-server[all]' ipywidgets jupyterlab-git jupyterlab_code_formatter - python -m ipykernel install --user --name=default-python3 - jupyter-lab --ServerApp.token='' --ServerApp.ip='0.0.0.0' --ServerApp.allow_root=True --ServerApp.base_url=$BASE_URL --no-browser --ServerApp.root_dir='/workload' + python -m ipykernel install --user --name=default-python3 --display-name="Python 3 (default)" + + jupyter-lab --no-browser \ + --IdentityProvider.token='' \ + --ServerApp.ip='0.0.0.0' \ + --ServerApp.allow_root=True \ + --ServerApp.base_url=$BASE_URL \ + --ServerApp.root_dir='/workload' \ + --MultiKernelManager.default_kernel_name=default-python3 \ + --KernelSpecManager.allowed_kernelspecs=default-python3 \ + --KernelSpecManager.ensure_native_kernel=False # kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload) kaiwo: diff --git a/workloads/download-data-to-bucket/helm/templates/job.yaml b/workloads/download-data-to-bucket/helm/templates/job.yaml index f232a17..2dd6cc7 100644 --- a/workloads/download-data-to-bucket/helm/templates/job.yaml +++ b/workloads/download-data-to-bucket/helm/templates/job.yaml @@ -33,8 +33,8 @@ spec: mkdir -p /downloads/datasets python /scripts/data_script.py ######################## - echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" }}' - mc cp -recursive /downloads/datasets/ minio-host/{{ .Values.bucketDataDir | trimSuffix "/" }}/ + echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}' + mc cp -recursive /downloads/datasets/ minio-host/'{{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'/ ######################## echo 'Done' env: diff --git a/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml b/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml index 7514746..b91d2b9 100644 --- a/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml +++ b/workloads/download-huggingface-model-to-bucket/helm/templates/job.yaml @@ -47,13 +47,14 @@ spec: {{- end }} --local-dir 
local_models/downloaded_model
           ###################################
-          echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" }}'
+          echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" | replace "'" "'\\''" }}'
+          {{- $remotePath := printf "minio-host/'%s'/" (.Values.bucketPath | trimSuffix "/" | replace "'" "'\\''") }}
           mc mirror --exclude '.cache/huggingface/*' \
                     --exclude '.gitattributes' \
           {{- if .Values.allowOverwrite }}
                     --overwrite \
           {{- end }}
-                    local_models/downloaded_model/ minio-host/{{ .Values.bucketPath | trimSuffix "/" }}
+                    local_models/downloaded_model/ {{ $remotePath }}
         env:
         {{- if .Values.hfTokenSecret }}
         - name: HF_TOKEN
diff --git a/workloads/download-wandb-model-to-bucket/helm/Chart.yaml b/workloads/download-wandb-model-to-bucket/helm/Chart.yaml
new file mode 100644
index 0000000..557e352
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/Chart.yaml
@@ -0,0 +1,4 @@
+apiVersion: v2
+name: download-wandb-model-to-bucket
+description: A Helm chart for downloading a Weights and Biases model to a bucket
+version: 0.0.1
diff --git a/workloads/download-wandb-model-to-bucket/helm/README.md b/workloads/download-wandb-model-to-bucket/helm/README.md
new file mode 100644
index 0000000..1d6519c
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/README.md
@@ -0,0 +1,14 @@
+# Download a model from Weights and Biases to bucket storage
+
+This is a workload that downloads a model from Weights and Biases and uploads it to bucket storage.
+
+Run example:
+```bash
+helm template "dl-from-wandb" workloads/download-wandb-model-to-bucket/helm \
+  -f workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml \
+  | kubectl create -f -
+```
+
+## User input values
+
+See the `values.yaml` file for the user input values that you can provide, with instructions.
diff --git a/workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml b/workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml
new file mode 100644
index 0000000..337ad4d
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/overrides/example-model-to-minio.yaml
@@ -0,0 +1,18 @@
+# Which model to download
+artifactPath: test-proj-1/test-model-2
+
+# Where the resources should be stored:
+bucketPath: default-bucket/models/examples/tiny-random-test-model-2
+bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
+
+# Download & Upload configuration:
+allowOverwrite: false
+
+# Storage configuration:
+storageClass: mlstorage
+storageQuantity: "20Gi"
+
+# Weights and Biases token:
+wandbTokenSecret:
+  name: wandb-token
+  key: wandb-token
diff --git a/workloads/download-wandb-model-to-bucket/helm/templates/job.yaml b/workloads/download-wandb-model-to-bucket/helm/templates/job.yaml
new file mode 100644
index 0000000..52bd8f8
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/templates/job.yaml
@@ -0,0 +1,103 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "{{ .Release.Name }}-job"
+  {{- if .Values.labels }}
+  labels:
+    {{- range $label, $value := .Values.labels }}
+    {{ $label }}: {{ $value | quote }}
+    {{- end }}
+  {{- end }}
+spec:
+  ttlSecondsAfterFinished: 3600
+  backoffLimit: 0
+  template:
+    spec:
+      restartPolicy: Never
+      {{- if .Values.imagePullSecrets }}
+      imagePullSecrets:
+      {{- range .Values.imagePullSecrets }}
+      - name: {{ . }}
+      {{- end }}
+      {{- end }}
+      containers:
+      - name: wandb-to-bucket
+        image: {{ .Values.image }}
+        imagePullPolicy: Always
+        workingDir: /app
+        command:
+        - sh
+        - -e
+        - -u
+        - -c
+        args:
+        - |
+          ###################################
+          echo 'Setting up minio'
+          mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY}
+          ###################################
+          echo 'Downloading the artifact from wandb to the container'
+          {{- $safeArtifactPath := printf "'%s'" (.Values.artifactPath | replace "'" "'\\''") }}
+          wandb artifact get --type {{ .Values.artifactType }} {{ $safeArtifactPath }} --root local_artifact
+          ###################################
+          echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" | replace "'" "'\\''" }}'
+          {{- $remotePath := printf "minio-host/'%s'/" (.Values.bucketPath | trimSuffix "/" | replace "'" "'\\''") }}
+          mc mirror \
+          {{- if .Values.allowOverwrite }}
+            --overwrite \
+          {{- end }}
+            local_artifact/ {{ $remotePath }}
+        env:
+        - name: WANDB_API_KEY
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.wandbTokenSecret.name }}
+              key: {{ .Values.wandbTokenSecret.key }}
+        - name: BUCKET_STORAGE_HOST
+          value: {{ .Values.bucketStorageHost }}
+        - name: BUCKET_STORAGE_ACCESS_KEY
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.bucketCredentialsSecret.name }}
+              key: {{ .Values.bucketCredentialsSecret.accessKeyKey }}
+        - name: BUCKET_STORAGE_SECRET_KEY
+          valueFrom:
+            secretKeyRef:
+              name: {{ .Values.bucketCredentialsSecret.name }}
+              key: {{ .Values.bucketCredentialsSecret.secretKeyKey }}
+        resources:
+          requests:
+            memory: 1Gi
+            cpu: 1
+          limits:
+            memory: 1Gi
+            cpu: 1
+        volumeMounts:
+        - mountPath: /app
+          name: {{ .Release.Name }}-volume
+        securityContext:
+          allowPrivilegeEscalation: false
+          runAsNonRoot: true
+          runAsUser: 1000
+          runAsGroup: 1000
+          seccompProfile:
+            type: RuntimeDefault
+          capabilities:
+            drop: ["ALL"]
+      securityContext:
+        fsGroup: 1000
+      volumes:
+      - name: {{ .Release.Name }}-volume
+        {{- if .Values.storageClass }}
+        ephemeral:
+          volumeClaimTemplate:
+            spec:
+              accessModes: [ "ReadWriteOnce" ]
+              storageClassName: {{ .Values.storageClass }}
+              resources:
+                requests:
+                  storage: "{{ .Values.storageQuantity }}"
+        {{- else }}
+        emptyDir:
+          sizeLimit: "{{ .Values.storageQuantity }}"
+        {{- end }}
diff --git a/workloads/download-wandb-model-to-bucket/helm/values.yaml b/workloads/download-wandb-model-to-bucket/helm/values.yaml
new file mode 100644
index 0000000..993dd0e
--- /dev/null
+++ b/workloads/download-wandb-model-to-bucket/helm/values.yaml
@@ -0,0 +1,34 @@
+### General chart values ###
+image: ghcr.io/silogen/logistics:v0.2
+
+# Use to add labels to the metadata of the resources created by this workload.
+labels: {}
+  # Example:
+  # labels:
+  #   kaiwo.silogen.ai/managed: "true"
+
+# Extra settings, such as imagePullSecrets
+imagePullSecrets: []
+  # Example:
+  # imagePullSecrets:
+  #   - "regcred"
+
+# Configure these to match the credentials in your cluster:
+bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
+bucketCredentialsSecret:
+  name: minio-credentials
+  accessKeyKey: minio-access-key
+  secretKeyKey: minio-secret-key
+
+# Secret reference that contains the Weights and Biases token
+wandbTokenSecret:
+  name: wandb-token
+  key: wandb-token
+
+# Inputs:
+artifactPath: "" # wandb artifact path, in the format project/artifact-name
+artifactType: model # wandb artifact type, e.g. model or dataset
+bucketPath: "" # Path in the bucket storage where this model should be stored. In the format bucket-name/path/separated/by/slashes/name-for-resulting-directory
+allowOverwrite: false # Optionally set to true to allow overwriting existing files in the bucket
+storageQuantity: 64Gi # How much space needs to be allocated to store the model in the container (before pushing to bucket storage).
+storageClass: mlstorage # Set this to use a specific storageClass for the storage. If not specified, will simply use an ephemeral_storage request.
diff --git a/workloads/llm-evaluation-judge/helm/overrides/prometheus-Qwen2_5_3B_instruct-cnn_dailymail.yaml b/workloads/llm-evaluation-judge/helm/overrides/Qwen2_5_3B_instruct-llama-3.2-3B.yaml
similarity index 75%
rename from workloads/llm-evaluation-judge/helm/overrides/prometheus-Qwen2_5_3B_instruct-cnn_dailymail.yaml
rename to workloads/llm-evaluation-judge/helm/overrides/Qwen2_5_3B_instruct-llama-3.2-3B.yaml
index 85baf06..b6b91c5 100644
--- a/workloads/llm-evaluation-judge/helm/overrides/prometheus-Qwen2_5_3B_instruct-cnn_dailymail.yaml
+++ b/workloads/llm-evaluation-judge/helm/overrides/Qwen2_5_3B_instruct-llama-3.2-3B.yaml
@@ -1,4 +1,4 @@
-# Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset
+# Overrides file for running judge evaluation, using Llama-3.2-3B-Instruct to judge Qwen2.5-3B-Instruct on the default dataset
 general:
   job_name: judge-job-3container-qwen
 model_inference_container:
diff --git a/workloads/llm-evaluation-judge/helm/overrides/llama-3.2-3B.yaml b/workloads/llm-evaluation-judge/helm/overrides/llama-3.2-3B.yaml
new file mode 100644
index 0000000..3f48ea0
--- /dev/null
+++ b/workloads/llm-evaluation-judge/helm/overrides/llama-3.2-3B.yaml
@@ -0,0 +1,14 @@
+# Overrides file for running judge evaluation, using Llama-3.2-3B-Instruct to judge Llama-3.2-3B-Instruct on the default dataset
+general:
+  job_name: judge-job-s3-llama-3.2-3B
+model_inference_container:
+  image: rocm/vllm-dev:nightly_main_20250430
+  model: Llama-3.2-3B-Instruct
+  model_path: hf://meta-llama/Llama-3.2-3B-Instruct
+judge_inference_container:
+  image: rocm/vllm-dev:nightly_main_20250430
+  model: Llama-3.2-3B-Instruct
+  model_path: s3://default-bucket/models/meta-llama/Llama-3.2-3B-Instruct
+judge_evaluation_container:
+  image: ghcr.io/silogen/evaluation-workloads-metrics:v0.1
+  use_data_subset: 0
diff --git a/workloads/llm-evaluation-judge/helm/overrides/prometheus-llama_3_8b-cnn_dailymail.yaml b/workloads/llm-evaluation-judge/helm/overrides/prometheus-llama_3_8b-cnn_dailymail.yaml
deleted file mode 100644
index 4738fcf..0000000
--- a/workloads/llm-evaluation-judge/helm/overrides/prometheus-llama_3_8b-cnn_dailymail.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-# Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset
-general:
-  job_name: judge-job-minio
-model_inference_container:
-  image: rocm/vllm-dev:20241205-tuned
-  model: llama-3.2-3B
-  model_path: hf://meta-llama/Llama-3.2-3B-Instruct
-judge_inference_container:
-  model: llama-3.2-3B
-  model_path: s3://default-bucket/models/meta-llama/Llama-3.2-3B-Instruct
-judge_evaluation_container:
-  image: ghcr.io/silogen/evaluation-workloads-metrics:v0.1
-  use_data_subset: 5
diff --git a/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml b/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml
index 7d15fec..6c41581 100644
---
a/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml +++ b/workloads/llm-evaluation-judge/helm/templates/evaluation_judge_template.yaml @@ -144,16 +144,6 @@ spec: requests: memory: "{{ .Values.judge_evaluation_container.memory }}" cpu: "{{ .Values.judge_evaluation_container.cpu_count }}" - startupProbe: - exec: - command: - - sh - - -c - - | - curl -sf http://localhost:8080/health && curl -sf http://localhost:8081/health - initialDelaySeconds: 60 - periodSeconds: 10 - failureThreshold: 30 command: ["sh", "-c"] args: - | diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml index b2d0fc2..52cb1b1 100644 --- a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml +++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.1-8B_billsum_values.yaml @@ -1,5 +1,5 @@ model_inference_container: - image: rocm/vllm-dev:20241205-tuned + image: rocm/vllm-dev:nightly_main_20250430 evaluation_container: image: ghcr.io/silogen/evaluation-workloads-metrics:v0.1 dataset_path: FiscalNote/billsum diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml index 2f220e0..ab463b3 100644 --- a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml +++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_billsum_values.yaml @@ -1,6 +1,6 @@ # Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset model_inference_container: - image: rocm/vllm-dev:20241205-tuned + image: rocm/vllm-dev:nightly_main_20250430 model: Llama-3.2-3B-Instruct model_path: meta-llama/Llama-3.2-3B-Instruct evaluation_container: diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml index 85de2bd..3e39cd3 100644 --- a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml +++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-Instruct_cnn-dailymail_values.yaml @@ -1,6 +1,6 @@ # Values file for running bertscore evaluation for Llama-3.1-8B on the cnn-dailymail summarization dataset model_inference_container: - image: rocm/vllm-dev:20241205-tuned + image: rocm/vllm-dev:nightly_main_20250430 model: Llama-3.2-3B-Instruct model_path: meta-llama/Llama-3.2-3B-Instruct evaluation_container: diff --git a/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml new file mode 100644 index 0000000..97d79b8 --- /dev/null +++ b/workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml @@ -0,0 +1,12 @@ +model_inference_container: + image: rocm/vllm-dev:nightly_main_20250430 + model: Llama-3.2-3B-Instruct + model_path: meta-llama/Llama-3.2-3B-Instruct +evaluation_container: + image: ghcr.io/silogen/evaluation-workloads-metrics-debug:v0.1 + use_data_subset: 3 +storage: + mlflow: + server_uri: http://10.242.3.71:8082 + experiment_name: metrics-demo-experiment + run_name: metrics-demo-run 
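+
+# A minimal usage sketch (hypothetical release name; the server_uri above is an
+# environment-specific assumption), mirroring the run examples elsewhere in this repo:
+#
+#   helm template "metrics-mlflow" workloads/llm-evaluation-metrics/helm \
+#     -f workloads/llm-evaluation-metrics/helm/overrides/bertscore_llama-3.2-3B-cnn-mlflow.yaml \
+#     | kubectl create -f -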
diff --git a/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml b/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml index a4979d8..a4178b6 100644 --- a/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml +++ b/workloads/llm-evaluation-metrics/helm/templates/metrics_evaluation_template_with_download.yaml @@ -83,7 +83,7 @@ spec: echo "Running evaluation:\nDownloading Dataset, Running inference, Evaluating inferences with bertscore..."; python3 run_inference_and_metrics_evaluation.py \ --llm-base-url="http://localhost" \ - --evaluation-dataset="{{ .Values.evaluation_container.dataset_path }}" \ + --evaluation-dataset-name="{{ .Values.evaluation_container.dataset_path }}" \ --evaluation-dataset-version="{{ .Values.evaluation_container.dataset_version }}" \ --dataset-split="{{ .Values.evaluation_container.dataset_split }}" \ --prompt-template-path="{{ .Values.evaluation_container.prompt_template_path }}" \ @@ -95,7 +95,10 @@ spec: --context-column-name="{{ .Values.evaluation_container.dataset_info.context_column_name}}" \ --id-column-name="{{ .Values.evaluation_container.dataset_info.id_column_name}}" \ --gold-standard-column-name="{{ .Values.evaluation_container.dataset_info.gold_standard_column_name}}" \ - --use-data-subset="{{ .Values.evaluation_container.use_data_subset}}" ; + --use-data-subset="{{ .Values.evaluation_container.use_data_subset}}" \ + --mlflow-server-uri="{{ .Values.storage.mlflow.server_uri }}" \ + --mlflow-experiment-name="{{ .Values.storage.mlflow.experiment_name }}" \ + --mlflow-run-name="{{ .Values.storage.mlflow.run_name }}" ; env: - name: TRANSFORMERS_CACHE value: /HF_HOME diff --git a/workloads/llm-evaluation-metrics/helm/values.yaml b/workloads/llm-evaluation-metrics/helm/values.yaml index 7338287..e146c0e 100644 --- a/workloads/llm-evaluation-metrics/helm/values.yaml +++ b/workloads/llm-evaluation-metrics/helm/values.yaml @@ -33,3 +33,7 @@ storage: - ReadWriteOnce bucket_storage_host: minio.minio-tenant-default.svc.cluster.local:80 bucket_storage_bucket: default-bucket + mlflow: + server_uri: http://10.242.3.198:8082 + experiment_name: mlflow-experiment + run_name: mlflow-run diff --git a/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl b/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl index d1b22f6..e27a123 100644 --- a/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl +++ b/workloads/llm-finetune-axolotl/helm/templates/_helpers.tpl @@ -3,9 +3,10 @@ # Setup MinIO mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY} # Sync checkpoints from remote to local +{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} {{- if .Values.checkpointsRemote }} -if mc mirror minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ /workdir/checkpoints 2>/dev/null; then - echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote}} to /workdir/checkpoints' +if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then + echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''"}} to /workdir/checkpoints' ls -lah /workdir/checkpoints else echo 'No checkpoints found yet' @@ -17,12 +18,13 @@ fi {{- define "finetuningAndUploadEntrypoint" -}} # Print GPU Info: rocm-smi +{{- $checkpointsRemotePath := printf "minio-host/'%s'/" 
(.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }}
 {{- if .Values.checkpointsRemote }}
 echo "Starting checkpoint sync process"
 mc mirror \
   --watch \
   /workdir/checkpoints \
-  minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ &
+  {{ $checkpointsRemotePath }} &
 uploadPID=$!
 {{- end }}
 # Run training:
@@ -36,7 +38,7 @@
 wait $uploadPID || true
 echo 'Training done, syncing once more...'
 mc mirror \
   /workdir/checkpoints \
-  minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/
+  {{ $checkpointsRemotePath }}
 {{- end }}
 echo 'All done, exiting'
 {{- end }}
diff --git a/workloads/llm-finetune-axolotl/helm/values.yaml b/workloads/llm-finetune-axolotl/helm/values.yaml
index 3ed333d..9f3234e 100644
--- a/workloads/llm-finetune-axolotl/helm/values.yaml
+++ b/workloads/llm-finetune-axolotl/helm/values.yaml
@@ -27,4 +27,4 @@ finetuningGpus: 1
 configFile: # name of config file to use, include the file in the mount/ directory
 
 ### Model output path ###
-checkpointsRemote: # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket)
+checkpointsRemote: "" # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket
diff --git a/workloads/llm-finetune-llama-factory/helm/README.md b/workloads/llm-finetune-llama-factory/helm/README.md
index d213a5b..a495741 100644
--- a/workloads/llm-finetune-llama-factory/helm/README.md
+++ b/workloads/llm-finetune-llama-factory/helm/README.md
@@ -2,7 +2,6 @@
 
 This is a Helm Chart for running a finetuning job using [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory)
 
-Currently the base model and input data are assumed to be from HuggingFace, or some other source directly supported by LLaMA-Factory.
 The output is saved with MinIO in the directory specified by `checkpointsRemote`.
 
 ## Configuration
@@ -22,6 +21,22 @@ helm template workloads/llm-finetune-llama-factory/helm \
   | kubectl create -f -
 ```
 
+## Data specification
+
+Specify the name of the dataset used for training as `dataset`. This can include datasets predefined in LLaMA-Factory or those defined in `datasetInfo`. Use commas to separate multiple datasets.
+
+To use other datasets, create an entry in `datasetInfo` following the [LLaMA-Factory dataset info format](https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md). Note that LLaMA-Factory directly supports loading datasets from HuggingFace, ModelScope, or S3/GCS cloud storage by setting the URLs according to the documentation.
+
+This workload adds a custom way to load data from MinIO. In `datasetInfo`, specify the path to the dataset in the remote bucket as `pathRemote`, and the workload will load the file and update the configuration. See the override file [`overrides/finetune-model_data_from_minio.yaml`](overrides/finetune-model_data_from_minio.yaml) for an example of finetuning where the data and model are loaded from MinIO.
+
+## Model specification
+
+To use a base model from HuggingFace or another source directly supported by LLaMA-Factory, specify the model name in `modelName`.
+
+Alternatively, to use a model from MinIO, specify the path to the model in `modelRemote`.
+
+Either `modelName` or `modelRemote` must be specified. If both are included, the model from `modelRemote` is used.
+
 ## Cleanup
 
 After the jobs are completed, please delete the resources created. In particular for multi-node ray jobs, a `PersistentVolumeClaim` is used as shared storage and persists on the cluster after the job is completed.
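+
+As a sketch, the same rendered manifests can be piped to `kubectl delete` (assuming the chart path and overrides file used at creation time; adjust to your setup):
+
+```bash
+helm template workloads/llm-finetune-llama-factory/helm \
+    -f workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml \
+    | kubectl delete -f -
+```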
@@ -37,7 +52,7 @@ helm template workloads/llm-finetune-llama-factory/helm \
 
 ## Multi-node finetuning with ray
 
-The chart supports multi-node jobs by setting `nodes` to an integer greater than 1. Doing so enables ray and creates a RayJob instead. An example config is provided in [`overrides/finetune-lora-ray.yaml`](overrides/finetune-lora-ray.yaml)
+The chart supports multi-node jobs by setting `nodes` to an integer greater than 1. Doing so enables ray and creates a RayJob instead. An example config is provided in [`overrides/finetune-lora-ray.yaml`](overrides/finetune-lora-ray.yaml). The example also shows how to use [DeepSpeed ZeRO Stage 2](https://deepspeed.readthedocs.io/en/latest/zero3.html) to partition the gradients. To enable DeepSpeed, set the `deepspeed` parameter in the LLaMA-Factory config to point to one of the [deepspeed configs](https://github.com/hiyouga/LLaMA-Factory/tree/main/examples/deepspeed) included in LLaMA-Factory or a dictionary.
 
 When configuring ray jobs, the resources you are requesting (`nodes` and `gpusPerNode`) are automatically specified for LLaMA-Factory, and do not need to be included separately in the `llamaFactoryConfig`.
diff --git a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml
index 7dd542b..c2835c7 100644
--- a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora-ray.yaml
@@ -1,12 +1,18 @@
+### Model ###
+modelName: meta-llama/Llama-3.1-8B-Instruct
+
+### Data ###
+dataset: identity,alpaca_en_demo
+
+### Model output path ###
+checkpointsRemote: "default-bucket/experiments/llama3-8b-llama-factory-lora"
+
 # Resources:
 checkpointsReservedSize: 10Gi
 nodes: 2
 gpusPerNode: 1
 memoryPerNode: 32Gi
 
-### Model output path ###
-checkpointsRemote: "default-bucket/experiments/llama3-8b-llama-factory-lora"
-
 hfTokenSecret:
   name: hf-token
   key: hf-token
@@ -15,7 +21,6 @@ hfTokenSecret:
 ### this example adapted from https://github.com/hiyouga/LLaMA-Factory/blob/main/examples/train_lora/llama3_lora_sft_ray.yaml
 llamaFactoryConfig:
   ### model
-  model_name_or_path: meta-llama/Llama-3.1-8B-Instruct # or use local absolute path
   trust_remote_code: true
 
   ### method
@@ -24,10 +29,9 @@ llamaFactoryConfig:
   finetuning_type: lora
   lora_rank: 8
   lora_target: all
+  deepspeed: /workspace/LLaMA-Factory/examples/deepspeed/ds_z2_config.json # choices: [ds_z0_config.json, ds_z2_config.json, ds_z3_config.json]
 
   ### dataset
-  dataset: identity,alpaca_en_demo
-  dataset_dir: REMOTE:llamafactory/demo_data # or use local absolute path
   template: llama3
   cutoff_len: 2048
   max_samples: 1000
@@ -62,11 +66,3 @@ llamaFactoryConfig:
   warmup_ratio: 0.1
   bf16: true
   ddp_timeout: 180000000
-  resume_from_checkpoint: null
-
-  ### eval
-  # eval_dataset: alpaca_en_demo
-  # val_size: 0.1
-  # per_device_eval_batch_size: 1
-  # eval_strategy: steps
-  # eval_steps: 500
diff --git a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml
index 1bf8a3f..e7f5d83 100644
--- a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-lora.yaml
@@ -1,9 +1,15 @@
-# Resources:
-checkpointsReservedSize: 10Gi
+### Model ###
+modelName: meta-llama/Llama-3.1-8B-Instruct
+
+### Data ###
+dataset: identity,alpaca_en_demo
 
 ### Model output path ###
 checkpointsRemote:
"default-bucket/experiments/llama3-8b-llama-factory-lora" +# Resources: +checkpointsReservedSize: 10Gi + hfTokenSecret: name: hf-token key: hf-token @@ -12,7 +18,6 @@ hfTokenSecret: ### this example from https://github.com/hiyouga/LLaMA-Factory/blob/main/examples/train_lora/llama3_lora_sft.yaml llamaFactoryConfig: ### model - model_name_or_path: meta-llama/Llama-3.1-8B-Instruct trust_remote_code: true ### method @@ -23,8 +28,6 @@ llamaFactoryConfig: lora_target: all ### dataset - dataset: identity,alpaca_en_demo - dataset_dir: REMOTE:llamafactory/demo_data # or use local absolute path template: llama3 cutoff_len: 2048 max_samples: 1000 @@ -49,11 +52,3 @@ llamaFactoryConfig: warmup_ratio: 0.1 bf16: true ddp_timeout: 180000000 - resume_from_checkpoint: null - - ### eval - # eval_dataset: alpaca_en_demo - # val_size: 0.1 - # per_device_eval_batch_size: 1 - # eval_strategy: steps - # eval_steps: 500 diff --git a/workloads/llm-finetune-llama-factory/helm/overrides/finetune-model_data_from_minio.yaml b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-model_data_from_minio.yaml new file mode 100644 index 0000000..9a70ec3 --- /dev/null +++ b/workloads/llm-finetune-llama-factory/helm/overrides/finetune-model_data_from_minio.yaml @@ -0,0 +1,64 @@ +### Model ### +modelRemote: "default-bucket/models/tiny-llama/tinyllama-1.1b-chat-v1.0" + +### Data ### +# list datasets to use, can include datasets predefined in LLaMA-Factory or those defined in datasetInfo +dataset: argilla +# for remote datasets to be loaded from MinIO, specify the path to the dataset in the remote bucket as pathRemote +datasetInfo: + argilla: + pathRemote: "default-bucket/datasets/argilla-mistral-large-human-prompts.jsonl" + formatting: sharegpt + columns: + messages: "messages" + tags: + role_tag: "role" + content_tag: "content" + user_tag: "user" + assistant_tag: "assistant" + system_tag: "system" + +### Model output path ### +checkpointsRemote: "default-bucket/experiments/tinyllama-argilla-llama-factory-lora" +resumeFromCheckpoint: true + +# Resources: +checkpointsReservedSize: 10Gi + +### llama-factory config ### +llamaFactoryConfig: + ### model + trust_remote_code: true + + ### method + stage: sft + do_train: true + finetuning_type: lora + lora_rank: 8 + lora_target: all + + ### dataset + template: llama2 + cutoff_len: 8192 + max_samples: 1000 + overwrite_cache: true + preprocessing_num_workers: 16 + dataloader_num_workers: 4 + + ### output + logging_steps: 10 + save_steps: 500 + plot_loss: true + overwrite_output_dir: true + save_only_model: false + report_to: none # choices: [none, wandb, tensorboard, swanlab, mlflow] + + ### train + per_device_train_batch_size: 1 + gradient_accumulation_steps: 8 + learning_rate: 1.0e-4 + num_train_epochs: 3.0 + lr_scheduler_type: cosine + warmup_ratio: 0.1 + bf16: true + ddp_timeout: 180000000 diff --git a/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl b/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl index 645aceb..69f9319 100644 --- a/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl +++ b/workloads/llm-finetune-llama-factory/helm/templates/_helpers.tpl @@ -88,4 +88,8 @@ spec: mode: 0777 - key: llama_factory_config.yaml path: llama_factory_config.yaml + {{- if .Values.datasetInfo }} + - key: remote_dataset_info.json + path: remote_dataset_info.json + {{- end }} {{- end }} diff --git a/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml b/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml index 
f75cf3b..7d384ea 100644 --- a/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml +++ b/workloads/llm-finetune-llama-factory/helm/templates/configmap.yaml @@ -4,6 +4,13 @@ metadata: name: "{{ .Release.Name }}-configs" data: llama_factory_config.yaml: | + {{- if .Values.modelRemote }} + model_name_or_path: /workdir/basemodel + {{- else }} + model_name_or_path: "{{ .Values.modelName }}" + {{- end }} + dataset: {{ .Values.dataset }} + dataset_dir: /workspace/LLaMA-Factory/data output_dir: /workdir/checkpoints {{- if ne (int $.Values.nodes) 1 }} ray_run_name: "{{ .Release.Name }}" @@ -13,35 +20,74 @@ data: GPU: {{ .Values.gpusPerNode }} {{- end }} {{ toYaml .Values.llamaFactoryConfig | indent 4 }} +{{ if .Values.datasetInfo }} + remote_dataset_info.json: | +{{ toPrettyJson .Values.datasetInfo | indent 4 }} +{{- end }} entrypoint.sh: | #!/bin/bash + set -e # Print GPU Info: rocm-smi mkdir -p /workdir/checkpoints - {{- if .Values.checkpointsRemote }} + mkdir -p /workdir/datasets + cd /workspace/LLaMA-Factory + cp /configs/llama_factory_config.yaml llama_factory_config.yaml + {{- if .Values.datasetInfo }} + cp /configs/remote_dataset_info.json remote_dataset_info.json + {{- end }} # Setup MinIO mc alias set minio-host $BUCKET_STORAGE_HOST $BUCKET_STORAGE_ACCESS_KEY $BUCKET_STORAGE_SECRET_KEY + {{- if .Values.modelRemote }} + # copy model from remote to local + mc cp --recursive \ + minio-host/{{ .Values.modelRemote | trimSuffix "/" }}/ \ + /workdir/basemodel + {{- end }} + {{- range .Values.datasetInfo }} + {{- if .pathRemote }} + # copy dataset from remote to local + mc cp \ + minio-host/{{ .pathRemote }} \ + /workdir/datasets/{{ .pathRemote | replace "/" "_" }} + sed -i 's;"pathRemote": "{{ .pathRemote }}";"file_name": "/workdir/datasets/{{ .pathRemote | replace "/" "_" }}";g' remote_dataset_info.json + {{- end }} + {{- end }} + {{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} + {{- if .Values.checkpointsRemote }} + {{- if .Values.resumeFromCheckpoint }} # Sync checkpoints from remote to local - if mc mirror minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ /workdir/checkpoints 2>/dev/null; then - echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote}} to /workdir/checkpoints' + if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then + echo 'Downloaded checkpoints from' {{ $checkpointsRemotePath }} 'to /workdir/checkpoints' ls -lah /workdir/checkpoints + echo "resume_from_checkpoint: /workdir/checkpoints" >> llama_factory_config.yaml else echo 'No checkpoints found yet' fi + {{- end }} echo "Starting checkpoint sync process" mc mirror \ --watch \ /workdir/checkpoints \ - minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ & + {{ $checkpointsRemotePath }} & uploadPID=$! + # Check if the sync process started successfully + sleep 1 + if ! 
ps -p $uploadPID > /dev/null; then
+        echo "ERROR: Sync process failed to start"
+        exit 1
+      fi
      {{- end }}
      # Run training:
      echo "Starting training process"
-      cd LLaMA-Factory/
+      {{- if .Values.datasetInfo }}
+      jq -s add remote_dataset_info.json /workspace/LLaMA-Factory/data/dataset_info.json > dataset_info.json
+      cp dataset_info.json /workspace/LLaMA-Factory/data/dataset_info.json
+      {{- end }}
      {{- if ne (int $.Values.nodes) 1 }}
      export USE_RAY=1
      {{- end }}
-      llamafactory-cli train /configs/llama_factory_config.yaml
+      llamafactory-cli train llama_factory_config.yaml
      {{- if .Values.checkpointsRemote }}
      echo "Training done, stop the upload process"
      kill $uploadPID
@@ -50,6 +96,6 @@ data:
      echo 'Training done, syncing once more...'
      mc mirror --overwrite \
        /workdir/checkpoints \
-       minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/
+       {{ $checkpointsRemotePath }}
      {{- end }}
      echo 'All done, exiting'
diff --git a/workloads/llm-finetune-llama-factory/helm/values.schema.json b/workloads/llm-finetune-llama-factory/helm/values.schema.json
index 95d8a9d..5c62416 100644
--- a/workloads/llm-finetune-llama-factory/helm/values.schema.json
+++ b/workloads/llm-finetune-llama-factory/helm/values.schema.json
@@ -6,6 +6,22 @@
       "type": "string",
       "description": "Container image for finetuning"
     },
+    "modelName": {
+      "type": "string",
+      "description": "Model path in Hugging Face"
+    },
+    "modelRemote": {
+      "type": "string",
+      "description": "Model path in remote MinIO storage, format: bucketName/path/in/bucket"
+    },
+    "dataset": {
+      "type": "string",
+      "description": "Name of the dataset used for training. Use commas to separate multiple datasets."
+    },
+    "datasetInfo": {
+      "type": "object",
+      "description": "Additional datasets can be specified in datasetInfo, according to the LLaMA-Factory dataset format, see https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md"
+    },
     "kaiwo": {
       "type": "object",
       "properties": {
@@ -109,12 +125,14 @@
       "default": "16Gi"
     },
     "checkpointsRemote": {
-      "type": [
-        "string",
-        "null"
-      ],
+      "type": "string",
       "description": "Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket"
     },
+    "resumeFromCheckpoint": {
+      "type": "boolean",
+      "description": "If true, resume from the last checkpoint in checkpointsRemote (if available)",
+      "default": false
+    },
     "hfTokenSecret": {
       "type": "object",
       "properties": {
diff --git a/workloads/llm-finetune-llama-factory/helm/values.yaml b/workloads/llm-finetune-llama-factory/helm/values.yaml
index 06339b8..50e5117 100644
--- a/workloads/llm-finetune-llama-factory/helm/values.yaml
+++ b/workloads/llm-finetune-llama-factory/helm/values.yaml
@@ -1,6 +1,20 @@
 ### General chart values ###
 finetuningImage: ghcr.io/silogen/llama-factory-rocm-pytorch-training:v0.3
 
+### Model ###
+# either modelRemote OR modelName must be set
+# to use a base model directly from Hugging Face, set modelName to the model identifier (e.g., "meta-llama/Llama-3.1-8B-Instruct")
+modelName: ""
+# for remote models to be loaded from MinIO, specify the path to the model in the remote bucket as modelRemote
+modelRemote: ""
+
+### Data ###
+# list datasets to use, can include datasets predefined in LLaMA-Factory or those defined in datasetInfo
+dataset: ""
+# Additional datasets can be specified in datasetInfo, according to the LLaMA-Factory dataset format, see https://github.com/hiyouga/LLaMA-Factory/blob/main/data/README.md
+# For remote datasets to be loaded from MinIO, specify the path to the dataset in the remote bucket as pathRemote
+datasetInfo: {}
+
 # kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload)
 kaiwo:
   enabled: false
@@ -9,8 +23,10 @@ kaiwo:
 labels: {}
 
 # Extra annotations such as an imagePullSecrets
-imagePullSecrets:
-  - "regcred"
+imagePullSecrets: []
+  # Example:
+  # imagePullSecrets:
+  #   - "regcred"
 
 # Configure these to match the credentials in your cluster:
 bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80
@@ -37,7 +53,8 @@ llamaFactoryConfig:
   stage: sft
 
 ### Model output path ###
-checkpointsRemote: # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket)
+checkpointsRemote: "" # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket
+resumeFromCheckpoint: false # Set to true to resume from the last checkpoint in checkpointsRemote (if available)
 
 hfTokenSecret: {} # Optional secret reference that contains the Huggingface token
 # Example:
diff --git a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml
index 1a3ad6d..784283a 100644
--- a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-70b.yaml
@@ -19,8 +19,6 @@ finetuning_config:
   data_conf:
     training_data:
       type: CONCATENATION
-      datasets:
-        - path: "PLACEHOLDER"
     validation_data:
       type: AUTO_SPLIT
       ratio: 0.1
diff --git a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml
index 50ee0f4..b0d5dac 100644
--- a/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/overrides/models/meta-llama_llama-3.1-8b.yaml
@@ -13,8 +13,6 @@ finetuning_config:
   data_conf:
     training_data:
       type: CONCATENATION
-      datasets:
-        - path: "PLACEHOLDER"
     validation_data:
       type: AUTO_SPLIT
       ratio: 0.1
diff --git a/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml b/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml
index 8f73618..d8426b8 100644
--- a/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml
+++ b/workloads/llm-finetune-silogen-engine/helm/overrides/models/tiny-llama_tinyllama-1.1b-chat-v1.0.yaml
@@ -13,8 +13,6 @@ finetuning_config:
   data_conf:
     training_data:
      type: CONCATENATION
-      datasets:
-        - path: "PLACEHOLDER"
     validation_data:
       type: AUTO_SPLIT
       ratio: 0.1
diff --git a/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md b/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md
index d229420..3c980b1 100644
--- a/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md
+++ b/workloads/llm-finetune-silogen-engine/helm/silogen_finetuning_config_readme.md
@@ -9,16 +9,17 @@ See the various sub-configs for their options. Additional properties are not all
Additional properties are not all | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| method | `const` | | `sft` | `"sft"` | | | data_conf | `object` | ✅ | [ChatTrainValidConfig](#chattrainvalidconfig) | | The data input config | | training_args | `object` | ✅ | [SilogenTrainingArguments](#silogentrainingarguments) | | Transformer TrainingArguments with some restrictions | -| overrides | `object` | | [Overrides](#overrides) | `{"num_train_epochs": null, "lr_multiplier": 1.0, "lr_batch_size_scaling": "none"}` | Override options to simplify the config interface | | batchsize_conf | `object` | ✅ | [BatchsizeConfig](#batchsizeconfig) | | Batch size configuration | -| peft_conf | `object` | ✅ | [NoPeftConfig](#nopeftconfig) or [PretrainedPeftConfig](#pretrainedpeftconfig) or [GenericPeftConfig](#genericpeftconfig) | | Adapter configuration | +| peft_conf | `object` | ✅ | [GenericPeftConfig](#genericpeftconfig) and/or [NoPeftConfig](#nopeftconfig) and/or [PretrainedPeftConfig](#pretrainedpeftconfig) | | Adapter configuration | | run_conf | `object` | ✅ | [RunConfig](#runconfig) | | Model related configuration | -| tracking | `object` or `null` | | [FinetuningTrackingConfig](#finetuningtrackingconfig) | | MLFlow tracking configuration | -| quant_conf | `object` | | [BnBQuantizationConfig](#bnbquantizationconfig) or [NoQuantizationConfig](#noquantizationconfig) | `{"quantization_type": "no-quantization"}` | Quantization configuration | | sft_args | `object` | ✅ | [SFTArguments](#sftarguments) | | SFT specific arguments | +| method | `const` | | `sft` | `"sft"` | | +| overrides | `object` | | [Overrides](#overrides) | `{"lr_multiplier": 1.0, "lr_batch_size_scaling": "none"}` | Override options to simplify the config interface | +| tracking | `object` or `null` | | [FinetuningTrackingConfig](#finetuningtrackingconfig) | | MLFlow tracking configuration | +| quant_conf | `object` | | [BnBQuantizationConfig](#bnbquantizationconfig) and/or [NoQuantizationConfig](#noquantizationconfig) | `{"quantization_type": "no-quantization"}` | Quantization configuration | + --- @@ -33,7 +34,7 @@ Automatic validation split from the training data | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | | type | `const` | ✅ | `AUTO_SPLIT` | | | -| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method | +| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. | | ratio | `number` | | number | `0.2` | Ratio of the training data to use for validation | | seed | `integer` | | integer | `1289525893` | Seed for the random number generator for splitting | @@ -78,11 +79,20 @@ see: https://huggingface.co/docs/transformers/en/main_classes/quantization#trans | bnb_4bit_use_double_quant | `boolean` | | boolean | `False` | | | bnb_4bit_quant_storage | `string` or `null` | | string | | | +## ChatTemplateName + +Chat template to use. + +#### Type: `string` + +**Possible Values:** `mistral-with-system` or `chat-ml` or `poro` or `keep-original` or `simplified-llama31` + ## ChatTrainValidConfig -Training time data configuration. 
+Training time data configuration -Always defines some DataInput for training data and can include validation DataInput, though a trivial NoneDataInput is also allowed for the validation side. +Always defines some DataInput for training data and can include validation DataInput, though a trivial NoneDataInput +is also allowed for the validation side. Additionally includes chat template and padding configurations, as those are part of the data input pipeline. @@ -90,9 +100,9 @@ Additionally includes chat template and padding configurations, as those are par | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| training_data | `object` | ✅ | [ConcatenationDataInput](#concatenationdatainput) or [WeightedMixDataInput](#weightedmixdatainput) | | | -| validation_data | `object` | ✅ | [AutoSplitDataInput](#autosplitdatainput) or [ConcatenationDataInput](#concatenationdatainput) or [NoneDataInput](#nonedatainput) | | | -| chat_template_name | `string` | | `mistral-with-system` or `chat-ml` or `poro` or `keep-original` or `simplified-llama31` | `"mistral-with-system"` | | +| training_data | `object` | ✅ | [ConcatenationDataInput](#concatenationdatainput) and/or [WeightedMixDataInput](#weightedmixdatainput) | | | +| validation_data | `object` | ✅ | [AutoSplitDataInput](#autosplitdatainput) and/or [ConcatenationDataInput](#concatenationdatainput) and/or [NoneDataInput](#nonedatainput) | | | +| chat_template_name | `string` | | [ChatTemplateName](#chattemplatename) | `"mistral-with-system"` | | | padding_side | `string` | | string | `"right"` | Padding side, right is usually right. | | missing_pad_token_strategy | `string` | | [MissingPadTokenStrategy](#missingpadtokenstrategy) | `"bos-repurpose"` | See the MissingPadTokenStrategys for descriptions of the options | @@ -117,7 +127,7 @@ For DPO this means lines of: | -------- | ---- | -------- | --------------- | ------- | ----------- | | type | `const` | ✅ | `CONCATENATION` | | | | datasets | `array` | ✅ | [DatasetDefinition](#datasetdefinition) | | | -| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method | +| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. | ## DatasetDefinition @@ -137,11 +147,11 @@ Settings that define how run details are logged | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| mlflow_server_uri | `string` | ✅ | string | | MLflow server URI. Can be local path | -| experiment_name | `string` | ✅ | string | | Experiment name that is used for MLFlow tracking | -| run_id | `string` or `null` | | string | | Run id, to resume logging to previousely started run | -| run_name | `string` or `null` | | string | | Run name, to give meaningful name to the run to be displayed in MLFlow UI. Used only when run_id is unspecified | -| hf_mlflow_log_artifacts | `string` | | string | `"False"` | Whether to store model artifacts in MLFlow | +| mlflow_server_uri | `string` | ✅ | string | | MLflow server URI. Can be local path. | +| experiment_name | `string` | ✅ | string | | Experiment name that is used for MLFlow tracking. | +| run_id | `string` or `null` | | string | | Run id, to resume logging to previously started run. 
| +| run_name | `string` or `null` | | string | | Run name, to give meaningful name to the run to be displayed in MLFlow UI. Used only when run_id is unspecified. | +| hf_mlflow_log_artifacts | `string` | | string | `"False"` | Whether to store model artifacts in MLFlow. | ## GenericPeftConfig @@ -150,7 +160,8 @@ Config for any new initialized PEFT Adapter See https://huggingface.co/docs/peft/tutorial/peft_model_config for the possible kwargs and https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/peft_types.py for the types. -### Example +Example: + >>> loaded_data = {'peft_type':'LORA', 'task_type': 'CAUSAL_LM', ... 'peft_kwargs': {'r': 32, 'target_modules': ['v_proj']}} >>> generic_conf = GenericPeftConfig(**loaded_data) @@ -171,8 +182,6 @@ and https://github.com/huggingface/peft/blob/v0.7.1/src/peft/utils/peft_types.py | task_type | `string` | | [TaskType](#tasktype) | `"CAUSAL_LM"` | | | peft_kwargs | `object` | | object | | | - - ## MissingPadTokenStrategy Specifies the available missing pad token strategies. @@ -207,9 +216,10 @@ See parameter docstrings and help at: https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained See below in "Parameters for big model inference" too, it affects training too. Also note that this link takes you to the transformers main branch version - be sure to compare with the installed version of transformers (that keeps -changing over time, and it is difficult to keep this doctstring up to date, so we wanted to link to the latest here). +changing over time, and it is difficult to keep this docstring up to date, so we wanted to link to the latest here). Some important parameters to consider are: + - device_map : A map that specifies where each submodule should go. It doesn’t need to be refined to each parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the same device. If we only pass @@ -230,26 +240,26 @@ NOTE: | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | | torch_dtype | `const` | | `auto` | `"auto"` | | -| device_map | `object` or `string` or `null` | | object and/or string | | Custom device map so that you can manually override the choices that HuggingFace would make. This can also be a string to specify "auto", "balanced_low_0", or "sequential" | +| device_map | `object` or `string` or `null` | | object and/or string | | Custom device map so that you can manually override the choices that HuggingFace would make. This can also be a string to specify "auto", "balanced_low_0", or "sequential". | | max_memory | `object` or `null` | | object | | | | low_cpu_mem_usage | `boolean` | | boolean | `False` | | -| attn_implementation | `string` or `null` | | string | | Note: this can be set to "sdpa", "flash_attention_2", "eager" | +| attn_implementation | `string` or `null` | | string | | Note: this can be set to "sdpa", "flash_attention_2", "eager". | | offload_folder | `string` or `null` | | string | | | | offload_state_dict | `boolean` or `null` | | boolean | | Default is True if offloading (otherwise no effect) | | offload_buffers | `boolean` or `null` | | boolean | | | -| use_cache | `boolean` | | boolean | `True` | Saves generated hidden states to speed up generation. 
See: https://discuss.huggingface.co/t/what-is-the-purpose-of-use-cache-in-decoder/958 use_cache is mutually exclusive with gradient_checkpointing | +| use_cache | `boolean` | | boolean | `true` | Saves generated hidden states to speed up generation, see: https://discuss.huggingface.co/t/what-is-the-purpose-of-use-cache-in-decoder/958 This is mutually exclusive with gradient_checkpointing. | | cache_dir | `string` or `null` | | string | | | -| force_download | `boolean` | | boolean | `False` | | -| local_files_only | `boolean` | | boolean | `False` | | +| force_download | `boolean` | | boolean | `False` | | +| local_files_only | `boolean` | | boolean | `False` | | | proxies | `object` or `null` | | object | | | -| resume_download | `boolean` | | boolean | `False` | | +| resume_download | `boolean` | | boolean | `False` | | | revision | `string` | | string | `"main"` | | | code_revision | `string` | | string | `"main"` | | | subfolder | `string` or `null` | | string | | | | token | `string` or `null` | | string | | | | use_safetensors | `boolean` or `null` | | boolean | | | | variant | `string` or `null` | | string | | | -| trust_remote_code | `boolean` | | boolean | `False` | Warning: if set to `True`, allows execution of downloaded remote code | +| trust_remote_code | `boolean` | | boolean | `False` | Warning: if set to True, allows execution of downloaded remote code. | ## NoPeftConfig @@ -280,23 +290,20 @@ A special type for not using data e.g. in validation | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | | type | `const` | ✅ | `NONE` | | | -| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method | +| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. | ## Overrides -Override options that allow simple interfaces for charts using these configs +Override options -This is particularly useful for a helm chart interface where we include the finetuning package config -as a part of the values.yaml file. These a more flexible helm interface with certain keys brought to the -top level. +These implement dynamic scaling for the learning rate. #### Type: `object` | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| num_train_epochs | `integer` or `number` or `null` | | number | | Overrides the number of epochs in the training_args | | lr_multiplier | `number` | | number | `1.0` | Multiplier applied to the learning rate in the training_args | -| lr_batch_size_scaling | `string` | | `none` `sqrt` `linear` | `"none"` | Scales the learning rate in the training_args by a factor derived from the total training batch size. `none`: No scaling. `sqrt`: Multiplies learning rate by square root of batch size (a classic scaling rule). `linear`: Multiplies learning rate by the batch size (a more modern scaling rule). | +| lr_batch_size_scaling | `string` | | `none` `sqrt` `linear` | `"none"` | Scales the learning rate in the training_args by a factor derived from the total training batch size. 'none': No scaling. 'sqrt': Multiplies learning rate by square root of batch size (a classic scaling rule). 'linear': Multiplies learning rate by the batch size (a more modern scaling rule). 
| ## PeftType @@ -335,7 +342,7 @@ PEFT adapter uses the config and initialisation from a pretrained adapter | Property | Type | Required | Possible values | Description | | -------- | ---- | -------- | --------------- | ----------- | | peft_type | `const` | ✅ | `PRETRAINED_PEFT` | | -| name_or_path | `string` | ✅ | string | HF ID or path to the pretrained peft | +| name_or_path | `string` | ✅ | string | HF ID or path to the pretrained peft. | ## RunConfig @@ -345,12 +352,13 @@ Experiment running configuration | Property | Type | Required | Possible values | Default | Description | | -------- | ---- | -------- | --------------- | ------- | ----------- | -| model | `string` | | string | `"/local_resources/basemodel"` | Local path to model to be fine-tuned. Normally this should be `/local_resources/basemodel` | +| model | `string` | | string | `"/local_resources/basemodel"` | Local path to model to be fine-tuned. Normally this should be /local_resources/basemodel | | model_args | `object` | | [ModelArguments](#modelarguments) | `{"torch_dtype": "auto", "device_map": "auto", "max_memory": null, "low_cpu_mem_usage": false, "attn_implementation": null, "offload_folder": null, "offload_state_dict": null, "offload_buffers": null, "use_cache": true, "cache_dir": null, "force_download": false, "local_files_only": false, "proxies": null, "resume_download": false, "revision": "main", "code_revision": "main", "subfolder": null, "token": null, "use_safetensors": null, "variant": null, "trust_remote_code": false}` | | | tokenizer | `string` or `null` | | string | | Model HuggingFace ID, or path, or None to use the one associated with the model | -| use_fast_tokenizer | `boolean` | | boolean | `True` | Use the Fast version of the tokenizer. The 'slow' version may be compatible with more features. | -| resume_from_checkpoint | `boolean` or `string` | | boolean and/or string | `False` | Normally should be set to 'auto' to continue if a checkpoint exists. Can set to `True` to always try to continue, `False` to never try, or a path to load from a specific path. | +| use_fast_tokenizer | `boolean` | | boolean | `true` | Use the Fast version of the tokenizer. The 'slow' version may be compatible with more features. | +| resume_from_checkpoint | `boolean` or `string` | | boolean and/or string | | Normally should be set to 'auto' to continue if a checkpoint exists. Can set to True to always try to continue, False to never try, or a path to load from a specific path. | | final_checkpoint_name | `string` | | string | `"checkpoint-final"` | Name of final checkpoint. Should be left as default | +| determinism | `string` | | `no` `half` `full` | `"no"` | Set the level of determinism in implementations. Deterministic implementations are not always available, and when they are, they are usually slower than their non-deterministic counterparts. Recommended for debugging only. 'no': No determinism. 'half': Prefer deterministic implementations. 'full': Only fully deterministic implementations, error out on operations that only have non-deterministic implementations. 
| ## SFTArguments @@ -425,5 +433,5 @@ For DPO this means lines of: | -------- | ---- | -------- | --------------- | ------- | ----------- | | type | `const` | ✅ | `PRECOMPUTE_WEIGHTED_MIX` | | | | datasets | `array` | ✅ | [WeightedDatasetDefinition](#weighteddatasetdefinition) | | | -| data_type | `string` | | string | `"ChatConversation"` | generally, the data_type is automatically set based on the experiment config method | +| data_type | `string` | | string | `"ChatConversation"` | Generally, the data_type is automatically set based on the experiment config method. | | seed | `integer` | | integer | `19851243` | Seed for the random number generator for interleaving draws | diff --git a/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl b/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl index 75c4189..c1be6ca 100644 --- a/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl +++ b/workloads/llm-finetune-silogen-engine/helm/templates/_helpers.tpl @@ -4,29 +4,30 @@ echo 'Copying resources to container...'; mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY} mc cp --recursive \ - minio-host/{{ .Values.basemodel | trimSuffix "/" }}/ \ + minio-host/'{{ .Values.basemodel | trimSuffix "/" }}'/ \ /local_resources/basemodel {{- if $.Values.trainingData }} mc cp \ - minio-host/{{ $.Values.trainingData }} \ - /local_resources/{{ $.Values.trainingData | replace "/" "_" }} + minio-host/'{{ $.Values.trainingData | replace "'" "'\\''" }}' \ + /local_resources/'{{ $.Values.trainingData | replace "'" "'\\''" | replace "/" "_" }}' {{- else }} {{- range .Values.finetuning_config.data_conf.training_data.datasets }} mc cp \ - minio-host/{{ .path }} \ - /local_resources/{{ .path | replace "/" "_" }} + minio-host/'{{ .path | replace "'" "'\\''" }}' \ + /local_resources/'{{ .path | replace "'" "'\\''" | replace "/" "_" }}' {{- end }} {{- if (or (eq .Values.finetuning_config.data_conf.validation_data.type "AUTO_SPLIT" ) (eq .Values.finetuning_config.data_conf.validation_data.type "NONE")) }} {{- range .Values.finetuning_config.data_conf.validation_data.datasets }} mc cp \ - minio-host/{{ .path }} \ - /local_resources/{{ .path | replace "/" "_" }} + minio-host/'{{ .path | replace "'" "'\\''" }}' \ + /local_resources/'{{ .path | replace "'" "'\\''" | replace "/" "_" }}' {{- end }} {{- end }} {{- end }} # Sync checkpoints from remote to local -if mc mirror minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ /workdir/checkpoints 2>/dev/null; then - echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote}} to /workdir/checkpoints' +{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} +if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then + echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''" }} to /workdir/checkpoints' ls -lah /workdir/checkpoints else echo 'No checkpoints found yet' @@ -35,25 +36,38 @@ fi {{/* ####################################################################################################################################################### */}} {{- define "finetuningAndUploadEntrypoint" -}} -{{- $logs_path := (default ( .Values.checkpointsRemote | trimSuffix "/" | printf "%s/logs/" ) .Values.logsRemote ) -}} +# quote paths with single quotes to avoid issues with special characters in paths, and replace any existing single quote with an
escaped single quote +{{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} +{{- $logsRemotePath := printf "minio-host/'%s'/" ( (default ( .Values.checkpointsRemote | trimSuffix "/" | printf "%s/logs" ) .Values.logsRemote ) | trimSuffix "/" | replace "'" "'\\''") -}} # Print GPU Info: rocm-smi echo "Starting checkpoint sync process" mc mirror \ --watch \ /workdir/checkpoints \ - minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ & + {{ $checkpointsRemotePath }} & uploadPID=$! +sleep 1 # Give some time for the process to start +# Check if the sync process started successfully +if ! ps -p $uploadPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 +fi # Run training: {{- if .Values.runTensorboard }} tensorboard --logdir /workdir/logs --port 6006 & echo "Serving tensorboard on port 6006. Port-forward to access training logs during the training process lifetime." -echo "Also starting logs upload process, uploading to {{ $logs_path }}" +echo "Also starting logs upload process, uploading to {{ $logsRemotePath }}" mc mirror \ --watch \ /workdir/logs \ - minio-host/{{ $logs_path }} & + {{ $logsRemotePath }} & logsPID=$! +sleep 1 +if ! ps -p $logsPID > /dev/null; then + echo "ERROR: Logs sync process failed to start" + exit 1 +fi {{- end }} echo "Starting training process" accelerate launch \ @@ -81,16 +95,16 @@ merge_adapter $merge_base ./checkpoints/checkpoint-final ./checkpoints/checkpoin echo 'Training done, syncing once more...' mc mirror \ /workdir/checkpoints \ - minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/ + {{ $checkpointsRemotePath }} {{- if .Values.runTensorboard }} mc mirror \ /workdir/logs \ - minio-host/{{ $logs_path }} + {{ $logsRemotePath }} {{- end }} # Sync the final checkpoint with overwrite to carry over vLLM-compatibility changes mc mirror \ --overwrite \ /workdir/checkpoints/checkpoint-final \ - minio-host/{{ .Values.checkpointsRemote | trimSuffix "/" }}/checkpoint-final/ + {{ $checkpointsRemotePath }}checkpoint-final/ echo 'All done, exiting' {{- end }} diff --git a/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml b/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml index 739122d..ec33a0b 100644 --- a/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml +++ b/workloads/llm-finetune-silogen-engine/helm/templates/configmap.yaml @@ -47,7 +47,7 @@ data: main_process_port: null mixed_precision: bf16 num_machines: 1 - num_processes: {{ .Values.finetuningGpus }} + num_processes: 1 use_cpu: false {{- else if (eq .Values.distributedType "auto-ddp") }} compute_environment: LOCAL_MACHINE diff --git a/workloads/llm-finetune-verl/helm/Chart.yaml b/workloads/llm-finetune-verl/helm/Chart.yaml new file mode 100644 index 0000000..ca3ef76 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/Chart.yaml @@ -0,0 +1,4 @@ +apiVersion: v2 +name: llm-finetune-verl-example +description: VeRL finetuning on SiloGen stack +version: 0.0.1 diff --git a/workloads/llm-finetune-verl/helm/README.md b/workloads/llm-finetune-verl/helm/README.md new file mode 100644 index 0000000..6408efa --- /dev/null +++ b/workloads/llm-finetune-verl/helm/README.md @@ -0,0 +1,49 @@ +# Finetuning with VeRL + +This is a Helm Chart for running a finetuning job using [VeRL](https://github.com/volcengine/verl). + +The output is saved to MinIO, in the location specified by `checkpointsRemote`.
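+
+For example, once the job is running you can follow the synced output in bucket storage with the MinIO client. This is a minimal sketch, assuming an `mc` alias named `minio-host` pointing at your cluster's MinIO (the same alias name the job itself uses) and the `checkpointsRemote` value from the PPO override file; adjust both to your setup:
+
+```bash
+# List everything the job has mirrored to the checkpoint path so far
+mc ls --recursive minio-host/default-bucket/experiments/Qwen2_7B_Instruct_PPO_gsm8k_verl/
+```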
+ +## Configuration + +Include any parameters for VeRL in the `verlConfig` parameter. See the override file [`overrides/ppo_qwen_gsm8k.yaml`](overrides/ppo_qwen_gsm8k.yaml) for an example and the [VeRL documentation](https://verl.readthedocs.io/en/latest/examples/config.html) for more details. + +## Running the workload + +The simplest way is to run `helm template` and pipe the result to `kubectl create`. + +Example command using the example override file `overrides/ppo_qwen_gsm8k.yaml`: + +```bash +helm template workloads/llm-finetune-verl/helm \ + --values workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml \ + --name-template ppo-qwen-gsm8k-verl \ + | kubectl create -f - +``` + +## Data specification + +VeRL requires the data to be prepared for policy training in a [particular way](https://verl.readthedocs.io/en/latest/preparation/prepare_data.html). + +Some example data preprocessing scripts are provided; to use one of these, specify the name of the dataset used for training as `dataset`. Available datasets are "full_hh_rlhf", "geo3k", "gsm8k", "hellaswag", "math_dataset". + +To use your own datasets from MinIO, specify the path as `datasetRemote`. It should point to a directory with files that have already been appropriately processed (`train.parquet` and `test.parquet`). + +## Model specification + +To use a base model from HuggingFace or another source directly supported by VeRL, specify the model name in `modelName`. + +Alternatively, to use a model from MinIO, specify the path to the model in `modelRemote`. + +Either `modelName` or `modelRemote` must be specified. If both are included, the model from `modelRemote` is used. + +## Cleanup + +After the jobs are completed, please delete the resources created. To do so, you can run the same `helm template` command, replacing `kubectl create` with `kubectl delete`, e.g.: + +```bash +helm template workloads/llm-finetune-verl/helm \ + --values workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml \ + --name-template ppo-qwen-gsm8k-verl \ + | kubectl delete -f - +``` diff --git a/workloads/llm-finetune-verl/helm/overrides/grpo_qwen_gsm8k.yaml b/workloads/llm-finetune-verl/helm/overrides/grpo_qwen_gsm8k.yaml new file mode 100644 index 0000000..99f6801 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/overrides/grpo_qwen_gsm8k.yaml @@ -0,0 +1,45 @@ +### Model ### +modelName: "Qwen/Qwen2-7B-Instruct" + +### Data ### +dataset: "gsm8k" + +# Resources: +checkpointsReservedSize: 512Gi +storageClass: mlstorage +finetuningGpus: 2 +memoryPerGpu: 64 +cpusPerGpu: 8 + +### Model output path ### +checkpointsRemote: "default-bucket/experiments/Qwen2_7B_Instruct_GRPO_gsm8k_verl" + +verlConfig: + algorithm: + adv_estimator: grpo + kl_ctrl: + kl_coef: 0.001 + data: + train_batch_size: 1024 + max_prompt_length: 512 + max_response_length: 1024 + actor_rollout_ref: + model: + use_remove_padding: True + enable_gradient_checkpointing: True + actor: + ppo_micro_batch_size_per_gpu: 80 + use_kl_loss: True + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + rollout: + n: 5 + log_prob_micro_batch_size_per_gpu: 40 + tensor_model_parallel_size: 2 + gpu_memory_utilization: 0.6 + ref: + log_prob_micro_batch_size_per_gpu: 40 + fsdp_config: + param_offload: True + trainer: + total_epochs: 10 diff --git a/workloads/llm-finetune-verl/helm/overrides/kaiwo/kaiwo-enable.yaml b/workloads/llm-finetune-verl/helm/overrides/kaiwo/kaiwo-enable.yaml new file mode 100644 index 0000000..e6d278a --- /dev/null +++ 
b/workloads/llm-finetune-verl/helm/overrides/kaiwo/kaiwo-enable.yaml @@ -0,0 +1,3 @@ +# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload) +kaiwo: + enabled: true diff --git a/workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml b/workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml new file mode 100644 index 0000000..fac53d8 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/overrides/ppo_qwen_gsm8k.yaml @@ -0,0 +1,50 @@ +### Model ### +modelName: "Qwen/Qwen2-7B-Instruct" + +### Data ### +dataset: "gsm8k" + +# Resources: +checkpointsReservedSize: 512Gi +storageClass: mlstorage +finetuningGpus: 2 +memoryPerGpu: 64 +cpusPerGpu: 8 + +### Model output path ### +checkpointsRemote: "default-bucket/experiments/Qwen2_7B_Instruct_PPO_gsm8k_verl" + +verlConfig: + data: + train_batch_size: 1024 + max_prompt_length: 1024 + max_response_length: 512 + actor_rollout_ref: + model: + use_remove_padding: True + enable_gradient_checkpointing: True + actor: + ppo_micro_batch_size_per_gpu: 16 + rollout: + log_prob_micro_batch_size_per_gpu: 40 + tensor_model_parallel_size: 2 + gpu_memory_utilization: 0.6 + ref: + log_prob_micro_batch_size_per_gpu: 40 + fsdp_config: + param_offload: True + critic: + optim: + lr: 1e-5 + model: + use_remove_padding: True + enable_gradient_checkpointing: True + ppo_micro_batch_size_per_gpu: 32 + fsdp_config: + param_offload: False + optimizer_offload: False + algorithm: + kl_ctrl: + kl_coef: 0.001 + trainer: + total_epochs: 10 diff --git a/workloads/llm-finetune-verl/helm/templates/configmap.yaml b/workloads/llm-finetune-verl/helm/templates/configmap.yaml new file mode 100644 index 0000000..89e0424 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/templates/configmap.yaml @@ -0,0 +1,119 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: "{{ .Release.Name }}-configs" +data: + verl_config.yaml: | + # @package _global_ +{{ toYaml .Values.verlConfig | indent 4 }} + entrypoint.sh: | + #!/bin/bash + set -eu + # Print GPU Info: + rocm-smi + mkdir -p /workdir/checkpoints + mkdir -p /workdir/datasets + + echo "Installing MinIO:" + curl https://dl.min.io/client/mc/release/linux-amd64/mc \ + --create-dirs \ + -o /minio-binaries/mc + chmod +x /minio-binaries/mc + export PATH="${PATH}:/minio-binaries/" + # Setup MinIO + mc alias set minio-host $BUCKET_STORAGE_HOST $BUCKET_STORAGE_ACCESS_KEY $BUCKET_STORAGE_SECRET_KEY + {{- if .Values.modelRemote }} + # copy model from remote to local + echo "Downloading model from remote: {{ .Values.modelRemote }}" + mc cp --recursive \ + minio-host/{{ .Values.modelRemote | trimSuffix "/" }}/ \ + /workdir/basemodel + MODEL_PATH=/workdir/basemodel + {{- else if .Values.modelName }} + MODEL_PATH={{ .Values.modelName }} + {{- else }} + {{- fail "either modelName or modelRemote must be set" }} + {{- end }} + python3 -c "import transformers;transformers.pipeline('text-generation', model='$MODEL_PATH')" + + {{- if .Values.datasetRemote }} + echo "Downloading dataset from remote: {{ .Values.datasetRemote }}" + mc cp --recursive \ + minio-host/{{ .Values.datasetRemote | trimSuffix "/" }}/ \ + /workdir/datasets/{{ .Values.datasetRemote | trimSuffix "/" }} + DATASET_PATH=/workdir/datasets/{{ .Values.datasetRemote | trimSuffix "/" }} + {{- else if .Values.dataset }} + {{- if eq .Values.dataset "full_hh_rlhf" }} + python3 /app/examples/data_preprocess/{{ .Values.dataset }}.py --split rm --local_dir /workdir/datasets/{{ .Values.dataset }} + DATASET_PATH=/workdir/datasets/{{ .Values.dataset }}/rm + {{- else }} 
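+ # The other example datasets ("geo3k", "gsm8k", "hellaswag", "math_dataset") are preprocessed with their default split; only --local_dir is passed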
+ python3 /app/examples/data_preprocess/{{ .Values.dataset }}.py --local_dir /workdir/datasets/{{ .Values.dataset }} + DATASET_PATH=/workdir/datasets/{{ .Values.dataset }} + {{- end }} + {{- else }} + {{- fail "either dataset or datasetRemote must be set" }} + {{- end }} + + {{- $checkpointsRemotePath := printf "minio-host/'%s'/" (.Values.checkpointsRemote | trimSuffix "/" | replace "'" "'\\''") }} + {{- if .Values.checkpointsRemote }} + {{- if .Values.resumeFromCheckpoint }} + # Sync checkpoints from remote to local + if mc mirror {{ $checkpointsRemotePath }} /workdir/checkpoints 2>/dev/null; then + echo 'Downloaded checkpoints from {{ .Values.checkpointsRemote }} to /workdir/checkpoints' + ls -lah /workdir/checkpoints + RESUME_MODE='resume_path' + else + echo 'No checkpoints found yet' + RESUME_MODE='disable' + fi + {{- else }} + RESUME_MODE='disable' + {{- end }} + echo "Starting checkpoint sync process" + mc mirror \ + --watch \ + --overwrite \ + /workdir/checkpoints \ + {{ $checkpointsRemotePath }} & + uploadPID=$! + # Check if the sync process started successfully + sleep 1 + if ! ps -p $uploadPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 + fi + {{- end }} + + export HIP_VISIBLE_DEVICES=$(rocm-smi --showall --csv | grep -P '^card\d+,' | cut -d',' -f1 | sed 's/card//g' | paste -sd ',' -) + export NUM_GPUS=$(echo $HIP_VISIBLE_DEVICES | tr ',' '\n' | wc -l) + export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + + # copy config file into the verl directory, this is necessary to apply it as an override with hydra + mkdir -p /app/verl/trainer/config/override + cp /configs/verl_config.yaml /app/verl/trainer/config/override/helm.yaml + + echo "Starting training process" + python3 -m verl.trainer.main_ppo +override=helm \ + data.train_files=$DATASET_PATH/train.parquet \ + data.val_files=$DATASET_PATH/test.parquet \ + actor_rollout_ref.model.path=$MODEL_PATH \ + critic.model.path=$MODEL_PATH \ + trainer.n_gpus_per_node=$NUM_GPUS \ + trainer.project_name='{{ .Release.Name }}' \ + trainer.experiment_name='{{ .Release.Name }}' \ + trainer.default_local_dir=/workdir/checkpoints \ + trainer.resume_mode=${RESUME_MODE:-disable} \ + trainer.resume_from_path=/workdir/checkpoints + + {{- if .Values.checkpointsRemote }} + echo "Training done, stop the upload process" + kill $uploadPID + wait $uploadPID || true + # Once more to ensure everything gets uploaded + echo 'Training done, syncing once more...' + mc mirror --overwrite \ + /workdir/checkpoints \ + {{ $checkpointsRemotePath }} + {{- end }} + echo 'All done, exiting' diff --git a/workloads/llm-finetune-verl/helm/templates/job.yaml b/workloads/llm-finetune-verl/helm/templates/job.yaml new file mode 100644 index 0000000..3d3db12 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/templates/job.yaml @@ -0,0 +1,113 @@ +{{- define "job" -}} +apiVersion: batch/v1 +kind: Job +metadata: + name: "{{ .Release.Name }}-job" + {{- if .Values.labels }} + labels: + {{- range $label, $value := .Values.labels }} + {{ $label }}: {{ $value | quote }} + {{- end }} + {{- end }} +spec: + ttlSecondsAfterFinished: 3600 + backoffLimit: 0 + template: + spec: + restartPolicy: Never + {{- if .Values.imagePullSecrets }} + imagePullSecrets: + {{- range .Values.imagePullSecrets }} + - name: {{ .
}} + {{- end }} + {{- end }} + containers: + - name: finetuning + image: "{{ .Values.finetuningImage }}" + imagePullPolicy: Always + env: + {{- if .Values.hfTokenSecret }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Values.hfTokenSecret.name }} + key: {{ .Values.hfTokenSecret.key }} + {{- end }} + # storage + - name: BUCKET_STORAGE_HOST + value: {{ .Values.bucketStorageHost }} + - name: BUCKET_STORAGE_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{ .Values.bucketCredentialsSecret.name }} + key: {{ .Values.bucketCredentialsSecret.accessKeyKey }} + - name: BUCKET_STORAGE_SECRET_KEY + valueFrom: + secretKeyRef: + name: {{ .Values.bucketCredentialsSecret.name }} + key: {{ .Values.bucketCredentialsSecret.secretKeyKey }} + command: + - /configs/entrypoint.sh + resources: + limits: + memory: "{{ mul .Values.finetuningGpus .Values.memoryPerGpu }}Gi" + cpu: "{{ mul .Values.finetuningGpus .Values.cpusPerGpu }}" + amd.com/gpu: "{{ .Values.finetuningGpus }}" + requests: + memory: "{{ mul .Values.finetuningGpus .Values.memoryPerGpu }}Gi" + cpu: "{{ mul .Values.finetuningGpus .Values.cpusPerGpu }}" + amd.com/gpu: "{{ .Values.finetuningGpus }}" + volumeMounts: + - name: dshm # Increase SHM size for the container by mounting /dev/shm, for Pytorch parallel processing + mountPath: /dev/shm + - name: checkpoints + mountPath: /workdir/checkpoints + readOnly: false + - name: configs + mountPath: /configs + readOnly: true + volumes: + - name: dshm + emptyDir: + medium: Memory # equivalent to `docker run --shm-size=(total_memory/2)` + {{- if .Values.storageClass }} + - name: checkpoints + ephemeral: + volumeClaimTemplate: + spec: + accessModes: [ "ReadWriteOnce" ] + storageClassName: {{ .Values.storageClass }} + resources: + requests: + storage: "{{ .Values.checkpointsReservedSize }}" + {{- else }} + - name: checkpoints + emptyDir: + sizeLimit: "{{ .Values.checkpointsReservedSize }}" + {{- end }} + - name: configs + configMap: + name: "{{ .Release.Name }}-configs" + items: + - key: entrypoint.sh + path: entrypoint.sh + mode: 0777 + - key: verl_config.yaml + path: verl_config.yaml +{{- end -}} + +{{- define "job_wrapped_with_kaiwojob" -}} +apiVersion: kaiwo.silogen.ai/v1alpha1 +kind: KaiwoJob +metadata: + name: "{{ .Release.Name }}-job" +spec: + job: + {{- include "job" . | nindent 4 }} +{{- end -}} + +{{- if .Values.kaiwo.enabled -}} +{{- include "job_wrapped_with_kaiwojob" . }} +{{- else -}} +{{- include "job" . 
}} {{- end -}} diff --git a/workloads/llm-finetune-verl/helm/values.schema.json b/workloads/llm-finetune-verl/helm/values.schema.json new file mode 100644 index 0000000..348874c --- /dev/null +++ b/workloads/llm-finetune-verl/helm/values.schema.json @@ -0,0 +1,138 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "finetuningImage": { + "type": "string", + "description": "Container image for finetuning" + }, + "modelName": { + "type": "string", + "description": "Model path in HuggingFace" + }, + "modelRemote": { + "type": "string", + "description": "Model path in remote MinIO storage, format: bucketName/path/in/bucket" + }, + "dataset": { + "type": "string", + "description": "Name of data set to use for training" + }, + "datasetRemote": { + "type": "string", + "description": "Dataset path in remote MinIO storage, format: bucketName/path/in/bucket" + }, + "kaiwo": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean", + "description": "If true, use Kaiwo CRDs to have Kaiwo operator manage the workload", + "default": false + } + }, + "default": {} + }, + "labels": { + "type": "object", + "description": "Any labels to add for the manifest, recommended: kueue", + "additionalProperties": { + "type": "string" + }, + "default": {} + }, + "imagePullSecrets": { + "type": "array", + "description": "Any imagePullSecrets to use", + "items": { + "type": "string" + }, + "default": [] + }, + "bucketStorageHost": { + "type": "string", + "description": "The cloud storage host URL" + }, + "bucketCredentialsSecret": { + "type": "object", + "description": "Bucket storage credential secret values, required to have the secret already setup in the cluster (e.g. via external secrets)", + "properties": { + "name": { + "type": "string", + "description": "The name of the secret in the cluster that contains the bucket storage credentials", + "default": "minio-credentials" + }, + "accessKeyKey": { + "type": "string", + "description": "The key in the secret that contains the access key", + "default": "minio-access-key" + }, + "secretKeyKey": { + "type": "string", + "description": "The key in the secret that contains the secret key", + "default": "minio-secret-key" + } + } + }, + "checkpointsReservedSize": { + "type": "string", + "description": "How much space to reserve for model and data downloads" + }, + "storageClass": { + "type": [ + "string", + "null" + ], + "description": "Optionally set this to use a specific storageClass for the storage" + }, + "cpusPerGpu": { + "type": "integer", + "description": "How many CPUs to use, per GPU", + "default": 8, + "minimum": 1 + }, + "finetuningGpus": { + "type": "integer", + "description": "How many GPUs to use for finetuning", + "default": 1, + "minimum": 0 + }, + "memoryPerGpu": { + "type": "integer", + "description": "How much memory to use in GB, per GPU", + "default": 64 + }, + "checkpointsRemote": { + "type": "string", + "description": "Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket" + }, + "resumeFromCheckpoint": { + "type": "boolean", + "description": "If true, resume from the last checkpoint in checkpointsRemote (if available)", + "default": false + }, + "hfTokenSecret": { + "type": "object", + "description": "Optional secret reference that contains a HuggingFace token", + "properties": { + "name": { + "type": "string", + "description": "The name of the secret in the cluster that contains the HuggingFace token" + }, + "key": { + "type": "string",
"description": "The key in the secret that contains the HuggingFace token" + } + }, + "default": {} + }, + "verlConfig": { + "type": "object", + "description": "VeRL configurations to use" + } + }, + "required": [ + "finetuningImage", + "verlConfig" + ] +} diff --git a/workloads/llm-finetune-verl/helm/values.yaml b/workloads/llm-finetune-verl/helm/values.yaml new file mode 100644 index 0000000..60e0837 --- /dev/null +++ b/workloads/llm-finetune-verl/helm/values.yaml @@ -0,0 +1,62 @@ +### General chart values ### +finetuningImage: rocm/verl:verl-0.3.0.post0_rocm6.2_vllm0.6.3 + +### Model ### +# either modelRemote OR modelName must be set +# to use a base model directly from Hugging Face, set modelName to the model identifier (e.g., "meta-llama/Llama-3.1-8B-Instruct") +modelName: "" +# for remote models to be loaded from MinIO, specify the path to the model in the remote bucket as modelRemote +modelRemote: "" + +### Data ### +# either dataset OR datasetRemote must be set +# to use one of the pre-existing datasets, set dataset to the dataset identifier (e.g., "gsm8k") +# available datasets: "full_hh_rlhf", "geo3k", "gsm8k", "hellaswag", "math_dataset" +dataset: "" +# for remote datasets to be loaded from MinIO, specify the path to the model in the remote bucket as datasetRemote +# Note: the dataset should be processed and stored in a format compatible with VeRL (train.parquet, test.parquet) +datasetRemote: "" + +# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload) +kaiwo: + enabled: false + +# Use to add labels to the metadata of the resources created by this workload. +labels: {} + +# Extra annotations such as an imagePullSecrets +imagePullSecrets: [] + # Example: + # imagePullSecrets: + # - "regcred" + +# Configure these to match the credentials in your cluster: +bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80 +bucketCredentialsSecret: + name: minio-credentials + accessKeyKey: minio-access-key + secretKeyKey: minio-secret-key + +# Resources: +checkpointsReservedSize: 512Gi +storageClass: mlstorage # set this to use a specific storageClass for the storage. +finetuningGpus: 1 +memoryPerGpu: 64 +cpusPerGpu: 8 + +### Model output path ### +checkpointsRemote: "" # Path where to sync checkpoints in bucket storage, format: bucketName/path/in/bucket +resumeFromCheckpoint: false # Set to true to resume from the last checkpoint in checkpointsRemote (if available) + +hfTokenSecret: {} # Optional secret reference that contains the HuggingFace token +# Example: +# hfTokenSecret: +# name: hf-token +# key: hf-token + +verlConfig: + trainer: + logger: ['console'] + test_freq: 10 + save_freq: 10 + total_epochs: 1 diff --git a/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh b/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh index 272f199..fe9d1a7 100644 --- a/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh +++ b/workloads/llm-inference-openai-benchmark-guidellm/helm/mount/entrypoint.sh @@ -5,8 +5,14 @@ mkdir -p /workload/output curl https://dl.min.io/client/mc/release/linux-amd64/mc -o /workload/mc chmod +x /workload/mc /workload/mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY} -/workload/mc mirror --watch /workload/output/ minio-host/${BUCKET_RESULT_PATH} & +/workload/mc mirror --watch /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" & MINIOPID=$! 
+sleep 1 # Give some time for the process to start +# Check if the sync process started successfully +if ! ps -p $MINIOPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 +fi OPENAI_API_BASE_URL=${OPENAI_API_BASE_URL%/} MODEL=$(curl -s ${OPENAI_API_BASE_URL}/models | jq -r '.data[0].id') @@ -28,5 +34,5 @@ guidellm benchmark --target $OPENAI_API_BASE_URL \ echo -e "<==========================\nBenchmarking completed" kill $MINIOPID wait $MINIOPID || true -/workload/mc mirror /workload/output/ minio-host/${BUCKET_RESULT_PATH} +/workload/mc mirror /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" echo "All data uploaded successfully" diff --git a/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh b/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh index 6b465c8..24db12e 100644 --- a/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh +++ b/workloads/llm-inference-openai-benchmark-rocmblog/helm/mount/entrypoint.sh @@ -53,4 +53,4 @@ chmod +x /minio-binaries/mc export PATH="${PATH}:/minio-binaries/" mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY} -mc cp --recursive $OUTPATH minio-host/${BUCKET_RESULT_PATH}/ +mc cp --recursive $OUTPATH minio-host/"${BUCKET_RESULT_PATH}"/ diff --git a/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh b/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh index 4518a74..ac7c098 100644 --- a/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh +++ b/workloads/llm-inference-vllm-benchmark-mad/helm/mount/entrypoint.sh @@ -15,8 +15,14 @@ chmod +x $WORKPATH/bin/mc mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY} # Start a background process that watches for changes and uploads them -mc mirror --watch $WORKPATH/output/ minio-host/${BUCKET_RESULT_PATH} & +mc mirror --watch $WORKPATH/output/ minio-host/"${BUCKET_RESULT_PATH}" & MINIOPID=$! +sleep 1 # Give some time for the process to start +# Check if the sync process started successfully +if ! ps -p $MINIOPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 +fi bash $WORKPATH/mount/minio_download_models.sh @@ -53,5 +59,5 @@ kill $MINIOPID wait $MINIOPID || true # Run a final mirror command to ensure all data is uploaded -mc mirror $WORKPATH/output/ minio-host/${BUCKET_RESULT_PATH} +mc mirror $WORKPATH/output/ minio-host/"${BUCKET_RESULT_PATH}" echo 'All data uploaded successfully' diff --git a/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh b/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh index 2cfd716..81bf005 100644 --- a/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh +++ b/workloads/llm-inference-vllm-benchmark-rocmblog/helm/mount/run_benchmark.sh @@ -3,8 +3,14 @@ mkdir -p /workload/output curl https://dl.min.io/client/mc/release/linux-amd64/mc -o /workload/mc chmod +x /workload/mc /workload/mc alias set minio-host ${BUCKET_STORAGE_HOST} ${BUCKET_STORAGE_ACCESS_KEY} ${BUCKET_STORAGE_SECRET_KEY} -/workload/mc mirror --watch /workload/output/ minio-host/${BUCKET_RESULT_PATH} & +/workload/mc mirror --watch /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" & MINIOPID=$! # Capture the PID of the mc mirror process +sleep 1 # Give some time for the process to start +# Check if the sync process started successfully +if ! 
ps -p $MINIOPID > /dev/null; then + echo "ERROR: Sync process failed to start" + exit 1 +fi echo "vLLM server started with PID: $SERVER_PID" ATTEMPT=0 @@ -62,5 +68,5 @@ done echo "Benchmarking completed" kill $MINIOPID wait $MINIOPID || true -/workload/mc mirror /workload/output/ minio-host/${BUCKET_RESULT_PATH} +/workload/mc mirror /workload/output/ minio-host/"${BUCKET_RESULT_PATH}" echo "All data uploaded successfully" diff --git a/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml b/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml index ad7a277..b6ca364 100644 --- a/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml +++ b/workloads/llm-megatron-ckpt-conversion/helm/templates/conversion-job.yaml @@ -30,10 +30,10 @@ spec: mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY}; echo "Listing contents of the model path:"; - mc ls minio-host/{{ .Values.remoteSourceModelPath | trimSuffix "/" }}/ || echo "Model path not found!"; + mc ls minio-host/'{{ .Values.remoteSourceModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/ || echo "Model path not found!"; echo "Copying model checkpoint to container..."; - mc cp -r minio-host/{{ .Values.remoteSourceModelPath | trimSuffix "/" }}/ /local-resources/sourcemodel || echo "Failed to copy model!"; + mc cp -r minio-host/'{{ .Values.remoteSourceModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/ /local-resources/sourcemodel || echo "Failed to copy model!"; echo "Listing contents of /local-resources/:"; ls -la /local-resources/ || echo "Local resources directory not found!"; @@ -72,7 +72,7 @@ spec: echo "Conversion done, syncing checkpoint artifacts to remote storage..."; mc mirror --overwrite \ - /local-resources/checkpoints/ minio-host/{{ .Values.remoteDestinationModelPath | trimSuffix "/" }}/; + /local-resources/checkpoints/ minio-host/'{{ .Values.remoteDestinationModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/; echo "Done uploading. 
Signal to the main container that it can exit."; diff --git a/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml b/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml index b64d515..217e3fe 100644 --- a/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml +++ b/workloads/llm-pretraining-megatron-lm/helm/templates/job.yaml @@ -26,18 +26,19 @@ spec: # Setup MinIO, Download resources: mc alias set minio-host $${BUCKET_STORAGE_HOST} $${BUCKET_STORAGE_ACCESS_KEY} $${BUCKET_STORAGE_SECRET_KEY}; echo "Copying data to container..."; - mc cp -r minio-host/{{ .Values.remoteDataDirPath | trimSuffix "/" }}/{{ .Values.remoteDataNamePrefix }} /local-resources/data; + mc cp -r minio-host/'{{ .Values.remoteDataDirPath | trimSuffix "/" | replace "'" "'\\''" }}'/'{{ .Values.remoteDataNamePrefix }}' /local-resources/data; echo "Copying tokenizer to container..."; - mc cp -r minio-host/{{ .Values.remoteTokenizerPath | trimSuffix "/" }}/ /local-resources/tokenizer; + mc cp -r minio-host/'{{ .Values.remoteTokenizerPath | trimSuffix "/" | replace "'" "'\\''" }}'/ /local-resources/tokenizer; echo "Copying model checkpoint to container..."; - if last_ckpt=$(mc cat minio-host/{{ .Values.remoteCheckpointsPath | trimSuffix "/" }}/latest_checkpointed_iteration.txt); then + {{- $remotePath := printf "minio-host/'%s'/" (.Values.remoteCheckpointsPath | trimSuffix "/" | replace "'" "'\\''") }} + if last_ckpt=$(mc cat {{ $remotePath }}latest_checkpointed_iteration.txt); then last_ckpt=$(printf 'iter_%07d' "$last_ckpt") echo "Found checkpoint at iteration $last_ckpt. Downloading ..." - mc mirror minio-host/{{ .Values.remoteCheckpointsPath | trimSuffix "/" }}/$last_ckpt/ /local-resources/basemodel/$last_ckpt - mc cp minio-host/{{ .Values.remoteCheckpointsPath | trimSuffix "/" }}/latest_checkpointed_iteration.txt /local-resources/basemodel/latest_checkpointed_iteration.txt + mc mirror {{ $remotePath }}$last_ckpt/ /local-resources/basemodel/$last_ckpt + mc cp {{ $remotePath }}latest_checkpointed_iteration.txt /local-resources/basemodel/latest_checkpointed_iteration.txt else echo "No checkpoints found yet. Downloading basemodel ..."
- mc cp -r minio-host/{{ .Values.remoteBaseModelPath | trimSuffix "/" }}/ /local-resources/basemodel; + mc cp -r minio-host/'{{ .Values.remoteBaseModelPath | trimSuffix "/" | replace "'" "'\\''" }}'/ /local-resources/basemodel; fi resources: limits: diff --git a/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml b/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml index a6da3e6..7e1f0f7 100644 --- a/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml +++ b/workloads/prepare-data-for-megatron-lm/helm/templates/prepare-data-for-megatron-lm.yaml @@ -86,10 +86,10 @@ spec: sleep 60 done - echo "Preprocessing done, syncing data to remote storage {{ .Values.bucketDataDir | trimSuffix "/" }}..."; - mc cp --recursive /downloads/datasets/ minio-host/{{ .Values.bucketDataDir | trimSuffix "/" }}/; + echo "Preprocessing done, syncing data to remote storage {{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}..."; + mc cp --recursive /downloads/datasets/ minio-host/'{{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'/; mc mirror --overwrite --exclude "**/.cache/*" \ - /downloads/tokenizer/ minio-host/{{ .Values.bucketTokenizersDir | trimSuffix "/" }}/; + /downloads/tokenizer/ minio-host/'{{ .Values.bucketTokenizersDir | trimSuffix "/" | replace "'" "'\\''" }}'/; echo "Done uploading. Signal to the main container that it can exit."; touch /downloads/done_uploading; resources: