1 change: 1 addition & 0 deletions docker/llm-evaluation/requirements.txt
@@ -3,5 +3,6 @@ dataclasses-json==0.6.7
evaluate==0.4.3
jsonlines==4.0.0
minio==7.2.15
mlflow==3.1.0
openai==1.64.0
sentencepiece==0.2.0
11 changes: 3 additions & 8 deletions docker/llm-evaluation/run_inference_and_judge_evaluation.py
@@ -89,10 +89,8 @@ async def main(args: Namespace):

saved_results = []
parameters: dict = {}
llm_url_no_protocol = args.llm_base_url.removeprefix("http://").removeprefix(
"https://"
) # the Minio python client handles protocol itself
client = get_llm_client(base_url=llm_url_no_protocol, port=args.llm_port, endpoint=args.llm_endpoint)

client = get_llm_client(base_url=args.llm_base_url, port=args.llm_port, endpoint=args.llm_endpoint)

async for inference_result in run_call_inference_container(
dataset=ds,
@@ -123,10 +121,7 @@ async def main(args: Namespace):
logger.info(inferences_data)
logger.info("Inference ran.")

judge_url_no_protocol = args.judge_base_url.removeprefix("http://").removeprefix(
"https://"
) # the Minio python client handles protocol itself
judge_client = get_llm_client(base_url=judge_url_no_protocol, port=args.judge_port, endpoint=args.judge_endpoint)
judge_client = get_llm_client(base_url=args.judge_base_url, port=args.judge_port, endpoint=args.judge_endpoint)

aggregated_judge_results = AggregatedJudgeResults(
judge_results={},
18 changes: 16 additions & 2 deletions docker/llm-evaluation/run_inference_and_metrics_evaluation.py
@@ -14,9 +14,9 @@
from llm_evaluation.call_inference_container.call_inference_container import (
save_inference_results,
)
from llm_evaluation.metrics.run_metrics_evaluation import read_inference_data
from llm_evaluation.metrics.run_metrics_evaluation import get_bert_score_distribution_graphs, read_inference_data
from llm_evaluation.metrics.run_metrics_evaluation import run as run_metrics_evaluation
from llm_evaluation.metrics.utils import save_results
from llm_evaluation.metrics.utils import log_metrics_in_mlflow, save_results


async def main(args: Namespace):
@@ -115,6 +115,20 @@ async def main(args: Namespace):

eval_results = run_metrics_evaluation(data)

distribution_graphs = get_bert_score_distribution_graphs(
scores=eval_results.scores,
)

if args.mlflow_server_uri:
logger.info("Logging results to MLFlow...")
log_metrics_in_mlflow(
distribution_graphs,
eval_results.scores,
mlflow_server_uri=args.mlflow_server_uri,
mlflow_experiment_name=args.mlflow_experiment_name,
mlflow_run_name=args.mlflow_run_name,
)

logger.info("Evaluation results:")
logger.info(eval_results)

24 changes: 23 additions & 1 deletion docker/llm-evaluation/src/llm_evaluation/argument_parsers.py
@@ -12,7 +12,11 @@ def get_inference_parser() -> ArgumentParser:
parser.add_argument("-p", "--llm-port", type=str, default="8080", help="Port number of the LLM service.")
parser.add_argument("-e", "--llm-endpoint", type=str, default="v1", help="Endpoint of the LLM service.")
parser.add_argument(
"-d", "--evaluation-dataset", type=str, default="abisee/cnn_dailymail", help="Name of the evaluation dataset."
"-d",
"--evaluation-dataset-name",
type=str,
default="abisee/cnn_dailymail",
help="Name of the evaluation dataset.",
)
parser.add_argument(
"-v", "--evaluation-dataset-version", type=str, default="3.0.0", help="Version of the evaluation dataset."
@@ -65,6 +69,24 @@ def get_inference_parser() -> ArgumentParser:
default="/home/evaluation/example_prompts/example_summary_prompt.txt",
help="Path to the prompt template file.",
)
parser.add_argument(
"--mlflow-server-uri",
type=str,
default="", # leave this argument empty to disable MLFlow tracking
help="MLFlow server URI for tracking.",
)
parser.add_argument(
"--mlflow-experiment-name",
type=str,
default="llm-evaluation-experiment",
help="MLFlow experiment name for tracking.",
)
parser.add_argument(
"--mlflow-run-name",
type=str,
default="llm-evaluation-run",
help="MLFlow run name for tracking.",
)
return parser


10 changes: 6 additions & 4 deletions docker/llm-evaluation/src/llm_evaluation/data/data_classes.py
@@ -10,10 +10,12 @@
@dataclass_json
@dataclass
class EvaluationScores:
precision_bert: float
recall_bert: float
f1_bert: float
f1_list: List[float]
precision_avg_bert: float
recall_avg_bert: float
f1_avg_bert: float
precision_list_bert: List[float]
recall_list_bert: List[float]
f1_list_bert: List[float]
bleu_score: float
accuracy: float

10 changes: 2 additions & 8 deletions docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py
@@ -7,7 +7,7 @@

def compute_bertscore(
predictions: List[str], references: List[str], language: str = "en"
) -> Tuple[float, float, float, List[float]]:
) -> Tuple[List[float], List[float], List[float]]:
"""
Computes the BERTScore for a set of predictions and references.

@@ -32,13 +32,7 @@ def compute_exact_match(
recall_list = convert_negatives_to_zero(array=np.array(results["recall"]))
f1_list = convert_negatives_to_zero(array=np.array(results["f1"]))

precision_bert = round(np.average(precision_list), 4)
recall_bert = round(np.average(recall_list), 4)
f1_bert = round(np.average(f1_list), 4)

f1_list = [round(f1, 4) for f1 in f1_list]

return precision_bert, recall_bert, f1_bert, f1_list
return precision_list, recall_list, f1_list


def compute_exact_match(
docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
@@ -6,6 +6,9 @@
from typing import Any, Dict, List

import jsonlines
import matplotlib.pyplot as plt
import mlflow
import numpy as np
from llm_evaluation import logger
from llm_evaluation.argument_parsers import get_metrics_parser
from llm_evaluation.data.data_classes import EvaluationResults, EvaluationScores
@@ -28,7 +31,13 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS

bert_score_start_time = time.time()

precision_bert, recall_bert, f1_bert, f1_list = compute_bertscore(predictions=predictions, references=references)
precision_list_bert, recall_list_bert, f1_list_bert = compute_bertscore(
predictions=predictions, references=references
)

precision_avg_bert = round(np.average(precision_list_bert), 4)
recall_avg_bert = round(np.average(recall_list_bert), 4)
f1_avg_bert = round(np.average(f1_list_bert), 4)

logger.info(f"BERT-score computation took {time.time() - bert_score_start_time:.2f} seconds")

@@ -45,15 +54,53 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS
logger.info(f"Exact match computation took {time.time() - exact_match_start_time:.2f} seconds")

return EvaluationScores(
precision_bert=precision_bert,
recall_bert=recall_bert,
f1_bert=f1_bert,
f1_list=f1_list,
precision_avg_bert=precision_avg_bert,
recall_avg_bert=recall_avg_bert,
f1_avg_bert=f1_avg_bert,
precision_list_bert=precision_list_bert,
recall_list_bert=recall_list_bert,
f1_list_bert=f1_list_bert,
bleu_score=bleu_score,
accuracy=accuracy,
)


def get_bert_score_distribution_graphs(scores: EvaluationScores) -> Dict[str, str]:
"""
Generate PNG images of the distributions of BERTScore precision, recall, and F1,
each with the mean value marked.

    Args:
        scores (EvaluationScores): Evaluation scores containing the per-sample
            BERTScore precision, recall, and F1 lists.

    Returns:
        dict: Dictionary with keys 'precision', 'recall', and 'f1', each mapping to
            the file path of the saved PNG distribution plot.
"""
results = {}
metrics = [
("precision", scores.precision_list_bert),
("recall", scores.recall_list_bert),
("f1", scores.f1_list_bert),
]
for name, values in metrics:
fig, ax = plt.subplots()
values = np.array(values)
mean_val = np.mean(values)
ax.hist(values, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
ax.axvline(mean_val, color="red", linestyle="dashed", linewidth=2, label=f"Mean: {mean_val:.4f}")
ax.set_title(f"BERTScore {name.capitalize()} Distribution")
ax.set_xlabel(name.capitalize())
ax.set_ylabel("Frequency")
ax.legend()
plt.tight_layout()
plt.savefig(f"{name}_distribution.png", format="png")
plt.close(fig)
results[name] = f"{name}_distribution.png"
return results


def read_inference_data(input_path: str) -> List[Dict[str, Any]]:
"""
Reads inference data from a file or directory containing JSON/JSONL files.
42 changes: 40 additions & 2 deletions docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
@@ -4,13 +4,14 @@
from typing import Any, Dict, List

import jsonlines
import mlflow
import numpy as np
from llm_evaluation import logger
from llm_evaluation.data.data_classes import AggregatedJudgeResults, EvaluationResults
from minio import Minio, S3Error
from numpy import ndarray


def convert_negatives_to_zero(array: ndarray) -> ndarray:
def convert_negatives_to_zero(array: np.ndarray) -> np.ndarray:
"""Converts all negative values in an array to zero.

Args:
@@ -129,3 +130,40 @@ def read_jsonl_data(input_file_path: str) -> List[Dict[str, Any]]:
for line in reader.iter(type=dict, skip_invalid=True):
generations.append(line)
return generations


def log_metrics_in_mlflow(distribution_graphs, scores, mlflow_server_uri, mlflow_experiment_name, mlflow_run_name):

logger.info(f"Using MLflow tracking URI: {mlflow_server_uri}")

experiment_description = "Evaluation of LLM using BERTScore metric."

experiment_tags = {
"project_name": mlflow_experiment_name,
"mlflow.note.content": experiment_description,
}

client = mlflow.MlflowClient(tracking_uri=mlflow_server_uri)

# Create the Experiment, providing a unique name
try:
test_experiment = client.create_experiment(name=mlflow_experiment_name, tags=experiment_tags)
logger.info(f"Created experiment with ID: {test_experiment}")
except mlflow.exceptions.MlflowException as e:
# If the experiment already exists, retrieve its ID
logger.warning(f"Experiment '{mlflow_experiment_name}' already exists. Using existing experiment.")
test_experiment = client.get_experiment_by_name(mlflow_experiment_name).experiment_id
logger.info(f"Using existing experiment with ID: {test_experiment}")

mlflow.set_tracking_uri(mlflow_server_uri)
mlflow.set_experiment(experiment_name=mlflow_experiment_name)
with mlflow.start_run(run_name=mlflow_run_name, experiment_id=test_experiment) as run:

        # Log the aggregated BERTScore metrics once for the run
        mlflow.log_metric("bert_score_mean_precision", scores.precision_avg_bert)
        mlflow.log_metric("bert_score_mean_recall", scores.recall_avg_bert)
        mlflow.log_metric("bert_score_mean_f1", scores.f1_avg_bert)

        # Log each distribution graph as an artifact of the run
        for file in distribution_graphs.values():
            logger.info(
                f"Saving artifact {file} (abs path: {os.path.abspath(file)}) to MLflow run {run.info.run_id}..."
            )
            mlflow.log_artifact(os.path.abspath(file), artifact_path="metrics_distributions")
1 change: 1 addition & 0 deletions docker/logistics/requirements.txt
@@ -4,3 +4,4 @@ google-cloud-storage
hf_transfer
huggingface_hub[cli]
minio
wandb
50 changes: 50 additions & 0 deletions docs/contributing.md
@@ -18,6 +18,56 @@ Thank you for considering contributing to the SiloGen AI Workloads development!
# install packages you need
```

### Pre-commit setup

We use [pre-commit](https://pre-commit.com/) for consistent formatting and cleaner code. Hooks are specified in `ai-workloads-dev/.pre-commit-config.yaml`.

To install:<br />
`cd ai-workloads-dev` (this is necessary because `pre-commit install` operates on the current git repository)<br />
`source your_venv/bin/activate`<br />
`pip install pre-commit`<br />
`pre-commit install --config .pre-commit-config.yaml`<br />
`git commit -m "test commit"`<br />

With the final command, pre-commit should run automatically, producing output like the following:

>check json...........................................(no files to check)Skipped<br />
check yaml...........................................(no files to check)Skipped<br />
fix end of files.....................................(no files to check)Skipped<br />
fix requirements.txt.................................(no files to check)Skipped<br />
trim trailing whitespace.............................(no files to check)Skipped<br />
black................................................(no files to check)Skipped<br />
flake8...............................................(no files to check)Skipped<br />
isort (python).......................................(no files to check)Skipped<br />
mypy.................................................(no files to check)Skipped<br />
helmlint.............................................(no files to check)Skipped<br />

It's also possible to run pre-commit manually on all files using

`pre-commit run --all-files`
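
For orientation, the hooks in the sample output above correspond to a configuration roughly like the sketch below. The repository URLs and `rev` pins are illustrative assumptions; the authoritative hook list is whatever `ai-workloads-dev/.pre-commit-config.yaml` actually contains.

```yaml
# Illustrative sketch only: the rev pins and repo choices below are assumptions,
# not the actual contents of ai-workloads-dev/.pre-commit-config.yaml.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: check-json
      - id: check-yaml
      - id: end-of-file-fixer        # "fix end of files"
      - id: requirements-txt-fixer   # "fix requirements.txt"
      - id: trailing-whitespace      # "trim trailing whitespace"
  - repo: https://github.com/psf/black
    rev: 24.4.2
    hooks:
      - id: black
  - repo: https://github.com/pycqa/flake8
    rev: 7.1.0
    hooks:
      - id: flake8
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.10.0
    hooks:
      - id: mypy
  - repo: https://github.com/gruntwork-io/pre-commit
    rev: v0.1.23
    hooks:
      - id: helmlint
```

Once `pre-commit install` has been run, these hooks execute against the staged files on every `git commit`; `pre-commit run --all-files` applies them to the whole repository.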

#### Troubleshooting pre-commit

Many pre-commit problems come from having the wrong copy of pre-commit active. Pre-commit can linger as a system-wide install, inside Python venvs, or in your pre-commit cache.

It's easiest to use pre-commit from within a Python virtual environment. To check that the right pre-commit is being picked up, run `which pre-commit` and confirm that it points to the binary inside your venv, for example `/../../venvs/your_venv/bin/pre-commit`. A different path indicates that your system is resolving the wrong pre-commit installation.


If the wrong copy is active, remove the stray installations:

To remove a system-wide install:<br />
`brew uninstall pre-commit` (macOS)<br />
`sudo apt remove pre-commit` (Linux)

To remove it from a venv:<br />
`pip uninstall pre-commit`

To uninstall just the pre-commit hooks and clear the hook cache:<br />
`pre-commit uninstall`<br />
`pre-commit clean`


Then reinstall pre-commit from scratch as described above.


## Development Workflow

1. Create a branch for your feature or bugfix:
13 changes: 11 additions & 2 deletions workloads/dev-workspace-jupyterlab/helm/values.yaml
@@ -47,8 +47,17 @@ entrypoint: |
pip install pipx ipykernel
pipx install --include-deps jupyter
pipx inject --include-deps jupyter jupyterlab-lsp 'python-lsp-server[all]' ipywidgets jupyterlab-git jupyterlab_code_formatter
python -m ipykernel install --user --name=default-python3
jupyter-lab --ServerApp.token='' --ServerApp.ip='0.0.0.0' --ServerApp.allow_root=True --ServerApp.base_url=$BASE_URL --no-browser --ServerApp.root_dir='/workload'
python -m ipykernel install --user --name=default-python3 --display-name="Python 3 (default)"

jupyter-lab --no-browser \
--IdentityProvider.token='' \
--ServerApp.ip='0.0.0.0' \
--ServerApp.allow_root=True \
--ServerApp.base_url=$BASE_URL \
--ServerApp.root_dir='/workload' \
--MultiKernelManager.default_kernel_name=default-python3 \
--KernelSpecManager.allowed_kernelspecs=default-python3 \
--KernelSpecManager.ensure_native_kernel=False

# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload)
kaiwo:
Expand Down
4 changes: 2 additions & 2 deletions workloads/download-data-to-bucket/helm/templates/job.yaml
@@ -33,8 +33,8 @@ spec:
mkdir -p /downloads/datasets
python /scripts/data_script.py
########################
echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" }}'
mc cp -recursive /downloads/datasets/ minio-host/{{ .Values.bucketDataDir | trimSuffix "/" }}/
echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'
mc cp -recursive /downloads/datasets/ minio-host/'{{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'/
########################
echo 'Done'
env:
@@ -47,13 +47,14 @@ spec:
{{- end }}
--local-dir local_models/downloaded_model
###################################
echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" }}'
echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" | replace "'" "'\\''" }}'
{{- $remotePath := printf "minio-host/'%s'/" (.Values.bucketPath | trimSuffix "/" | replace "'" "'\\''") }}
mc mirror --exclude '.cache/huggingface/*' \
--exclude '.gitattributes' \
{{- if .Values.allowOverwrite }}
--overwrite \
{{- end }}
local_models/downloaded_model/ minio-host/{{ .Values.bucketPath | trimSuffix "/" }}
local_models/downloaded_model/ {{ $remotePath }}
env:
{{- if .Values.hfTokenSecret }}
- name: HF_TOKEN
4 changes: 4 additions & 0 deletions workloads/download-wandb-model-to-bucket/helm/Chart.yaml
@@ -0,0 +1,4 @@
apiVersion: v2
name: download-wandb-model-to-bucket
description: A Helm chart for downloading a Weights and Biases model to a bucket
version: 0.0.1