1 change: 1 addition & 0 deletions docker/llm-evaluation/requirements.txt
@@ -3,5 +3,6 @@ dataclasses-json==0.6.7
evaluate==0.4.3
jsonlines==4.0.0
minio==7.2.15
mlflow==3.1.0
openai==1.64.0
sentencepiece==0.2.0
11 changes: 3 additions & 8 deletions docker/llm-evaluation/run_inference_and_judge_evaluation.py
@@ -89,10 +89,8 @@ async def main(args: Namespace):

saved_results = []
parameters: dict = {}
llm_url_no_protocol = args.llm_base_url.removeprefix("http://").removeprefix(
"https://"
) # the Minio python client handles protocol itself
client = get_llm_client(base_url=llm_url_no_protocol, port=args.llm_port, endpoint=args.llm_endpoint)

client = get_llm_client(base_url=args.llm_base_url, port=args.llm_port, endpoint=args.llm_endpoint)

async for inference_result in run_call_inference_container(
dataset=ds,
@@ -123,10 +121,7 @@ async def main(args: Namespace):
logger.info(inferences_data)
logger.info("Inference ran.")

judge_url_no_protocol = args.judge_base_url.removeprefix("http://").removeprefix(
"https://"
) # the Minio python client handles protocol itself
judge_client = get_llm_client(base_url=judge_url_no_protocol, port=args.judge_port, endpoint=args.judge_endpoint)
judge_client = get_llm_client(base_url=args.judge_base_url, port=args.judge_port, endpoint=args.judge_endpoint)

aggregated_judge_results = AggregatedJudgeResults(
judge_results={},
18 changes: 16 additions & 2 deletions docker/llm-evaluation/run_inference_and_metrics_evaluation.py
@@ -14,9 +14,9 @@
from llm_evaluation.call_inference_container.call_inference_container import (
save_inference_results,
)
from llm_evaluation.metrics.run_metrics_evaluation import read_inference_data
from llm_evaluation.metrics.run_metrics_evaluation import get_bert_score_distribution_graphs, read_inference_data
from llm_evaluation.metrics.run_metrics_evaluation import run as run_metrics_evaluation
from llm_evaluation.metrics.utils import save_results
from llm_evaluation.metrics.utils import log_metrics_in_mlflow, save_results


async def main(args: Namespace):
@@ -115,6 +115,20 @@ async def main(args: Namespace):

eval_results = run_metrics_evaluation(data)

distribution_graphs = get_bert_score_distribution_graphs(
scores=eval_results.scores,
)

if args.mlflow_server_uri:
logger.info("Logging results to MLFlow...")
log_metrics_in_mlflow(
distribution_graphs,
eval_results.scores,
mlflow_server_uri=args.mlflow_server_uri,
mlflow_experiment_name=args.mlflow_experiment_name,
mlflow_run_name=args.mlflow_run_name,
)

logger.info("Evaluation results:")
logger.info(eval_results)

24 changes: 23 additions & 1 deletion docker/llm-evaluation/src/llm_evaluation/argument_parsers.py
@@ -12,7 +12,11 @@ def get_inference_parser() -> ArgumentParser:
parser.add_argument("-p", "--llm-port", type=str, default="8080", help="Port number of the LLM service.")
parser.add_argument("-e", "--llm-endpoint", type=str, default="v1", help="Endpoint of the LLM service.")
parser.add_argument(
"-d", "--evaluation-dataset", type=str, default="abisee/cnn_dailymail", help="Name of the evaluation dataset."
"-d",
"--evaluation-dataset-name",
type=str,
default="abisee/cnn_dailymail",
help="Name of the evaluation dataset.",
)
parser.add_argument(
"-v", "--evaluation-dataset-version", type=str, default="3.0.0", help="Version of the evaluation dataset."
@@ -65,6 +69,24 @@ def get_inference_parser() -> ArgumentParser:
default="/home/evaluation/example_prompts/example_summary_prompt.txt",
help="Path to the prompt template file.",
)
parser.add_argument(
"--mlflow-server-uri",
type=str,
default="", # leave this argument empty to disable MLFlow tracking
help="MLFlow server URI for tracking.",
)
parser.add_argument(
"--mlflow-experiment-name",
type=str,
default="llm-evaluation-experiment",
help="MLFlow experiment name for tracking.",
)
parser.add_argument(
"--mlflow-run-name",
type=str,
default="llm-evaluation-run",
help="MLFlow run name for tracking.",
)
return parser


10 changes: 6 additions & 4 deletions docker/llm-evaluation/src/llm_evaluation/data/data_classes.py
@@ -10,10 +10,12 @@
@dataclass_json
@dataclass
class EvaluationScores:
precision_bert: float
recall_bert: float
f1_bert: float
f1_list: List[float]
precision_avg_bert: float
recall_avg_bert: float
f1_avg_bert: float
precision_list_bert: List[float]
recall_list_bert: List[float]
f1_list_bert: List[float]
bleu_score: float
accuracy: float

10 changes: 2 additions & 8 deletions docker/llm-evaluation/src/llm_evaluation/metrics/metrics.py
@@ -7,7 +7,7 @@

def compute_bertscore(
predictions: List[str], references: List[str], language: str = "en"
) -> Tuple[float, float, float, List[float]]:
) -> Tuple[List[float], List[float], List[float]]:
"""
Computes the BERTScore for a set of predictions and references.

@@ -32,13 +32,7 @@ def compute_exact_match(
recall_list = convert_negatives_to_zero(array=np.array(results["recall"]))
f1_list = convert_negatives_to_zero(array=np.array(results["f1"]))

precision_bert = round(np.average(precision_list), 4)
recall_bert = round(np.average(recall_list), 4)
f1_bert = round(np.average(f1_list), 4)

f1_list = [round(f1, 4) for f1 in f1_list]

return precision_bert, recall_bert, f1_bert, f1_list
return precision_list, recall_list, f1_list


def compute_exact_match(
docker/llm-evaluation/src/llm_evaluation/metrics/run_metrics_evaluation.py
@@ -6,6 +6,9 @@
from typing import Any, Dict, List

import jsonlines
import matplotlib.pyplot as plt
import mlflow
import numpy as np
from llm_evaluation import logger
from llm_evaluation.argument_parsers import get_metrics_parser
from llm_evaluation.data.data_classes import EvaluationResults, EvaluationScores
@@ -28,7 +31,13 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS

bert_score_start_time = time.time()

precision_bert, recall_bert, f1_bert, f1_list = compute_bertscore(predictions=predictions, references=references)
precision_list_bert, recall_list_bert, f1_list_bert = compute_bertscore(
predictions=predictions, references=references
)

precision_avg_bert = round(np.average(precision_list_bert), 4)
recall_avg_bert = round(np.average(recall_list_bert), 4)
f1_avg_bert = round(np.average(f1_list_bert), 4)

logger.info(f"BERT-score computation took {time.time() - bert_score_start_time:.2f} seconds")

@@ -45,15 +54,53 @@ def compute_scores(predictions: List[str], references: List[str]) -> EvaluationS
logger.info(f"Exact match computation took {time.time() - exact_match_start_time:.2f} seconds")

return EvaluationScores(
precision_bert=precision_bert,
recall_bert=recall_bert,
f1_bert=f1_bert,
f1_list=f1_list,
precision_avg_bert=precision_avg_bert,
recall_avg_bert=recall_avg_bert,
f1_avg_bert=f1_avg_bert,
precision_list_bert=precision_list_bert,
recall_list_bert=recall_list_bert,
f1_list_bert=f1_list_bert,
bleu_score=bleu_score,
accuracy=accuracy,
)


def get_bert_score_distribution_graphs(scores: EvaluationScores) -> Dict[str, str]:
"""
Generate PNG images of the distributions of BERTScore precision, recall, and F1,
each with the mean value marked.

    Args:
        scores (EvaluationScores): Evaluation scores containing the per-sample
            BERTScore precision, recall, and F1 lists.

    Returns:
        dict: Dictionary with keys 'precision', 'recall', and 'f1', each mapping to
            the file path of the saved PNG distribution plot.
"""
results = {}
metrics = [
("precision", scores.precision_list_bert),
("recall", scores.recall_list_bert),
("f1", scores.f1_list_bert),
]
for name, values in metrics:
fig, ax = plt.subplots()
values = np.array(values)
mean_val = np.mean(values)
ax.hist(values, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
ax.axvline(mean_val, color="red", linestyle="dashed", linewidth=2, label=f"Mean: {mean_val:.4f}")
ax.set_title(f"BERTScore {name.capitalize()} Distribution")
ax.set_xlabel(name.capitalize())
ax.set_ylabel("Frequency")
ax.legend()
plt.tight_layout()
plt.savefig(f"{name}_distribution.png", format="png")
plt.close(fig)
results[name] = f"{name}_distribution.png"
return results


def read_inference_data(input_path: str) -> List[Dict[str, Any]]:
"""
Reads inference data from a file or directory containing JSON/JSONL files.
42 changes: 40 additions & 2 deletions docker/llm-evaluation/src/llm_evaluation/metrics/utils.py
@@ -4,13 +4,14 @@
from typing import Any, Dict, List

import jsonlines
import mlflow
import numpy as np
from llm_evaluation import logger
from llm_evaluation.data.data_classes import AggregatedJudgeResults, EvaluationResults
from minio import Minio, S3Error
from numpy import ndarray


def convert_negatives_to_zero(array: ndarray) -> ndarray:
def convert_negatives_to_zero(array: np.ndarray) -> np.ndarray:
"""Converts all negative values in an array to zero.

Args:
@@ -129,3 +130,40 @@ def read_jsonl_data(input_file_path: str) -> List[Dict[str, Any]]:
for line in reader.iter(type=dict, skip_invalid=True):
generations.append(line)
return generations


def log_metrics_in_mlflow(distribution_graphs, scores, mlflow_server_uri, mlflow_experiment_name, mlflow_run_name):

logger.info(f"Using MLflow tracking URI: {mlflow_server_uri}")

experiment_description = "Evaluation of LLM using BERTScore metric."

experiment_tags = {
"project_name": mlflow_experiment_name,
"mlflow.note.content": experiment_description,
}

client = mlflow.MlflowClient(tracking_uri=mlflow_server_uri)

# Create the Experiment, providing a unique name
try:
test_experiment = client.create_experiment(name=mlflow_experiment_name, tags=experiment_tags)
logger.info(f"Created experiment with ID: {test_experiment}")
except mlflow.exceptions.MlflowException as e:
# If the experiment already exists, retrieve its ID
logger.warning(f"Experiment '{mlflow_experiment_name}' already exists. Using existing experiment.")
test_experiment = client.get_experiment_by_name(mlflow_experiment_name).experiment_id
logger.info(f"Using existing experiment with ID: {test_experiment}")

mlflow.set_tracking_uri(mlflow_server_uri)
mlflow.set_experiment(experiment_name=mlflow_experiment_name)
with mlflow.start_run(run_name=mlflow_run_name, experiment_id=test_experiment) as run:

        # Log the aggregated BERTScore metrics once for the run
        mlflow.log_metric("bert_score_mean_precision", scores.precision_avg_bert)
        mlflow.log_metric("bert_score_mean_recall", scores.recall_avg_bert)
        mlflow.log_metric("bert_score_mean_f1", scores.f1_avg_bert)

        # Log each distribution graph as an artifact of the run
        for file in distribution_graphs.values():
            logger.info(
                f"Saving artifact {file} (abs path: {os.path.abspath(file)}) to MLflow run {run.info.run_id}..."
            )
            mlflow.log_artifact(os.path.abspath(file), artifact_path="metrics_distributions")
1 change: 1 addition & 0 deletions docker/logistics/requirements.txt
@@ -4,3 +4,4 @@ google-cloud-storage
hf_transfer
huggingface_hub[cli]
minio
wandb
50 changes: 50 additions & 0 deletions docs/contributing.md
@@ -18,6 +18,56 @@ Thank you for considering contributing to the SiloGen AI Workloads development!
# install packages you need
```

### Pre-commit setup

We use [pre-commit](https://pre-commit.com/) for consistent formatting and cleaner code. Hooks are specified in `ai-workloads-dev/.pre-commit-config.yaml`.

To install:<br />
`cd ai-workloads-dev` (this is necessary because `pre-commit install` operates on the current git repository)<br />
`source your_venv/bin/activate`<br />
`pip install pre-commit`<br />
`pre-commit install --config .pre-commit-config.yaml`<br />
`git commit -m "test commit"`<br />

With the final command, pre-commit should run automatically, producing output like the following:

>check json...........................................(no files to check)Skipped<br />
check yaml...........................................(no files to check)Skipped<br />
fix end of files.....................................(no files to check)Skipped<br />
fix requirements.txt.................................(no files to check)Skipped<br />
trim trailing whitespace.............................(no files to check)Skipped<br />
black................................................(no files to check)Skipped<br />
flake8...............................................(no files to check)Skipped<br />
isort (python).......................................(no files to check)Skipped<br />
mypy.................................................(no files to check)Skipped<br />
helmlint.............................................(no files to check)Skipped<br />

It's also possible to run pre-commit manually on all files using

`pre-commit run --all-files`
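
For orientation, the hooks in the sample output above correspond to a configuration roughly like the sketch below. The repository URLs and `rev` pins are illustrative assumptions; the authoritative hook list is whatever `ai-workloads-dev/.pre-commit-config.yaml` actually contains.

```yaml
# Illustrative sketch only: the rev pins and repo choices below are assumptions,
# not the actual contents of ai-workloads-dev/.pre-commit-config.yaml.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.6.0
    hooks:
      - id: check-json
      - id: check-yaml
      - id: end-of-file-fixer        # "fix end of files"
      - id: requirements-txt-fixer   # "fix requirements.txt"
      - id: trailing-whitespace      # "trim trailing whitespace"
  - repo: https://github.com/psf/black
    rev: 24.4.2
    hooks:
      - id: black
  - repo: https://github.com/pycqa/flake8
    rev: 7.1.0
    hooks:
      - id: flake8
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.10.0
    hooks:
      - id: mypy
  - repo: https://github.com/gruntwork-io/pre-commit
    rev: v0.1.23
    hooks:
      - id: helmlint
```

Once `pre-commit install` has been run, these hooks execute against the staged files on every `git commit`; `pre-commit run --all-files` applies them to the whole repository.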

#### Troubleshooting pre-commit

Many pre-commit problems come from having the wrong copy of pre-commit active. Pre-commit can linger as a system-wide install, inside Python venvs, or in your pre-commit cache.

It's easiest to use pre-commit from within a Python virtual environment. To check that the right pre-commit is being picked up, run `which pre-commit` and confirm that it points to the binary inside your venv, for example `/../../venvs/your_venv/bin/pre-commit`. A different path indicates that your system is resolving the wrong pre-commit installation.


If the wrong copy is active, remove the stray installations:

To remove a system-wide install:<br />
`brew uninstall pre-commit` (macOS)<br />
`sudo apt remove pre-commit` (Linux)

To remove it from a venv:<br />
`pip uninstall pre-commit`

To uninstall just the pre-commit hooks and clear the hook cache:<br />
`pre-commit uninstall`<br />
`pre-commit clean`


Then reinstall pre-commit from scratch as described above.


## Development Workflow

1. Create a branch for your feature or bugfix:
13 changes: 11 additions & 2 deletions workloads/dev-workspace-jupyterlab/helm/values.yaml
@@ -47,8 +47,17 @@ entrypoint: |
pip install pipx ipykernel
pipx install --include-deps jupyter
pipx inject --include-deps jupyter jupyterlab-lsp 'python-lsp-server[all]' ipywidgets jupyterlab-git jupyterlab_code_formatter
python -m ipykernel install --user --name=default-python3
jupyter-lab --ServerApp.token='' --ServerApp.ip='0.0.0.0' --ServerApp.allow_root=True --ServerApp.base_url=$BASE_URL --no-browser --ServerApp.root_dir='/workload'
python -m ipykernel install --user --name=default-python3 --display-name="Python 3 (default)"

jupyter-lab --no-browser \
--IdentityProvider.token='' \
--ServerApp.ip='0.0.0.0' \
--ServerApp.allow_root=True \
--ServerApp.base_url=$BASE_URL \
--ServerApp.root_dir='/workload' \
--MultiKernelManager.default_kernel_name=default-python3 \
--KernelSpecManager.allowed_kernelspecs=default-python3 \
--KernelSpecManager.ensure_native_kernel=False

# kaiwo settings (if enabled, use kaiwo CRDs to have kaiwo operator manage the workload)
kaiwo:
Expand Down
4 changes: 2 additions & 2 deletions workloads/download-data-to-bucket/helm/templates/job.yaml
@@ -33,8 +33,8 @@ spec:
mkdir -p /downloads/datasets
python /scripts/data_script.py
########################
echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" }}'
mc cp -recursive /downloads/datasets/ minio-host/{{ .Values.bucketDataDir | trimSuffix "/" }}/
echo 'Uploading data to the bucket, to {{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'
mc cp -recursive /downloads/datasets/ minio-host/'{{ .Values.bucketDataDir | trimSuffix "/" | replace "'" "'\\''" }}'/
########################
echo 'Done'
env:
@@ -47,13 +47,14 @@ spec:
{{- end }}
--local-dir local_models/downloaded_model
###################################
echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" }}'
echo 'Uploading the model to the bucket, to {{ .Values.bucketPath | trimSuffix "/" | replace "'" "'\\''" }}'
{{- $remotePath := printf "minio-host/'%s'/" (.Values.bucketPath | trimSuffix "/" | replace "'" "'\\''") }}
mc mirror --exclude '.cache/huggingface/*' \
--exclude '.gitattributes' \
{{- if .Values.allowOverwrite }}
--overwrite \
{{- end }}
local_models/downloaded_model/ minio-host/{{ .Values.bucketPath | trimSuffix "/" }}
local_models/downloaded_model/ {{ $remotePath }}
env:
{{- if .Values.hfTokenSecret }}
- name: HF_TOKEN
4 changes: 4 additions & 0 deletions workloads/download-wandb-model-to-bucket/helm/Chart.yaml
@@ -0,0 +1,4 @@
apiVersion: v2
name: download-wandb-model-to-bucket
description: A Helm chart for downloading a Weights and Biases model to a bucket
version: 0.0.1