From 77a5d9bbb996d083054165ab78b71f2754c2c244 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Mon, 26 Jan 2026 16:18:17 +0100 Subject: [PATCH 1/6] Align Laminar runs with CI eval IDs Co-authored-by: openhands --- benchmarks/utils/evaluation.py | 27 ++++++++++++++++--- benchmarks/utils/laminar.py | 49 ++++++++++++++++++++++++++++++---- 2 files changed, 68 insertions(+), 8 deletions(-) diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index e2ae1db5..1a9be010 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -11,7 +11,7 @@ from contextlib import contextmanager from datetime import datetime, timezone from pathlib import Path -from typing import Callable, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple from uuid import UUID from lmnr import Laminar @@ -252,16 +252,37 @@ def _run_iterative_mode( """Run evaluation with support for single or multiple attempts.""" all_instances = self.prepare_instances() + run_id = os.getenv("UNIQUE_EVAL_NAME") + benchmark_name = self.metadata.dataset + model_name = getattr(self.metadata.llm, "model", None) + # Initialize Laminar - LaminarService.get().initialize() + LaminarService.get().initialize(trace_session_id=run_id) + trace_metadata: dict[str, Any] = {} + if benchmark_name: + trace_metadata["benchmark"] = benchmark_name + if model_name: + trace_metadata["model"] = model_name + try: + if trace_metadata: + Laminar.set_trace_metadata(trace_metadata) + except Exception: # pragma: no cover - defensive logging + logger.debug("Failed to set Laminar trace metadata", exc_info=True) # Create Laminar evaluation now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + eval_metadata = self.metadata.model_dump(mode="json") + if benchmark_name: + eval_metadata["benchmark"] = benchmark_name + if model_name: + eval_metadata["model"] = model_name + self.metadata.lmnr = LaminarEvalMetadata( eval_id=LaminarService.get().create_evaluation( name=f"{self.metadata.dataset} {self.metadata.dataset_split} {now}", group_name=f"{self.metadata.dataset} {self.metadata.dataset_split}", - metadata=self.metadata.model_dump(mode="json"), + metadata=eval_metadata, + run_id=run_id, ) ) diff --git a/benchmarks/utils/laminar.py b/benchmarks/utils/laminar.py index 24b91ed2..f861f8b7 100644 --- a/benchmarks/utils/laminar.py +++ b/benchmarks/utils/laminar.py @@ -34,6 +34,7 @@ class LaminarService: def __init__(self) -> None: self._client: LaminarClient | None = None self._laminar_initialized = False + self._trace_session_id: str | None = None @classmethod def get(cls) -> "LaminarService": @@ -44,13 +45,34 @@ def get(cls) -> "LaminarService": def _is_enabled(self) -> bool: return bool(os.environ.get("LMNR_PROJECT_API_KEY")) - def initialize(self) -> bool: + def _get_trace_session_id(self, override: str | None = None) -> str | None: + """Prefer CI/eval IDs so Laminar traces align with job IDs.""" + + if override: + return override + return os.environ.get("UNIQUE_EVAL_NAME") + + def _set_trace_session_id(self, session_id: str | None) -> None: + if not session_id: + return + try: + Laminar.set_trace_session_id(session_id) + except Exception as exc: # pragma: no cover - defensive logging + logger.debug("Failed to set Laminar trace session id: %s", exc) + + def initialize(self, trace_session_id: str | None = None) -> bool: """ Initialize the Laminar SDK once per process. Returns True if initialization succeeded (or was already done), False otherwise. """ + session_id = self._get_trace_session_id(trace_session_id) + if self._laminar_initialized: + # Allow late-binding the session ID if Laminar was initialized earlier. + if session_id and session_id != self._trace_session_id: + self._trace_session_id = session_id + self._set_trace_session_id(session_id) return True if not self._is_enabled(): @@ -58,6 +80,8 @@ def initialize(self) -> bool: try: Laminar.initialize() + self._trace_session_id = session_id + self._set_trace_session_id(session_id) except Exception as exc: # pragma: no cover - defensive logging logger.debug("Failed to initialize Laminar SDK: %s", exc) return False @@ -79,23 +103,38 @@ def _get_client(self) -> LaminarClient | None: return self._client def create_evaluation( - self, name: str, group_name: str, metadata: dict[str, Any] | None = None + self, + name: str, + group_name: str, + metadata: dict[str, Any] | None = None, + run_id: str | None = None, ): client = self._get_client() if client is None: return None + if run_id: + self._trace_session_id = run_id + self._set_trace_session_id(run_id) + elif self._trace_session_id is None: + self._trace_session_id = self._get_trace_session_id() + + eval_name = run_id or name + metadata_payload = metadata.copy() if metadata else {} + if run_id: + metadata_payload.setdefault("run_id", run_id) + try: eval_id = client.evals.create_evaluation( - name=name, + name=eval_name, group_name=group_name, - metadata=metadata, + metadata=metadata_payload, ) return eval_id except Exception as exc: # pragma: no cover - defensive logging logger.debug( "Laminar evaluation %s (%s): %s", - name, + eval_name, group_name, exc, ) From f4d8a9bd784a47048c1b7d5bb171471a11b33bf7 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Mon, 26 Jan 2026 16:24:39 +0100 Subject: [PATCH 2/6] Use LLM.model for Laminar metadata Co-authored-by: openhands --- benchmarks/utils/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index 1a9be010..df02be85 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -254,7 +254,7 @@ def _run_iterative_mode( run_id = os.getenv("UNIQUE_EVAL_NAME") benchmark_name = self.metadata.dataset - model_name = getattr(self.metadata.llm, "model", None) + model_name = self.metadata.llm.model # Initialize Laminar LaminarService.get().initialize(trace_session_id=run_id) From 25240fcac0dcddf3d1fa55b856fb1df7078fb553 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Mon, 26 Jan 2026 16:26:37 +0100 Subject: [PATCH 3/6] Fail fast setting Laminar trace metadata Co-authored-by: openhands --- benchmarks/utils/evaluation.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index df02be85..3835c695 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -263,11 +263,8 @@ def _run_iterative_mode( trace_metadata["benchmark"] = benchmark_name if model_name: trace_metadata["model"] = model_name - try: - if trace_metadata: - Laminar.set_trace_metadata(trace_metadata) - except Exception: # pragma: no cover - defensive logging - logger.debug("Failed to set Laminar trace metadata", exc_info=True) + if trace_metadata: + Laminar.set_trace_metadata(trace_metadata) # Create Laminar evaluation now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") From 4ee4ca3f3e1d4b5e7f8cb0e9566dfb9f04590a7c Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Mon, 26 Jan 2026 17:41:48 +0100 Subject: [PATCH 4/6] Send minimal Laminar eval metadata Co-authored-by: openhands --- benchmarks/utils/evaluation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index 3835c695..aeea0222 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -268,7 +268,7 @@ def _run_iterative_mode( # Create Laminar evaluation now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - eval_metadata = self.metadata.model_dump(mode="json") + eval_metadata: dict[str, Any] = {} if benchmark_name: eval_metadata["benchmark"] = benchmark_name if model_name: From fc396ea6e327089bc1007b5530e18e0086b84b91 Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 27 Jan 2026 11:37:50 +0100 Subject: [PATCH 5/6] Simplify Laminar session handling Co-authored-by: openhands --- benchmarks/utils/laminar.py | 38 ++++++++----------------------------- 1 file changed, 8 insertions(+), 30 deletions(-) diff --git a/benchmarks/utils/laminar.py b/benchmarks/utils/laminar.py index f861f8b7..fc11d828 100644 --- a/benchmarks/utils/laminar.py +++ b/benchmarks/utils/laminar.py @@ -34,7 +34,6 @@ class LaminarService: def __init__(self) -> None: self._client: LaminarClient | None = None self._laminar_initialized = False - self._trace_session_id: str | None = None @classmethod def get(cls) -> "LaminarService": @@ -45,34 +44,13 @@ def get(cls) -> "LaminarService": def _is_enabled(self) -> bool: return bool(os.environ.get("LMNR_PROJECT_API_KEY")) - def _get_trace_session_id(self, override: str | None = None) -> str | None: - """Prefer CI/eval IDs so Laminar traces align with job IDs.""" - - if override: - return override - return os.environ.get("UNIQUE_EVAL_NAME") - - def _set_trace_session_id(self, session_id: str | None) -> None: - if not session_id: - return - try: - Laminar.set_trace_session_id(session_id) - except Exception as exc: # pragma: no cover - defensive logging - logger.debug("Failed to set Laminar trace session id: %s", exc) - def initialize(self, trace_session_id: str | None = None) -> bool: """ Initialize the Laminar SDK once per process. Returns True if initialization succeeded (or was already done), False otherwise. """ - session_id = self._get_trace_session_id(trace_session_id) - if self._laminar_initialized: - # Allow late-binding the session ID if Laminar was initialized earlier. - if session_id and session_id != self._trace_session_id: - self._trace_session_id = session_id - self._set_trace_session_id(session_id) return True if not self._is_enabled(): @@ -80,8 +58,8 @@ def initialize(self, trace_session_id: str | None = None) -> bool: try: Laminar.initialize() - self._trace_session_id = session_id - self._set_trace_session_id(session_id) + if trace_session_id: + Laminar.set_trace_session_id(trace_session_id) except Exception as exc: # pragma: no cover - defensive logging logger.debug("Failed to initialize Laminar SDK: %s", exc) return False @@ -113,17 +91,17 @@ def create_evaluation( if client is None: return None - if run_id: - self._trace_session_id = run_id - self._set_trace_session_id(run_id) - elif self._trace_session_id is None: - self._trace_session_id = self._get_trace_session_id() - eval_name = run_id or name metadata_payload = metadata.copy() if metadata else {} if run_id: metadata_payload.setdefault("run_id", run_id) + if run_id: + try: + Laminar.set_trace_session_id(run_id) + except Exception as exc: # pragma: no cover - defensive logging + logger.debug("Failed to set Laminar trace session id: %s", exc) + try: eval_id = client.evals.create_evaluation( name=eval_name, From 140c17363d0018b4007acf28f51562e9671cb36d Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Tue, 27 Jan 2026 11:41:44 +0100 Subject: [PATCH 6/6] Reuse Laminar metadata for trace and eval Co-authored-by: openhands --- benchmarks/utils/evaluation.py | 47 +++++++++++++++++----------------- benchmarks/utils/laminar.py | 39 ++++++++++------------------ 2 files changed, 38 insertions(+), 48 deletions(-) diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index aeea0222..e782f98d 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -11,7 +11,7 @@ from contextlib import contextmanager from datetime import datetime, timezone from pathlib import Path -from typing import Any, Callable, List, Optional, Tuple +from typing import Callable, List, Optional, Tuple from uuid import UUID from lmnr import Laminar @@ -252,36 +252,35 @@ def _run_iterative_mode( """Run evaluation with support for single or multiple attempts.""" all_instances = self.prepare_instances() + # Initialize Laminar + LaminarService.get().initialize() + + # Build metadata for Laminar evaluation and traces run_id = os.getenv("UNIQUE_EVAL_NAME") - benchmark_name = self.metadata.dataset - model_name = self.metadata.llm.model + laminar_meta = { + k: v + for k, v in [ + ("benchmark", self.metadata.dataset), + ("model", self.metadata.llm.model), + ] + if v + } - # Initialize Laminar - LaminarService.get().initialize(trace_session_id=run_id) - trace_metadata: dict[str, Any] = {} - if benchmark_name: - trace_metadata["benchmark"] = benchmark_name - if model_name: - trace_metadata["model"] = model_name - if trace_metadata: - Laminar.set_trace_metadata(trace_metadata) - - # Create Laminar evaluation + # Create Laminar evaluation (use run_id as name if available) now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - eval_metadata: dict[str, Any] = {} - if benchmark_name: - eval_metadata["benchmark"] = benchmark_name - if model_name: - eval_metadata["model"] = model_name - + eval_name = ( + run_id or f"{self.metadata.dataset} {self.metadata.dataset_split} {now}" + ) self.metadata.lmnr = LaminarEvalMetadata( eval_id=LaminarService.get().create_evaluation( - name=f"{self.metadata.dataset} {self.metadata.dataset_split} {now}", + name=eval_name, group_name=f"{self.metadata.dataset} {self.metadata.dataset_split}", - metadata=eval_metadata, - run_id=run_id, + metadata=laminar_meta or None, ) ) + # Store for use in datapoint creation + self._laminar_session_id = run_id + self._laminar_trace_meta = laminar_meta or None total_instances = len(all_instances) logger.info("prepared %d instances for evaluation", total_instances) @@ -351,6 +350,8 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: inst.id, self.metadata.model_dump(mode="json"), index, + session_id=self._laminar_session_id, + trace_metadata=self._laminar_trace_meta, ) ) if datapoint_id is not None: diff --git a/benchmarks/utils/laminar.py b/benchmarks/utils/laminar.py index fc11d828..92141819 100644 --- a/benchmarks/utils/laminar.py +++ b/benchmarks/utils/laminar.py @@ -44,7 +44,7 @@ def get(cls) -> "LaminarService": def _is_enabled(self) -> bool: return bool(os.environ.get("LMNR_PROJECT_API_KEY")) - def initialize(self, trace_session_id: str | None = None) -> bool: + def initialize(self) -> bool: """ Initialize the Laminar SDK once per process. Returns True if initialization succeeded (or was already done), False otherwise. @@ -58,8 +58,6 @@ def initialize(self, trace_session_id: str | None = None) -> bool: try: Laminar.initialize() - if trace_session_id: - Laminar.set_trace_session_id(trace_session_id) except Exception as exc: # pragma: no cover - defensive logging logger.debug("Failed to initialize Laminar SDK: %s", exc) return False @@ -85,37 +83,19 @@ def create_evaluation( name: str, group_name: str, metadata: dict[str, Any] | None = None, - run_id: str | None = None, ): client = self._get_client() if client is None: return None - eval_name = run_id or name - metadata_payload = metadata.copy() if metadata else {} - if run_id: - metadata_payload.setdefault("run_id", run_id) - - if run_id: - try: - Laminar.set_trace_session_id(run_id) - except Exception as exc: # pragma: no cover - defensive logging - logger.debug("Failed to set Laminar trace session id: %s", exc) - try: - eval_id = client.evals.create_evaluation( - name=eval_name, + return client.evals.create_evaluation( + name=name, group_name=group_name, - metadata=metadata_payload, + metadata=metadata, ) - return eval_id except Exception as exc: # pragma: no cover - defensive logging - logger.debug( - "Laminar evaluation %s (%s): %s", - eval_name, - group_name, - exc, - ) + logger.debug("Laminar evaluation %s (%s): %s", name, group_name, exc) def create_evaluation_datapoint( self, @@ -123,10 +103,13 @@ def create_evaluation_datapoint( data: Any, metadata: dict[str, Any], index: int, + session_id: str | None = None, + trace_metadata: dict[str, Any] | None = None, ) -> tuple[UUID | None, str | None]: """ Create a Laminar datapoint. Creates a new span for the evaluation and returns the span context. + Session ID and trace metadata are set on the span if provided. """ if eval_id is None: @@ -141,6 +124,12 @@ def create_evaluation_datapoint( "Evaluation", span_type="EVALUATION", # type: ignore ) + # Set session ID and metadata on the active span + if session_id: + Laminar.set_trace_session_id(session_id) + if trace_metadata: + Laminar.set_trace_metadata(trace_metadata) + lmnr_span_ctx = Laminar.serialize_span_context(eval_span) eval_span.end()