diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py
index e2ae1db5..e782f98d 100644
--- a/benchmarks/utils/evaluation.py
+++ b/benchmarks/utils/evaluation.py
@@ -255,15 +255,32 @@ def _run_iterative_mode(
         # Initialize Laminar
         LaminarService.get().initialize()
 
-        # Create Laminar evaluation
+        # Build metadata for Laminar evaluation and traces
+        run_id = os.getenv("UNIQUE_EVAL_NAME")
+        laminar_meta = {
+            k: v
+            for k, v in [
+                ("benchmark", self.metadata.dataset),
+                ("model", self.metadata.llm.model),
+            ]
+            if v
+        }
+
+        # Create Laminar evaluation (use run_id as name if available)
        now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        eval_name = (
+            run_id or f"{self.metadata.dataset} {self.metadata.dataset_split} {now}"
+        )
        self.metadata.lmnr = LaminarEvalMetadata(
            eval_id=LaminarService.get().create_evaluation(
-                name=f"{self.metadata.dataset} {self.metadata.dataset_split} {now}",
+                name=eval_name,
                group_name=f"{self.metadata.dataset} {self.metadata.dataset_split}",
-                metadata=self.metadata.model_dump(mode="json"),
+                metadata=laminar_meta or None,
            )
        )
+        # Store for use in datapoint creation
+        self._laminar_session_id = run_id
+        self._laminar_trace_meta = laminar_meta or None
 
        total_instances = len(all_instances)
        logger.info("prepared %d instances for evaluation", total_instances)
@@ -333,6 +350,8 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
                        inst.id,
                        self.metadata.model_dump(mode="json"),
                        index,
+                        session_id=self._laminar_session_id,
+                        trace_metadata=self._laminar_trace_meta,
                    )
                )
                if datapoint_id is not None:
diff --git a/benchmarks/utils/laminar.py b/benchmarks/utils/laminar.py
index 24b91ed2..92141819 100644
--- a/benchmarks/utils/laminar.py
+++ b/benchmarks/utils/laminar.py
@@ -79,26 +79,23 @@ def _get_client(self) -> LaminarClient | None:
        return self._client
 
    def create_evaluation(
-        self, name: str, group_name: str, metadata: dict[str, Any] | None = None
+        self,
+        name: str,
+        group_name: str,
+        metadata: dict[str, Any] | None = None,
    ):
        client = self._get_client()
        if client is None:
            return None
        try:
-            eval_id = client.evals.create_evaluation(
+            return client.evals.create_evaluation(
                name=name,
                group_name=group_name,
                metadata=metadata,
            )
-            return eval_id
        except Exception as exc:  # pragma: no cover - defensive logging
-            logger.debug(
-                "Laminar evaluation %s (%s): %s",
-                name,
-                group_name,
-                exc,
-            )
+            logger.debug("Laminar evaluation %s (%s): %s", name, group_name, exc)
 
    def create_evaluation_datapoint(
        self,
@@ -106,10 +103,13 @@ def create_evaluation_datapoint(
        data: Any,
        metadata: dict[str, Any],
        index: int,
+        session_id: str | None = None,
+        trace_metadata: dict[str, Any] | None = None,
    ) -> tuple[UUID | None, str | None]:
        """
        Create a Laminar datapoint.
        Creates a new span for the evaluation and returns the span context.
+        Session ID and trace metadata are set on the span if provided.
        """
 
        if eval_id is None:
@@ -124,6 +124,12 @@ def create_evaluation_datapoint(
            "Evaluation",
            span_type="EVALUATION",  # type: ignore
        )
+        # Set session ID and metadata on the active span
+        if session_id:
+            Laminar.set_trace_session_id(session_id)
+        if trace_metadata:
+            Laminar.set_trace_metadata(trace_metadata)
+
        lmnr_span_ctx = Laminar.serialize_span_context(eval_span)
        eval_span.end()