benchmarks/utils/evaluation.py (22 additions, 3 deletions)

@@ -255,15 +255,32 @@ def _run_iterative_mode(
 # Initialize Laminar
 LaminarService.get().initialize()

-# Create Laminar evaluation
+# Build metadata for Laminar evaluation and traces
+run_id = os.getenv("UNIQUE_EVAL_NAME")
+laminar_meta = {
+    k: v
+    for k, v in [
+        ("benchmark", self.metadata.dataset),
+        ("model", self.metadata.llm.model),
+    ]
+    if v
+}
+
+# Create Laminar evaluation (use run_id as name if available)
 now = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+eval_name = (
+    run_id or f"{self.metadata.dataset} {self.metadata.dataset_split} {now}"
+)
 self.metadata.lmnr = LaminarEvalMetadata(
     eval_id=LaminarService.get().create_evaluation(
-        name=f"{self.metadata.dataset} {self.metadata.dataset_split} {now}",
+        name=eval_name,
         group_name=f"{self.metadata.dataset} {self.metadata.dataset_split}",
-        metadata=self.metadata.model_dump(mode="json"),
+        metadata=laminar_meta or None,
     )
 )
+# Store for use in datapoint creation
+self._laminar_session_id = run_id
+self._laminar_trace_meta = laminar_meta or None

 total_instances = len(all_instances)
 logger.info("prepared %d instances for evaluation", total_instances)
@@ -333,6 +350,8 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
         inst.id,
         self.metadata.model_dump(mode="json"),
         index,
+        session_id=self._laminar_session_id,
+        trace_metadata=self._laminar_trace_meta,
     )
 )
 if datapoint_id is not None:
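With the two stored attributes in place, the datapoint call reads roughly as below. This is a sketch only: the first positional argument and the unpacking are inferred from the visible context, not shown in the hunk.

```python
# Inferred call shape; `inst`, `index`, and the eval_id argument are
# assumptions based on the surrounding context, not part of the diff.
datapoint_id, span_ctx = LaminarService.get().create_evaluation_datapoint(
    self.metadata.lmnr.eval_id,
    inst.id,
    self.metadata.model_dump(mode="json"),
    index,
    session_id=self._laminar_session_id,      # None unless UNIQUE_EVAL_NAME is set
    trace_metadata=self._laminar_trace_meta,  # None when no truthy fields were found
)
```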
benchmarks/utils/laminar.py (15 additions, 9 deletions)

@@ -79,37 +79,37 @@ def _get_client(self) -> LaminarClient | None:
     return self._client

 def create_evaluation(
-    self, name: str, group_name: str, metadata: dict[str, Any] | None = None
+    self,
+    name: str,
+    group_name: str,
+    metadata: dict[str, Any] | None = None,
 ):
     client = self._get_client()
     if client is None:
         return None

     try:
-        eval_id = client.evals.create_evaluation(
+        return client.evals.create_evaluation(
             name=name,
             group_name=group_name,
             metadata=metadata,
         )
-        return eval_id
     except Exception as exc:  # pragma: no cover - defensive logging
-        logger.debug(
-            "Laminar evaluation %s (%s): %s",
-            name,
-            group_name,
-            exc,
-        )
+        logger.debug("Laminar evaluation %s (%s): %s", name, group_name, exc)

 def create_evaluation_datapoint(
     self,
     eval_id: UUID | None,
     data: Any,
     metadata: dict[str, Any],
     index: int,
+    session_id: str | None = None,
+    trace_metadata: dict[str, Any] | None = None,
 ) -> tuple[UUID | None, str | None]:
     """
     Create a Laminar datapoint.
     Creates a new span for the evaluation and returns the span context.
+    Session ID and trace metadata are set on the span if provided.
     """

     if eval_id is None:
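One behavioral detail worth making explicit: `create_evaluation` now returns the client call's result directly, and on any exception it logs at debug level and falls through to an implicit `None`, the same value returned when no client is configured. A caller-side sketch under that assumption, with hypothetical names:

```python
# Hypothetical usage; the names here are illustrative, not from the diff.
eval_id = LaminarService.get().create_evaluation(
    name="demo-eval",
    group_name="demo-group",
    metadata={"benchmark": "demo"},
)
if eval_id is None:
    # Laminar is disabled, unreachable, or the API call failed; the
    # benchmark run proceeds without tracing rather than crashing.
    pass
```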
@@ -124,6 +124,12 @@ def create_evaluation_datapoint(
"Evaluation",
span_type="EVALUATION", # type: ignore
)
# Set session ID and metadata on the active span
if session_id:
Laminar.set_trace_session_id(session_id)
if trace_metadata:
Laminar.set_trace_metadata(trace_metadata)

lmnr_span_ctx = Laminar.serialize_span_context(eval_span)
eval_span.end()

Expand Down
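For orientation, the span bookkeeping above can be read as the following sketch. It uses only calls that appear in this diff plus `Laminar.start_as_current_span`, which is an assumption here because the span-creation line sits above the visible hunk; exact lmnr SDK signatures may differ by version.

```python
from lmnr import Laminar

# Sketch, not the repository's code: create the evaluation span, attach
# the run-level session id and metadata to its trace, then serialize the
# span context so later work can be parented to it.
with Laminar.start_as_current_span("Evaluation", span_type="EVALUATION") as eval_span:
    Laminar.set_trace_session_id("my-unique-eval-name")  # hypothetical id
    Laminar.set_trace_metadata({"benchmark": "swebench", "model": "gpt-4o"})
    span_ctx = Laminar.serialize_span_context(eval_span)
```

Setting the session id once per run means every datapoint trace from a single benchmark invocation groups under one Laminar session, which is the point of threading `UNIQUE_EVAL_NAME` through from the evaluator.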