From 63791d2bf4f3f814ebe4eb721467d60be24e88a8 Mon Sep 17 00:00:00 2001 From: Bharathi Srinivasan Date: Tue, 31 Mar 2026 13:23:14 -0700 Subject: [PATCH 1/3] Add groundtruth-based evaluations tutorial --- .../README.md | 122 + .../groundtruth_evaluations.ipynb | 2864 +++++++++++++++++ .../hr_assistant_agent.py | 268 ++ .../requirements.txt | 6 + 4 files changed, 3260 insertions(+) create mode 100644 01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md create mode 100644 01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb create mode 100644 01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py create mode 100644 01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/requirements.txt diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md new file mode 100644 index 000000000..5102077cd --- /dev/null +++ b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md @@ -0,0 +1,122 @@ +# Ground Truth Evaluations — EvaluationClient and EvaluationRunner + +## Overview + +This tutorial demonstrates end-to-end evaluation of an agentic application using Amazon Bedrock AgentCore's two primary evaluation interfaces: **EvaluationClient** and **EvaluationRunner**. Both are used with ground-truth reference inputs to measure factual correctness, goal achievement, and tool-use accuracy. + +The tutorial uses an **HR Assistant agent** for Acme Corp — a Strands agent that helps employees with PTO management, HR policy lookups, benefits information, and pay stubs. 
+ +## What You'll Learn + +- How to deploy a Strands agent to AgentCore Runtime +- When to use `EvaluationClient` vs `EvaluationRunner` +- How to evaluate existing sessions with ground-truth `ReferenceInputs` +- How to define an evaluation dataset with `TurnByTurnScenario` and `Turn` +- How to run automated dataset evaluations with `EvaluationRunner` +- How to interpret built-in evaluator results for trajectory, correctness, and goal-success metrics + +## Prerequisites + +Before running this tutorial, ensure you have: + +- Python 3.10+ +- Docker running locally (for agent container image build) +- AWS credentials with permissions for: + - AgentCore Runtime (`bedrock-agentcore:*`) + - AgentCore Evaluations (`bedrock-agentcore:Evaluate`) + - CloudWatch Logs (`logs:FilterLogEvents`, `logs:DescribeLogGroups`) + - ECR (`ecr:*`) + - IAM (for auto-creating the agent execution role) + +## Files + +| File | Description | +|---|---| +| `groundtruth_evaluations.ipynb` | Main tutorial notebook (standalone, end-to-end) | +| `hr_assistant_agent.py` | HR Assistant Strands agent deployed to AgentCore Runtime | +| `requirements.txt` | Python dependencies for the agent container | + +## Tutorial Notebook + +### [groundtruth_evaluations.ipynb](groundtruth_evaluations.ipynb) + +A single self-contained notebook that walks through the full evaluation workflow in 7 steps: + +| Step | Description | +|---|---| +| 1 | Install dependencies | +| 2 | Configure AWS session and region | +| 3 | Deploy the HR Assistant agent to AgentCore Runtime | +| 4 | Invoke the agent to generate sessions with CloudWatch spans | +| 5 | **EvaluationClient** — evaluate existing sessions with ground truth | +| 6 | **EvaluationRunner** — automated dataset evaluation | +| 7 | Cleanup | + +## EvaluationClient vs EvaluationRunner + +| | EvaluationClient | EvaluationRunner | +|---|---|---| +| **When to use** | You already have recorded sessions | You have a test dataset | +| **Agent invocation** | Not included — 
evaluates existing sessions | Automatic — invokes your agent for every scenario | +| **Input** | `session_id` + `agent_id` | `Dataset` of `TurnByTurnScenario` objects | +| **Best for** | Post-hoc analysis, debugging, incident investigation | Regression testing, CI/CD pipelines, batch evaluation | + +## Ground-Truth Reference Inputs + +`ReferenceInputs` supplies optional ground truth to `EvaluationClient`. Each field is consumed by specific evaluators: + +| Field | Evaluators that use it | Description | +|---|---|---| +| `expected_response` | `Builtin.Correctness` | The ideal response text for semantic comparison | +| `expected_trajectory` | `Builtin.TrajectoryExactOrderMatch`, `Builtin.TrajectoryInOrderMatch`, `Builtin.TrajectoryAnyOrderMatch` | Ordered list of tool names the agent should call | +| `assertions` | `Builtin.GoalSuccessRate` | Free-text assertions the session should satisfy | + +Evaluators that don't require ground truth (`Builtin.Helpfulness`, `Builtin.ResponseRelevance`) can be included in the same call — each evaluator reads only the fields it needs. + +The same fields apply to `TurnByTurnScenario` objects in `EvaluationRunner` datasets. 
+ +## Built-in Evaluators Used + +| Evaluator | Level | Ground Truth | +|---|---|---| +| `Builtin.Correctness` | TRACE | `expected_response` | +| `Builtin.GoalSuccessRate` | SESSION | `assertions` | +| `Builtin.TrajectoryExactOrderMatch` | SESSION | `expected_trajectory` | +| `Builtin.TrajectoryInOrderMatch` | SESSION | `expected_trajectory` | +| `Builtin.TrajectoryAnyOrderMatch` | SESSION | `expected_trajectory` | +| `Builtin.Helpfulness` | TRACE | None | +| `Builtin.ResponseRelevance` | TRACE | None | + +**Evaluation levels:** +- **TRACE** — evaluated once per agent response (one result per conversational turn) +- **SESSION** — evaluated once per conversation (one result per scenario) + +## The HR Assistant Agent + +The agent is built with the [Strands Agents SDK](https://strandsagents.com/) and deployed on AgentCore Runtime. It exposes five tools backed by deterministic mock data, making evaluations fully reproducible: + +| Tool | Description | +|---|---| +| `get_pto_balance` | Returns remaining PTO days for an employee | +| `submit_pto_request` | Submits a time-off request | +| `lookup_hr_policy` | Looks up PTO, remote work, parental leave, or code-of-conduct policies | +| `get_benefits_summary` | Returns health, dental, vision, 401k, or life insurance details | +| `get_pay_stub` | Retrieves gross and net pay for a given employee and period | + +## Evaluation Scenarios + +The notebook evaluates five scenarios that cover different evaluation patterns: + +| Scenario | Turns | Key evaluators | +|---|---|---| +| PTO balance check | 1 | Correctness, Helpfulness | +| PTO submission | 1 | GoalSuccessRate, Trajectory, Correctness | +| Pay stub lookup | 1 | Correctness, GoalSuccessRate | +| PTO planning session | 3 | GoalSuccessRate, TrajectoryExactOrderMatch | +| New employee onboarding | 4 | GoalSuccessRate, TrajectoryAnyOrderMatch | + +## Next Steps + +- **[programmatic_evaluators/](../programmatic_evaluators/)** — Extend evaluations with Lambda-backed code 
evaluators for deterministic, business-rule-driven scoring. +- Review evaluation results to identify scenarios where the agent calls the wrong tool or returns inaccurate facts. +- Integrate `EvaluationRunner` into your CI/CD pipeline to catch regressions automatically. diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb new file mode 100644 index 000000000..cf2388de2 --- /dev/null +++ b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb @@ -0,0 +1,2864 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "intro-md", + "metadata": {}, + "source": [ + "# HR Assistant Agent — Ground Truth Evaluations\n", + "\n", + "This notebook demonstrates evaluation of an agentic application with ground truth using Amazon Bedrock AgentCore Evaluations:\n", + "\n", + "| Interface | When to use |\n", + "|---|---|\n", + "| **EvaluationClient** | You already have agent sessions in CloudWatch. Evaluate specific sessions against reference inputs. |\n", + "| **OnDemandEvaluationDatasetRunner** | You have a test dataset. You want to invoke the agent for every scenario and evaluate the results. 
|\n", + "\n", + "### The HR Assistant Agent\n", + "\n", + "We'll deploy an **HR Assistant** for Acme Corp — a Strands agent that helps employees with:\n", + "- PTO balance checks and time-off requests\n", + "- HR policy lookups (PTO, remote work, parental leave)\n", + "- Benefits information (health, dental, vision, 401k)\n", + "- Pay stub retrieval\n", + "\n", + "### What You'll Learn\n", + "- How to use `EvaluationClient` to evaluate existing agent sessions logged in Amazon CloudWatch with ground-truth references\n", + "- How to use `OnDemandEvaluationDatasetRunner` to run automated dataset evaluations\n", + "- How to interpret evaluation results across built-in evaluators (Correctness, GoalSuccessRate, Trajectory)\n", + "\n", + "### Tutorial Details\n", + "\n", + "| Information | Details |\n", + "|---|---|\n", + "| Agent framework | Strands Agents |\n", + "| Runtime | Amazon Bedrock AgentCore Runtime |\n", + "| Evaluation SDK | `bedrock-agentcore` |\n", + "| AWS services | AgentCore Runtime, AgentCore Evaluations, CloudWatch Logs |\n", + "\n", + "### Prerequisites\n", + "- Python 3.10+\n", + "- AWS credentials with permissions for AgentCore, Lambda, CloudWatch, ECR, IAM\n", + "- Docker running locally (for agent container image build)" + ] + }, + { + "cell_type": "markdown", + "id": "install-md", + "metadata": {}, + "source": [ + "## Step 1: Install Dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "install", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:08:32.996391Z", + "iopub.status.busy": "2026-03-31T18:08:32.996290Z", + "iopub.status.idle": "2026-03-31T18:08:34.578367Z", + "shell.execute_reply": "2026-03-31T18:08:34.577735Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> 
\u001b[0m\u001b[32;49m26.0.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install -r requirements.txt -q" + ] + }, + { + "cell_type": "markdown", + "id": "setup-md", + "metadata": {}, + "source": [ + "## Step 2: Configuration\n", + "\n", + "Import libraries and configure your AWS session." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "setup", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:08:34.580804Z", + "iopub.status.busy": "2026-03-31T18:08:34.580612Z", + "iopub.status.idle": "2026-03-31T18:08:34.719835Z", + "shell.execute_reply": "2026-03-31T18:08:34.719215Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Region : us-east-1\n" + ] + } + ], + "source": [ + "import boto3\n", + "import json\n", + "import time\n", + "import uuid\n", + "from datetime import timedelta\n", + "from boto3.session import Session\n", + "from IPython.display import display, Markdown\n", + "\n", + "region = \"us-east-1\"\n", + "boto_session = Session(region_name=region)\n", + "REGION = boto_session.region_name\n", + "\n", + "print(f\"Region : {REGION}\")" + ] + }, + { + "cell_type": "markdown", + "id": "deploy-md", + "metadata": {}, + "source": [ + "## Step 3: Deploy the HR Assistant Agent\n", + "\n", + "We deploy the HR Assistant to **AgentCore Runtime** using the AWS SDK directly (no starter toolkit).\n", + "The deployment steps are:\n", + "\n", + "1. **ECR authentication** — use `boto3` to get a temporary ECR token and run `docker login`\n", + "2. **Docker build** — build a container image from `hr_assistant_agent.py` and the local `Dockerfile`\n", + "3. **Docker push** — push the image to Amazon ECR\n", + "4. 
**CreateAgentRuntime / UpdateAgentRuntime** — register or update the agent endpoint via `bedrock-agentcore-control`\n", + "\n", + "If a `.bedrock_agentcore.yaml` from a previous run is present, its ECR repository, IAM role, and\n", + "existing agent ID are reused so the cell is idempotent (re-running triggers an update, not a duplicate create)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "nn72gdo2s4h", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:08:34.721827Z", + "iopub.status.busy": "2026-03-31T18:08:34.721686Z", + "iopub.status.idle": "2026-03-31T18:08:34.728424Z", + "shell.execute_reply": "2026-03-31T18:08:34.727890Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting hr_assistant_agent.py\n" + ] + } + ], + "source": [ + "%%writefile hr_assistant_agent.py\n", + "\"\"\"\n", + "HR Assistant Agent — Strands agent deployed on Bedrock AgentCore Runtime.\n", + "\n", + "Tools (deterministic / mock data for reproducible evaluations):\n", + " get_pto_balance — remaining PTO days for an employee\n", + " submit_pto_request — request time off\n", + " lookup_hr_policy — company policy documents\n", + " get_benefits_summary — health, dental, vision, 401k, life insurance details\n", + " get_pay_stub — pay stub for a given period\n", + "\"\"\"\n", + "\n", + "import logging\n", + "import re\n", + "\n", + "from bedrock_agentcore.runtime import BedrockAgentCoreApp\n", + "from strands import Agent, tool\n", + "from strands.models import BedrockModel\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "logger = logging.getLogger(__name__)\n", + "\n", + "app = BedrockAgentCoreApp()\n", + "\n", + "# ---------------------------------------------------------------------------\n", + "# Mock data\n", + "# ---------------------------------------------------------------------------\n", + "\n", + "_PTO_BALANCES = {\n", + " \"EMP-001\": {\"total_days\": 15, \"used_days\": 5, 
\"remaining_days\": 10},\n", + " \"EMP-002\": {\"total_days\": 15, \"used_days\": 12, \"remaining_days\": 3},\n", + " \"EMP-042\": {\"total_days\": 20, \"used_days\": 7, \"remaining_days\": 13},\n", + "}\n", + "\n", + "_HR_POLICIES = {\n", + " \"pto\": (\n", + " \"PTO Policy: Full-time employees accrue 15 days of PTO per year (20 days after 3 years). \"\n", + " \"PTO requests must be submitted at least 2 business days in advance. \"\n", + " \"Unused PTO up to 5 days rolls over to the next year. \"\n", + " \"PTO cannot be taken in advance of accrual.\"\n", + " ),\n", + " \"remote_work\": (\n", + " \"Remote Work Policy: Employees may work remotely up to 3 days per week with manager approval. \"\n", + " \"Core collaboration hours are 10am-3pm local time. \"\n", + " \"A dedicated workspace with reliable internet (25 Mbps+) is required. \"\n", + " \"Employees must be reachable via Slack and email during core hours.\"\n", + " ),\n", + " \"parental_leave\": (\n", + " \"Parental Leave Policy: Primary caregivers receive 16 weeks of fully paid parental leave. \"\n", + " \"Secondary caregivers receive 6 weeks of fully paid parental leave. \"\n", + " \"Leave may begin up to 2 weeks before the expected birth or adoption date. \"\n", + " \"Benefits continue unchanged during parental leave.\"\n", + " ),\n", + " \"code_of_conduct\": (\n", + " \"Code of Conduct: All employees are expected to treat colleagues, customers, and partners \"\n", + " \"with respect and professionalism. Harassment, discrimination, and retaliation of any kind \"\n", + " \"are strictly prohibited. Violations should be reported to HR or via the anonymous hotline.\"\n", + " ),\n", + "}\n", + "\n", + "_BENEFITS = {\n", + " \"health\": (\n", + " \"Health Insurance: The company covers 90% of premiums for employee-only coverage and 75% \"\n", + " \"for family coverage. Plans available: Blue Shield PPO, Kaiser HMO, and HDHP with HSA. \"\n", + " \"Annual deductible: $500 (PPO), $0 (HMO), $1,500 (HDHP). 
\"\n", + " \"Open enrollment is each November for the following calendar year.\"\n", + " ),\n", + " \"dental\": (\n", + " \"Dental Insurance: 100% coverage for preventive care (cleanings, X-rays). \"\n", + " \"80% coverage for basic restorative care (fillings, extractions). \"\n", + " \"50% coverage for major restorative care (crowns, bridges). \"\n", + " \"Annual maximum benefit: $2,000 per person. Orthodontia lifetime maximum: $1,500.\"\n", + " ),\n", + " \"vision\": (\n", + " \"Vision Insurance: Annual eye exam covered in full. \"\n", + " \"Frames or contacts allowance: $200 per year. \"\n", + " \"Laser vision correction discount: 15% off at participating providers.\"\n", + " ),\n", + " \"401k\": (\n", + " \"401(k) Plan: The company matches 100% of employee contributions up to 4% of salary. \"\n", + " \"An additional 50% match on the next 2% (total effective match up to 5%). \"\n", + " \"Employees are eligible to contribute immediately; company match vests over 3 years. \"\n", + " \"2026 IRS contribution limit: $23,500 (under 50), $31,000 (age 50+).\"\n", + " ),\n", + " \"life_insurance\": (\n", + " \"Life Insurance: Basic life insurance of 2x annual salary provided at no cost. \"\n", + " \"Employees may purchase supplemental coverage up to 5x salary during open enrollment. 
\"\n", + " \"Accidental death and dismemberment (AD&D) coverage equal to basic life benefit is included.\"\n", + " ),\n", + "}\n", + "\n", + "_PAY_STUBS = {\n", + " (\"EMP-001\", \"2025-12\"): {\n", + " \"gross_pay\": 8333.33, \"federal_tax\": 1458.33, \"state_tax\": 416.67,\n", + " \"social_security\": 516.67, \"medicare\": 120.83, \"health_premium\": 125.00,\n", + " \"401k_contribution\": 333.33, \"net_pay\": 5362.50, \"period\": \"December 2025\",\n", + " },\n", + " (\"EMP-001\", \"2026-01\"): {\n", + " \"gross_pay\": 8333.33, \"federal_tax\": 1458.33, \"state_tax\": 416.67,\n", + " \"social_security\": 516.67, \"medicare\": 120.83, \"health_premium\": 125.00,\n", + " \"401k_contribution\": 333.33, \"net_pay\": 5362.50, \"period\": \"January 2026\",\n", + " },\n", + " (\"EMP-042\", \"2026-01\"): {\n", + " \"gross_pay\": 10416.67, \"federal_tax\": 1875.00, \"state_tax\": 520.83,\n", + " \"social_security\": 645.83, \"medicare\": 151.04, \"health_premium\": 200.00,\n", + " \"401k_contribution\": 416.67, \"net_pay\": 6607.30, \"period\": \"January 2026\",\n", + " },\n", + "}\n", + "\n", + "_PTO_REQUEST_COUNTER = {\"n\": 0}\n", + "\n", + "\n", + "# ---------------------------------------------------------------------------\n", + "# Strands tools\n", + "# ---------------------------------------------------------------------------\n", + "\n", + "@tool\n", + "def get_pto_balance(employee_id: str) -> dict:\n", + " \"\"\"\n", + " Return the current PTO balance for an employee.\n", + "\n", + " Args:\n", + " employee_id: Employee identifier (e.g. 
EMP-001)\n", + "\n", + " Returns:\n", + " Dict with total_days, used_days, and remaining_days.\n", + " \"\"\"\n", + " balance = _PTO_BALANCES.get(employee_id)\n", + " if balance:\n", + " return {\"employee_id\": employee_id, **balance}\n", + " return {\"employee_id\": employee_id, \"error\": f\"Employee {employee_id} not found.\"}\n", + "\n", + "\n", + "@tool\n", + "def submit_pto_request(\n", + " employee_id: str,\n", + " start_date: str,\n", + " end_date: str,\n", + " reason: str = \"Personal time off\",\n", + ") -> dict:\n", + " \"\"\"\n", + " Submit a PTO request for an employee.\n", + "\n", + " Args:\n", + " employee_id: Employee identifier (e.g. EMP-001)\n", + " start_date: First day of leave in YYYY-MM-DD format\n", + " end_date: Last day of leave in YYYY-MM-DD format\n", + " reason: Optional reason for the request\n", + "\n", + " Returns:\n", + " Dict with request_id, status, and confirmation message.\n", + " \"\"\"\n", + " _PTO_REQUEST_COUNTER[\"n\"] += 1\n", + " request_id = f\"PTO-2026-{_PTO_REQUEST_COUNTER['n']:03d}\"\n", + " return {\n", + " \"request_id\": request_id,\n", + " \"employee_id\": employee_id,\n", + " \"start_date\": start_date,\n", + " \"end_date\": end_date,\n", + " \"reason\": reason,\n", + " \"status\": \"APPROVED\",\n", + " \"message\": f\"PTO request {request_id} approved for {employee_id} from {start_date} to {end_date}.\",\n", + " }\n", + "\n", + "\n", + "@tool\n", + "def lookup_hr_policy(topic: str) -> dict:\n", + " \"\"\"\n", + " Look up a company HR policy document by topic.\n", + "\n", + " Args:\n", + " topic: Policy topic. Supported values: pto, remote_work, parental_leave, code_of_conduct\n", + "\n", + " Returns:\n", + " Dict with topic and policy_text.\n", + " \"\"\"\n", + " key = topic.lower().replace(\" \", \"_\").replace(\"-\", \"_\")\n", + " text = _HR_POLICIES.get(key)\n", + " if text:\n", + " return {\"topic\": topic, \"policy_text\": text}\n", + " return {\"topic\": topic, \"error\": f\"Policy '{topic}' not found. 
Available: {list(_HR_POLICIES.keys())}\"}\n", + "\n", + "\n", + "@tool\n", + "def get_benefits_summary(benefit_type: str) -> dict:\n", + " \"\"\"\n", + " Return a summary of a specific employee benefit.\n", + "\n", + " Args:\n", + " benefit_type: Type of benefit. Supported values: health, dental, vision, 401k, life_insurance\n", + "\n", + " Returns:\n", + " Dict with benefit_type and summary text.\n", + " \"\"\"\n", + " key = benefit_type.lower().replace(\" \", \"_\").replace(\"-\", \"_\")\n", + " text = _BENEFITS.get(key)\n", + " if text:\n", + " return {\"benefit_type\": benefit_type, \"summary\": text}\n", + " return {\"benefit_type\": benefit_type, \"error\": f\"Benefit '{benefit_type}' not found. Available: {list(_BENEFITS.keys())}\"}\n", + "\n", + "\n", + "@tool\n", + "def get_pay_stub(employee_id: str, period: str) -> dict:\n", + " \"\"\"\n", + " Retrieve a pay stub for an employee for a specific pay period.\n", + "\n", + " Args:\n", + " employee_id: Employee identifier (e.g. EMP-001)\n", + " period: Pay period in YYYY-MM format (e.g. 
2026-01)\n", + "\n", + " Returns:\n", + " Dict with gross pay, deductions, and net pay.\n", + " \"\"\"\n", + " stub = _PAY_STUBS.get((employee_id, period))\n", + " if stub:\n", + " return {\"employee_id\": employee_id, **stub}\n", + " return {\"employee_id\": employee_id, \"period\": period, \"error\": f\"Pay stub not found for {employee_id} period {period}.\"}\n", + "\n", + "\n", + "# ---------------------------------------------------------------------------\n", + "# Agent\n", + "# ---------------------------------------------------------------------------\n", + "\n", + "SYSTEM_PROMPT = \"\"\"You are a helpful HR Assistant for Acme Corp.\n", + "\n", + "You help employees with:\n", + "- Checking PTO (paid time off) balances\n", + "- Submitting PTO requests\n", + "- Looking up HR policies (PTO, remote work, parental leave, code of conduct)\n", + "- Understanding employee benefits (health, dental, vision, 401k, life insurance)\n", + "- Retrieving pay stub information\n", + "\n", + "Always use the available tools to answer questions accurately. 
Do not make up\n", + "policy details, benefit amounts, or pay information — look them up.\n", + "Be concise, professional, and friendly.\"\"\"\n", + "\n", + "_MODEL = BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\")\n", + "_TOOLS = [get_pto_balance, submit_pto_request, lookup_hr_policy, get_benefits_summary, get_pay_stub]\n", + "\n", + "# Session cache: session_id -> Agent (preserves conversation history across turns)\n", + "_SESSION_AGENTS: dict[str, Agent] = {}\n", + "\n", + "\n", + "@app.entrypoint\n", + "async def invoke(payload, context):\n", + " \"\"\"Handle an agent invocation from AgentCore Runtime.\"\"\"\n", + " prompt = payload.get(\"prompt\", \"\")\n", + " session_id = context.session_id\n", + " logger.info(\"Received prompt (session=%s): %s\", session_id, prompt[:80])\n", + "\n", + " if session_id and session_id in _SESSION_AGENTS:\n", + " agent = _SESSION_AGENTS[session_id]\n", + " else:\n", + " agent = Agent(model=_MODEL, tools=_TOOLS, system_prompt=SYSTEM_PROMPT)\n", + " if session_id:\n", + " _SESSION_AGENTS[session_id] = agent\n", + "\n", + " parts = []\n", + " async for event in agent.stream_async(prompt):\n", + " if \"data\" in event:\n", + " parts.append(str(event[\"data\"]))\n", + " response = \"\".join(parts)\n", + " # Strip inline ... 
blocks so spans contain only the final answer\n", + " response = re.sub(r\".*?\", \"\", response, flags=re.DOTALL).strip()\n", + " return response\n", + "\n", + "\n", + "if __name__ == \"__main__\":\n", + " app.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "deploy", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:08:34.729896Z", + "iopub.status.busy": "2026-03-31T18:08:34.729788Z", + "iopub.status.idle": "2026-03-31T18:09:25.948136Z", + "shell.execute_reply": "2026-03-31T18:09:25.947045Z" + } + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Entrypoint parsed: file=/Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py, bedrock_agentcore_name=hr_assistant_agent\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Memory disabled - agent will be stateless\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Configuring BedrockAgentCore agent: hr_assistant_eval_tutorial\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Memory disabled\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Network mode: PUBLIC\n" + ] + }, + { + "data": { + "text/html": [ + "
📄 Using existing Dockerfile: \n",
+       "/Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05\n",
+       "-groundtruth-based-evalautions/Dockerfile\n",
+       "
\n" + ], + "text/plain": [ + "📄 Using existing Dockerfile: \n", + "\u001b[35m/Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05\u001b[0m\n", + "\u001b[35m-groundtruth-based-evalautions/\u001b[0m\u001b[95mDockerfile\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generated .dockerignore: /Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/.dockerignore\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Keeping 'hr_assistant_eval_tutorial' as default agent\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Bedrock AgentCore configured: /Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/.bedrock_agentcore.yaml\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🚀 Launching Bedrock AgentCore (cloud mode - RECOMMENDED)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " • Deploy Python code directly to runtime\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " • No Docker required (DEFAULT behavior)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " • Production-ready deployment\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "💡 Deployment options:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " • runtime.launch() → Cloud (current)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " • runtime.launch(local=True) → Local development\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": 
[ + "Memory disabled - skipping memory creation\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting CodeBuild ARM64 deployment for agent 'hr_assistant_eval_tutorial' to account 849138760372 (us-east-1)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Generated image tag: 20260331-180836-339\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting up AWS resources (ECR repository, execution roles)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Getting or creating ECR repository for agent: hr_assistant_eval_tutorial\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Configuration complete.\n", + "\n", + "Deploying HR Assistant Agent ...\n", + " This takes ~5 minutes on first run (image build + push + runtime creation).\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ECR repository available: 849138760372.dkr.ecr.us-east-1.amazonaws.com/bedrock-agentcore-hr_assistant_eval_tutorial\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Getting or creating execution role for agent: hr_assistant_eval_tutorial\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using AWS region: us-east-1, account ID: 849138760372\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Role name: AmazonBedrockAgentCoreSDKRuntime-us-east-1-0c8cfc0a5d\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Reusing existing ECR repository: 849138760372.dkr.ecr.us-east-1.amazonaws.com/bedrock-agentcore-hr_assistant_eval_tutorial\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "✅ Reusing existing execution role: arn:aws:iam::849138760372:role/AmazonBedrockAgentCoreSDKRuntime-us-east-1-0c8cfc0a5d\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Execution 
role available: arn:aws:iam::849138760372:role/AmazonBedrockAgentCoreSDKRuntime-us-east-1-0c8cfc0a5d\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Preparing CodeBuild project and uploading source...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Getting or creating CodeBuild execution role for agent: hr_assistant_eval_tutorial\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Role name: AmazonBedrockAgentCoreSDKCodeBuild-us-east-1-0c8cfc0a5d\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Reusing existing CodeBuild execution role: arn:aws:iam::849138760372:role/AmazonBedrockAgentCoreSDKCodeBuild-us-east-1-0c8cfc0a5d\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using dockerignore.template with 47 patterns for zip filtering\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploaded source to S3: hr_assistant_eval_tutorial/source.zip\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Updated CodeBuild project: bedrock-agentcore-hr_assistant_eval_tutorial-builder\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting CodeBuild build (this may take several minutes)...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Starting CodeBuild monitoring...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🔄 QUEUED started (total: 0s)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "✅ QUEUED completed in 1.1s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🔄 PROVISIONING started (total: 1s)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "✅ PROVISIONING completed in 6.7s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🔄 DOWNLOAD_SOURCE started (total: 8s)\n" + ] + }, + { 
+ "name": "stderr", + "output_type": "stream", + "text": [ + "✅ DOWNLOAD_SOURCE completed in 2.2s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🔄 BUILD started (total: 10s)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "✅ BUILD completed in 16.7s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🔄 POST_BUILD started (total: 27s)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "✅ POST_BUILD completed in 12.3s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🔄 COMPLETED started (total: 39s)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "✅ COMPLETED completed in 1.1s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🎉 CodeBuild completed successfully in 0m 40s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CodeBuild completed successfully\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CodeBuild project configuration saved\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Deploying to Bedrock AgentCore...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Agent created/updated: arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Observability is enabled, configuring observability components...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "CloudWatch Logs resource policy already configured\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "X-Ray trace destination already configured\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "X-Ray indexing rule already configured\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"Transaction Search already fully configured\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ObservabilityDeliveryManager initialized for region: us-east-1, account: 849138760372\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "✅ Logs auto-created by AWS for runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "✅ Traces delivery enabled for runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Observability enabled for runtime/hr_assistant_eval_tutorial-xfZ3yiH356 - logs: True, traces: True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "✅ X-Ray traces delivery enabled for agent hr_assistant_eval_tutorial-xfZ3yiH356\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🔍 GenAI Observability Dashboard:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#gen-ai-observability/agent-core\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Polling for endpoint to be ready...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Agent endpoint: arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356/runtime-endpoint/DEFAULT\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Deployment completed successfully - Agent: arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Built with CodeBuild: bedrock-agentcore-hr_assistant_eval_tutorial-builder:8b4e4d86-70e7-445e-9948-b4a88c7f518a\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Deployed to cloud: 
arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ECR image: 849138760372.dkr.ecr.us-east-1.amazonaws.com/bedrock-agentcore-hr_assistant_eval_tutorial:20260331-180836-339\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🔍 Agent logs available at:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT --log-stream-name-prefix \"2026/03/31/\\[runtime-logs]\"\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT --log-stream-names \"otel-rt-logs\"\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "💡 Tail logs with: aws logs tail /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT --log-stream-name-prefix \"2026/03/31/\\[runtime-logs]\" --follow\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "💡 Or view recent logs: aws logs tail /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT --log-stream-name-prefix \"2026/03/31/\\[runtime-logs]\" --since 1h\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Launch complete.\n", + " agent_id : hr_assistant_eval_tutorial-xfZ3yiH356\n", + " agent_arn : arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" + ] + } + ], + "source": [ + "from bedrock_agentcore_starter_toolkit import Runtime\n", + "\n", + "_REGION = REGION or \"us-east-1\"\n", + "\n", + "agentcore_runtime = Runtime()\n", + "agentcore_runtime.configure(\n", + " entrypoint=\"hr_assistant_agent.py\",\n", + " agent_name=\"hr_assistant_eval_tutorial\",\n", + " region=_REGION,\n", + " auto_create_execution_role=True,\n", + " 
auto_create_ecr=True,\n", + " requirements_file=\"requirements.txt\",\n", + " non_interactive=True,\n", + ")\n", + "print(\"Configuration complete.\")\n", + "\n", + "print(\"\\nDeploying HR Assistant Agent ...\")\n", + "print(\" This takes ~5 minutes on first run (image build + push + runtime creation).\")\n", + "print()\n", + "\n", + "_launch = agentcore_runtime.launch(auto_update_on_conflict=True)\n", + "\n", + "print(f\"\\nLaunch complete.\")\n", + "print(f\" agent_id : {_launch.agent_id}\")\n", + "print(f\" agent_arn : {_launch.agent_arn}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "wait-deploy", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:09:25.951112Z", + "iopub.status.busy": "2026-03-31T18:09:25.950863Z", + "iopub.status.idle": "2026-03-31T18:09:26.424226Z", + "shell.execute_reply": "2026-03-31T18:09:26.423531Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Waiting for agent to reach READY status ...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Retrieved Bedrock AgentCore status for: hr_assistant_eval_tutorial\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " [ 0s] status = READY\n", + "\n", + "Agent is READY. Proceeding.\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "print(\"Waiting for agent to reach READY status ...\")\n", + "\n", + "_POLL_INTERVAL = 15 # seconds between status checks\n", + "_MAX_WAIT = 600 # 10-minute timeout\n", + "\n", + "_elapsed = 0\n", + "while _elapsed < _MAX_WAIT:\n", + " _status_result = agentcore_runtime.status()\n", + " _agent_info = _status_result.agent or {}\n", + " _agent_status = _agent_info.get(\"status\", \"UNKNOWN\")\n", + " print(f\" [{_elapsed:>3}s] status = {_agent_status}\")\n", + "\n", + " if _agent_status in (\"READY\", \"ACTIVE\"):\n", + " print(f\"\\nAgent is {_agent_status}. 
Proceeding.\")\n", + " break\n", + " if _agent_status in (\"FAILED\", \"CREATE_FAILED\", \"UPDATE_FAILED\"):\n", + " raise RuntimeError(\n", + " f\"Agent deployment failed with status '{_agent_status}'.\\n\"\n", + " f\"Details: {_agent_info}\"\n", + " )\n", + "\n", + " time.sleep(_POLL_INTERVAL)\n", + " _elapsed += _POLL_INTERVAL\n", + "else:\n", + " raise TimeoutError(\n", + " f\"Agent did not reach READY status within {_MAX_WAIT}s. \"\n", + " \"Check the AgentCore console for details.\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "agent-config", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:09:26.425933Z", + "iopub.status.busy": "2026-03-31T18:09:26.425801Z", + "iopub.status.idle": "2026-03-31T18:09:26.430602Z", + "shell.execute_reply": "2026-03-31T18:09:26.430128Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AGENT_ID : hr_assistant_eval_tutorial-xfZ3yiH356\n", + "AGENT_ARN : arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n", + "CW_LOG_GROUP : /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT\n" + ] + } + ], + "source": [ + "AGENT_ID = _launch.agent_id\n", + "AGENT_ARN = _launch.agent_arn\n", + "CW_LOG_GROUP = f\"/aws/bedrock-agentcore/runtimes/{AGENT_ID}-DEFAULT\"\n", + "\n", + "agentcore_client = boto3.client(\"bedrock-agentcore\", region_name=_REGION)\n", + "\n", + "print(f\"AGENT_ID : {AGENT_ID}\")\n", + "print(f\"AGENT_ARN : {AGENT_ARN}\")\n", + "print(f\"CW_LOG_GROUP : {CW_LOG_GROUP}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "store-agent", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:09:26.431991Z", + "iopub.status.busy": "2026-03-31T18:09:26.431884Z", + "iopub.status.idle": "2026-03-31T18:09:26.436625Z", + "shell.execute_reply": "2026-03-31T18:09:26.436137Z" + } + }, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "Stored 'AGENT_ID' (str)\n", + "Stored 'AGENT_ARN' (str)\n", + "Stored 'CW_LOG_GROUP' (str)\n", + "Stored 'REGION' (str)\n" + ] + } + ], + "source": [ + "# Persist agent info for the programmatic_evaluators notebook\n", + "%store AGENT_ID\n", + "%store AGENT_ARN\n", + "%store CW_LOG_GROUP\n", + "%store REGION" + ] + }, + { + "cell_type": "markdown", + "id": "a6r2v510ko", + "metadata": {}, + "source": [ + "## Step 3b: Create Custom (LLM-as-a-Judge) Evaluators\n", + "\n", + "In addition to built-in evaluators, you can define your own evaluation criteria using\n", + "**LLM-as-a-judge custom evaluators**. These accept natural language instructions that\n", + "can reference **ground truth placeholders** automatically substituted at evaluation time.\n", + "\n", + "### Ground truth placeholders\n", + "\n", + "| Level | Available placeholders |\n", + "|---|---|\n", + "| **TRACE** | `{context}`, `{assistant_turn}`, `{expected_response}` |\n", + "| **SESSION** | `{context}`, `{available_tools}`, `{actual_tool_trajectory}`, `{expected_tool_trajectory}`, `{assertions}` |\n", + "\n", + "For example, a trace-level evaluator comparing response similarity would include\n", + "`{assistant_turn}` and `{expected_response}` in its instructions. 
When the evaluator runs,\n", + "the service substitutes those placeholders with the actual agent output and the\n", + "`expectedResponse` from `ReferenceInputs`.\n", + "\n", + "### What we'll create\n", + "\n", + "| Evaluator | Level | Placeholders | Description |\n", + "|---|---|---|---|\n", + "| `HRResponseSimilarity` | TRACE | `{assistant_turn}`, `{expected_response}` | How closely the agent's response matches the expected answer |\n", + "| `HRAssertionChecker` | SESSION | `{actual_tool_trajectory}`, `{expected_tool_trajectory}`, `{assertions}` | Whether the agent called the right tools and satisfied all session assertions |" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "76hyptexblj", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:09:26.438149Z", + "iopub.status.busy": "2026-03-31T18:09:26.438051Z", + "iopub.status.idle": "2026-03-31T18:09:27.825403Z", + "shell.execute_reply": "2026-03-31T18:09:27.824620Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating HRResponseSimilarity (TRACE) ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " evaluatorId : HRResponseSimilarity_a007e092-v0ojZ8ARHR\n", + "\n", + "Creating HRAssertionChecker (SESSION) ...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " evaluatorId : HRAssertionChecker_a007e092-zUFaMBBhck\n", + "\n", + "Custom evaluators ready:\n", + " HRResponseSimilarity (TRACE) : HRResponseSimilarity_a007e092-v0ojZ8ARHR\n", + " HRAssertionChecker (SESSION) : HRAssertionChecker_a007e092-zUFaMBBhck\n" + ] + } + ], + "source": [ + "import uuid\n", + "\n", + "_SUFFIX = uuid.uuid4().hex[:8]\n", + "_cp = boto3.client(\"bedrock-agentcore-control\", region_name=_REGION)\n", + "\n", + "# ---------------------------------------------------------------------------\n", + "# Trace-level: HRResponseSimilarity\n", + "# Compares the agent's response to the expected_response 
reference input.\n", + "# {assistant_turn} → actual agent output\n", + "# {expected_response} → expectedResponse field in ReferenceInputs\n", + "# ---------------------------------------------------------------------------\n", + "print(\"Creating HRResponseSimilarity (TRACE) ...\")\n", + "_resp_sim = _cp.create_evaluator(\n", + " evaluatorName=f\"HRResponseSimilarity_{_SUFFIX}\",\n", + " level=\"TRACE\",\n", + " evaluatorConfig={\n", + " \"llmAsAJudge\": {\n", + " \"instructions\": (\n", + " \"Compare the agent's response with the expected response.\\n\"\n", + " \"Agent response: {assistant_turn}\\n\"\n", + " \"Expected response: {expected_response}\\n\\n\"\n", + " \"Rate how closely the agent's response matches the expected response. \"\n", + " \"Focus on whether the key facts, numbers, and conclusions agree.\"\n", + " ),\n", + " \"ratingScale\": {\n", + " \"numerical\": [\n", + " {\n", + " \"value\": 0.0,\n", + " \"label\": \"not_similar\",\n", + " \"definition\": \"Response is factually different or missing key information from the expected response.\",\n", + " },\n", + " {\n", + " \"value\": 0.5,\n", + " \"label\": \"partially_similar\",\n", + " \"definition\": \"Response captures some expected content but omits or misrepresents parts.\",\n", + " },\n", + " {\n", + " \"value\": 1.0,\n", + " \"label\": \"highly_similar\",\n", + " \"definition\": \"Response is semantically equivalent to the expected response — all key facts match.\",\n", + " },\n", + " ]\n", + " },\n", + " \"modelConfig\": {\n", + " \"bedrockEvaluatorModelConfig\": {\n", + " \"modelId\": \"us.amazon.nova-lite-v1:0\",\n", + " \"inferenceConfig\": {\"maxTokens\": 512},\n", + " }\n", + " },\n", + " }\n", + " },\n", + ")\n", + "CUSTOM_RESPONSE_SIMILARITY_ID = _resp_sim[\"evaluatorId\"]\n", + "print(f\" evaluatorId : {CUSTOM_RESPONSE_SIMILARITY_ID}\")\n", + "\n", + "# ---------------------------------------------------------------------------\n", + "# Session-level: HRAssertionChecker\n", + "# 
Evaluates tool trajectory compliance and assertion satisfaction.\n", + "# {actual_tool_trajectory} → tools the agent actually called\n", + "# {expected_tool_trajectory} → expectedTrajectory from ReferenceInputs\n", + "# {assertions} → assertions list from ReferenceInputs\n", + "# ---------------------------------------------------------------------------\n", + "print(\"\\nCreating HRAssertionChecker (SESSION) ...\")\n", + "_assert_chk = _cp.create_evaluator(\n", + " evaluatorName=f\"HRAssertionChecker_{_SUFFIX}\",\n", + " level=\"SESSION\",\n", + " evaluatorConfig={\n", + " \"llmAsAJudge\": {\n", + " \"instructions\": (\n", + " \"Evaluate whether the agent fulfilled the session requirements.\\n\\n\"\n", + " \"Expected tool trajectory: {expected_tool_trajectory}\\n\"\n", + " \"Actual tool trajectory: {actual_tool_trajectory}\\n\"\n", + " \"Assertions to verify: {assertions}\\n\\n\"\n", + " \"Score the agent on how well it followed the expected tool trajectory \"\n", + " \"and satisfied every listed assertion.\"\n", + " ),\n", + " \"ratingScale\": {\n", + " \"numerical\": [\n", + " {\n", + " \"value\": 0.0,\n", + " \"label\": \"failed\",\n", + " \"definition\": \"Agent did not follow the trajectory and failed most assertions.\",\n", + " },\n", + " {\n", + " \"value\": 0.5,\n", + " \"label\": \"partial\",\n", + " \"definition\": \"Agent partially followed the trajectory or satisfied only some assertions.\",\n", + " },\n", + " {\n", + " \"value\": 1.0,\n", + " \"label\": \"passed\",\n", + " \"definition\": \"Agent followed the expected trajectory and satisfied all assertions.\",\n", + " },\n", + " ]\n", + " },\n", + " \"modelConfig\": {\n", + " \"bedrockEvaluatorModelConfig\": {\n", + " \"modelId\": \"us.amazon.nova-lite-v1:0\",\n", + " \"inferenceConfig\": {\"maxTokens\": 512},\n", + " }\n", + " },\n", + " }\n", + " },\n", + ")\n", + "CUSTOM_ASSERTION_CHECKER_ID = _assert_chk[\"evaluatorId\"]\n", + "print(f\" evaluatorId : {CUSTOM_ASSERTION_CHECKER_ID}\")\n", + "\n", 
+ "print(f\"\\nCustom evaluators ready:\")\n", + "print(f\" HRResponseSimilarity (TRACE) : {CUSTOM_RESPONSE_SIMILARITY_ID}\")\n", + "print(f\" HRAssertionChecker (SESSION) : {CUSTOM_ASSERTION_CHECKER_ID}\")" + ] + }, + { + "cell_type": "markdown", + "id": "invoke-md", + "metadata": {}, + "source": [ + "## Step 4: Invoke the Agent to Generate Sessions\n", + "\n", + "Before we can evaluate, we need agent sessions with CloudWatch spans. We'll invoke the agent\n", + "for several scenarios and record the session IDs for use with `EvaluationClient`.\n", + "\n", + "Each session corresponds to one evaluation scenario." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "invoke-helper", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:09:27.827457Z", + "iopub.status.busy": "2026-03-31T18:09:27.827332Z", + "iopub.status.idle": "2026-03-31T18:09:27.831125Z", + "shell.execute_reply": "2026-03-31T18:09:27.830586Z" + } + }, + "outputs": [], + "source": [ + "def invoke_agent(prompt: str, session_id: str) -> str:\n", + " \"\"\"Send a single prompt to the HR assistant and return its text response.\"\"\"\n", + " resp = agentcore_client.invoke_agent_runtime(\n", + " agentRuntimeArn=AGENT_ARN,\n", + " qualifier=\"DEFAULT\",\n", + " runtimeSessionId=session_id,\n", + " payload=json.dumps({\"prompt\": prompt}).encode(\"utf-8\"),\n", + " )\n", + " raw = resp[\"response\"].read().decode(\"utf-8\")\n", + " parts = []\n", + " for line in raw.splitlines():\n", + " if line.startswith(\"data: \"):\n", + " chunk = line[len(\"data: \"):]\n", + " try:\n", + " chunk = json.loads(chunk)\n", + " except Exception:\n", + " pass\n", + " parts.append(str(chunk))\n", + " return \"\".join(parts) if parts else raw\n", + "\n", + "\n", + "def run_session(turns: list[str], session_prefix: str) -> str:\n", + " \"\"\"Invoke a multi-turn session and return its session ID.\"\"\"\n", + " session_id = f\"{session_prefix}-{uuid.uuid4()}\"\n", + " print(f\"Session: 
{session_id}\")\n", + " for turn_input in turns:\n", + " print(f\" > {turn_input[:70]}\")\n", + " response = invoke_agent(turn_input, session_id)\n", + " print(f\" < {response[:100]}\")\n", + " return session_id" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "invoke-single", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:09:27.832388Z", + "iopub.status.busy": "2026-03-31T18:09:27.832293Z", + "iopub.status.idle": "2026-03-31T18:10:06.856305Z", + "shell.execute_reply": "2026-03-31T18:10:06.855331Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Single-Turn Sessions ===\n", + "Session: pto-balance-check-6174aaad-8137-428b-8b54-64aa650cbedf\n", + " > What is the current PTO balance for employee EMP-001?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"The current PTO balance for employee EMP-001 is 10 remaining days. They have a total of 15 days all\n", + "Session: submit-pto-request-3b5a7027-eedb-427b-ab71-357f5ea55393\n", + " > Please submit a PTO request for employee EMP-001 from 2026-04-14 to 20\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"Hi there! 
Your PTO request from 2026-04-14 to 2026-04-16 for a family vacation has been successfull\n", + "Session: pay-stub-lookup-739b4301-7fed-4486-922f-fae9b33cca3a\n", + " > Can you pull up the January 2026 pay stub for employee EMP-001?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"Here is the pay stub for employee EMP-001 for January 2026:\\n\\n- Gross Pay: $8,333.33\\n- Federal Ta\n", + "\n", + "Single-turn sessions created.\n" + ] + } + ], + "source": [ + "# --- Single-turn sessions ---\n", + "\n", + "print(\"=== Single-Turn Sessions ===\")\n", + "\n", + "session_pto_balance = run_session(\n", + " [\"What is the current PTO balance for employee EMP-001?\"],\n", + " \"pto-balance-check\"\n", + ")\n", + "\n", + "session_submit_pto = run_session(\n", + " [\"Please submit a PTO request for employee EMP-001 from 2026-04-14 to 2026-04-16 for a family vacation.\"],\n", + " \"submit-pto-request\"\n", + ")\n", + "\n", + "session_pay_stub = run_session(\n", + " [\"Can you pull up the January 2026 pay stub for employee EMP-001?\"],\n", + " \"pay-stub-lookup\"\n", + ")\n", + "\n", + "print(\"\\nSingle-turn sessions created.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "invoke-multi", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:10:06.861600Z", + "iopub.status.busy": "2026-03-31T18:10:06.861400Z", + "iopub.status.idle": "2026-03-31T18:10:13.485540Z", + "shell.execute_reply": "2026-03-31T18:10:13.485047Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Multi-Turn Session: PTO Planning ===\n", + "Session: pto-planning-session-4d42b1e2-b0c3-43c5-9298-05ead1cf4822\n", + " > How many PTO days do I have left? My employee ID is EMP-001.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"Hi there! Based on your employee ID, you have 10 PTO days remaining. You've used 5 days out of your\n", + " > Great. 
I'd like to take December 23 to December 25 off. Please submit \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"Your PTO request for December 23 to December 25 has been approved. Your request ID is PTO-2026-001.\n", + " > Remind me — what is the policy on rolling over unused PTO?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"According to Acme Corp's PTO policy, unused PTO up to 5 days rolls over to the next year. This mean\n", + "\n", + "Multi-turn session created.\n" + ] + } + ], + "source": [ + "# --- Multi-turn session: PTO planning ---\n", + "\n", + "print(\"=== Multi-Turn Session: PTO Planning ===\")\n", + "\n", + "session_pto_planning = run_session(\n", + " [\n", + " \"How many PTO days do I have left? My employee ID is EMP-001.\",\n", + " \"Great. I'd like to take December 23 to December 25 off. Please submit a request.\",\n", + " \"Remind me — what is the policy on rolling over unused PTO?\",\n", + " ],\n", + " \"pto-planning-session\"\n", + ")\n", + "\n", + "print(\"\\nMulti-turn session created.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "invoke-onboard", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:10:13.487685Z", + "iopub.status.busy": "2026-03-31T18:10:13.487505Z", + "iopub.status.idle": "2026-03-31T18:11:21.679433Z", + "shell.execute_reply": "2026-03-31T18:11:21.678254Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Multi-Turn Session: New Employee Onboarding ===\n", + "Session: new-employee-onboarding-1b08d214-9233-4050-a550-095962aa53f3\n", + " > I just joined the company. 
What is the remote work policy?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"Here is the remote work policy for Acme Corp:\\n\\n**Remote Work Policy**\\n- Employees may work remot\n", + " > How much PTO do I get as a new employee?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"It appears there was an error because the employee ID \\\"EMP-NEW\\\" does not exist in the system. Cou\n", + " > What life insurance benefit does the company provide?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"Here is the summary of the life insurance benefit provided by Acme Corp:\\n\\n**Life Insurance**\\n- B\n", + " > Can you check the current PTO balance for employee EMP-042?\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " < \"The current PTO balance for employee EMP-042 is:\\n\\n- **Total days**: 20\\n- **Used days**: 7\\n- **R\n", + "\n", + "All sessions created. Waiting 60s for CloudWatch log ingestion...\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ready to evaluate.\n" + ] + } + ], + "source": [ + "# --- Multi-turn session: New employee onboarding ---\n", + "\n", + "print(\"=== Multi-Turn Session: New Employee Onboarding ===\")\n", + "\n", + "session_onboarding = run_session(\n", + " [\n", + " \"I just joined the company. What is the remote work policy?\",\n", + " \"How much PTO do I get as a new employee?\",\n", + " \"What life insurance benefit does the company provide?\",\n", + " \"Can you check the current PTO balance for employee EMP-042?\",\n", + " ],\n", + " \"new-employee-onboarding\"\n", + ")\n", + "\n", + "print(\"\\nAll sessions created. 
Waiting 60s for CloudWatch log ingestion...\")\n", + "time.sleep(60)\n", + "print(\"Ready to evaluate.\")" + ] + }, + { + "cell_type": "markdown", + "id": "eval-client-md", + "metadata": {}, + "source": [ + "## Step 5: EvaluationClient — Evaluate Existing Sessions\n", + "\n", + "`EvaluationClient` is the right tool when you **already have agent sessions** recorded in CloudWatch.\n", + "It looks up the agent's spans for a given `session_id` and runs evaluators against them.\n", + "No agent re-invocation occurs.\n", + "\n", + "### Ground-Truth Reference Inputs\n", + "\n", + "`ReferenceInputs` lets you supply optional ground truth:\n", + "\n", + "| Field | Evaluators that use it | Description |\n", + "|---|---|---|\n", + "| `expected_response` | `Builtin.Correctness` | The ideal response text |\n", + "| `expected_trajectory` | `Builtin.TrajectoryExactOrderMatch`, `Builtin.TrajectoryInOrderMatch`, `Builtin.TrajectoryAnyOrderMatch` | Ordered list of tool names |\n", + "| `assertions` | `Builtin.GoalSuccessRate` | Free-text assertions the session should satisfy |\n", + "\n", + "Evaluators that don't need ground truth (`Helpfulness`, `ResponseRelevance`) can be included in the same call.\n", + "Each evaluator only reads the fields it needs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "eval-client-init", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:11:21.683849Z", + "iopub.status.busy": "2026-03-31T18:11:21.683641Z", + "iopub.status.idle": "2026-03-31T18:11:21.747795Z", + "shell.execute_reply": "2026-03-31T18:11:21.746884Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EvaluationClient initialised (region=us-east-1)\n", + " HRResponseSimilarity_a007e092-v0ojZ8ARHR → TRACE (custom: HRResponseSimilarity)\n", + " HRAssertionChecker_a007e092-zUFaMBBhck → SESSION (custom: HRAssertionChecker)\n" + ] + } + ], + "source": [ + "from bedrock_agentcore.evaluation import EvaluationClient, ReferenceInputs\n", + "\n", + "eval_client = EvaluationClient(region_name=REGION)\n", + "\n", + "# Seed the evaluator level cache with custom evaluator IDs so the client\n", + "# doesn't need to call get_evaluator for them (BUG-001 workaround).\n", + "eval_client._evaluator_level_cache.update({\n", + " CUSTOM_RESPONSE_SIMILARITY_ID: \"TRACE\",\n", + " CUSTOM_ASSERTION_CHECKER_ID: \"SESSION\",\n", + "})\n", + "\n", + "print(f\"EvaluationClient initialised (region={REGION})\")\n", + "print(f\" {CUSTOM_RESPONSE_SIMILARITY_ID} → TRACE (custom: HRResponseSimilarity)\")\n", + "print(f\" {CUSTOM_ASSERTION_CHECKER_ID} → SESSION (custom: HRAssertionChecker)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "print-helper", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:11:21.749705Z", + "iopub.status.busy": "2026-03-31T18:11:21.749545Z", + "iopub.status.idle": "2026-03-31T18:11:21.753571Z", + "shell.execute_reply": "2026-03-31T18:11:21.752959Z" + } + }, + "outputs": [], + "source": [ + "def display_eval_results(label: str, results: list) -> None:\n", + " \"\"\"Pretty-print EvaluationClient results as a markdown table.\"\"\"\n", + " rows = [\"| Evaluator | Value | Label | Explanation |\",\n", + " 
\"|---|---|---|---|\"]\n", + " for r in results:\n", + " evaluator = r.get(\"evaluatorId\", \"\")[:40]\n", + " value = str(r.get(\"value\", r.get(\"score\", \"N/A\")))\n", + " lbl = str(r.get(\"label\", r.get(\"rating\", \"\")))\n", + " explanation = (r.get(\"explanation\", r.get(\"reason\", \"\")) or \"\")[:120].replace(\"\\n\", \" \")\n", + " error_code = r.get(\"errorCode\")\n", + " if error_code:\n", + " lbl = f\"ERR:{error_code}\"\n", + " explanation = (r.get(\"errorMessage\", \"\") or \"\")[:120]\n", + " rows.append(f\"| `{evaluator}` | {value} | {lbl} | {explanation} |\")\n", + "\n", + " if len(rows) == 2: # only header rows, no data\n", + " rows.append(\"| No results — session may be too recent or spans not yet visible | | | |\")\n", + "\n", + " md = f\"### {label}\\n\\n\" + \"\\n\".join(rows)\n", + " display(Markdown(md))" + ] + }, + { + "cell_type": "markdown", + "id": "ec-single-md", + "metadata": {}, + "source": [ + "### 5a. Single-Turn: PTO Balance — Correctness + Helpfulness + Custom ResponseSimilarity\n", + "\n", + "We evaluate the PTO balance response against a known expected answer using `Builtin.Correctness`\n", + "and the custom `HRResponseSimilarity` evaluator (which uses the `{assistant_turn}` and\n", + "`{expected_response}` placeholders). Both measure factual accuracy but use different scoring rubrics." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "ec-pto-balance", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:11:21.755187Z", + "iopub.status.busy": "2026-03-31T18:11:21.755069Z", + "iopub.status.idle": "2026-03-31T18:11:43.660604Z", + "shell.execute_reply": "2026-03-31T18:11:43.659651Z" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### PTO Balance — Correctness + Quality + Custom ResponseSimilarity\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.Correctness` | 1.0 | Correct | The agent response provides all the core factual information present in the expected response: EMP-001 has 10 remaining |\n", + "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is clear and straightforward: to obtain the current PTO balance for employee EMP-001. The assistant's re |\n", + "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked for the current PTO balance for employee EMP-001. 
The tool output provided detailed information showing: |\n", + "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response and the expected response convey the same key facts: Employee EMP-001 has 10 remaining PTO days out |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pto_balance_results = eval_client.run(\n", + " evaluator_ids=[\n", + " \"Builtin.Correctness\", # TRACE: built-in factual accuracy\n", + " \"Builtin.Helpfulness\", # TRACE: no ground truth needed\n", + " \"Builtin.ResponseRelevance\", # TRACE: no ground truth needed\n", + " CUSTOM_RESPONSE_SIMILARITY_ID, # TRACE: custom — uses {assistant_turn} + {expected_response}\n", + " ],\n", + " session_id=session_pto_balance,\n", + " agent_id=AGENT_ID,\n", + " look_back_time=timedelta(hours=2),\n", + " reference_inputs=ReferenceInputs(\n", + " expected_response=\"Employee EMP-001 has 10 remaining PTO days out of 15 total (5 days used).\",\n", + " ),\n", + ")\n", + "\n", + "display_eval_results(\"PTO Balance — Correctness + Quality + Custom ResponseSimilarity\", pto_balance_results)" + ] + }, + { + "cell_type": "markdown", + "id": "ec-traj-md", + "metadata": {}, + "source": [ + "### 5b. Single-Turn: PTO Submission — Assertions + Trajectory + Custom ResponseSimilarity\n", + "\n", + "This cell runs the built-in `Builtin.GoalSuccessRate` and trajectory evaluators (SESSION level)\n", + "together with `Builtin.Correctness` and the custom `HRResponseSimilarity` evaluator (TRACE level,\n", + "which uses the `{assistant_turn}` and `{expected_response}` placeholders). This lets you compare\n", + "built-in vs. custom scoring side by side. The custom `HRAssertionChecker` is exercised in section 5d."
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ec-submit-pto", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:11:43.665034Z", + "iopub.status.busy": "2026-03-31T18:11:43.664870Z", + "iopub.status.idle": "2026-03-31T18:11:58.887975Z", + "shell.execute_reply": "2026-03-31T18:11:58.887479Z" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### PTO Submission — Built-in + Custom ResponseSimilarity\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.GoalSuccessRate` | 1.0 | Yes | The agent successfully completed all three success assertions: 1) The tool execution history shows the agent called `sub |\n", + "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['submit_pto_request'] matches expected trajectory ['submit_pto_request'] |\n", + "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['submit_pto_request'] found in actual ['submit_pto_request'] |\n", + "| `Builtin.Correctness` | 1.0 | Correct | The agent response correctly conveys all the core factual information from the expected response: (1) PTO request was su |\n", + "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response includes all key facts from the expected response: the employee ID (EMP-001), the start and end dat |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "submit_pto_results = eval_client.run(\n", + " evaluator_ids=[\n", + " \"Builtin.GoalSuccessRate\", # SESSION: built-in assertion evaluator\n", + " \"Builtin.TrajectoryExactOrderMatch\", # SESSION: built-in trajectory evaluator\n", + " \"Builtin.TrajectoryAnyOrderMatch\", # SESSION: built-in trajectory evaluator\n", + " \"Builtin.Correctness\", # TRACE: built-in response accuracy\n", + " CUSTOM_RESPONSE_SIMILARITY_ID, # TRACE (custom): {assistant_turn} + 
{expected_response}\n", + " ],\n", + " session_id=session_submit_pto,\n", + " agent_id=AGENT_ID,\n", + " look_back_time=timedelta(hours=2),\n", + " reference_inputs=ReferenceInputs(\n", + " expected_trajectory=[\"submit_pto_request\"],\n", + " assertions=[\n", + " \"Agent called submit_pto_request for employee EMP-001\",\n", + " \"Agent confirmed the PTO request was approved\",\n", + " \"Agent provided a request ID (e.g. PTO-2026-001)\",\n", + " ],\n", + " expected_response=\"PTO request submitted and approved for EMP-001 from 2026-04-14 to 2026-04-16.\",\n", + " ),\n", + ")\n", + "\n", + "display_eval_results(\"PTO Submission — Built-in + Custom ResponseSimilarity\", submit_pto_results)" + ] + }, + { + "cell_type": "markdown", + "id": "ec-paystub-md", + "metadata": {}, + "source": [ + "### 5c. Single-Turn: Pay Stub — Factual Correctness\n", + "\n", + "Factual data retrieval scenarios are well-suited for `Builtin.Correctness` combined with\n", + "`Builtin.GoalSuccessRate`. The expected_response provides the ground truth figures." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "ec-pay-stub", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:11:58.889746Z", + "iopub.status.busy": "2026-03-31T18:11:58.889635Z", + "iopub.status.idle": "2026-03-31T18:12:01.774714Z", + "shell.execute_reply": "2026-03-31T18:12:01.773612Z" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Pay Stub Lookup — Correctness + GoalSuccessRate\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.Correctness` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 90453e5cd76a35d1 and name: invoke_agent Strands Agents is missing a corre |\n", + "| `Builtin.GoalSuccessRate` | N/A | ERR:LogEventMissingException | Session span data is incomplete. 
Span with ID: 90453e5cd76a35d1 and name: invoke_agent Strands Agents is missing a corre |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pay_stub_results = eval_client.run(\n", + " evaluator_ids=[\n", + " \"Builtin.Correctness\",\n", + " \"Builtin.GoalSuccessRate\",\n", + " ],\n", + " session_id=session_pay_stub,\n", + " agent_id=AGENT_ID,\n", + " look_back_time=timedelta(hours=2),\n", + " reference_inputs=ReferenceInputs(\n", + " expected_response=\"EMP-001 January 2026: gross pay $8,333.33, net pay $5,362.50.\",\n", + " assertions=[\n", + " \"Agent called get_pay_stub for EMP-001 period 2026-01\",\n", + " \"Agent reported the correct gross pay of $8,333.33\",\n", + " \"Agent reported the correct net pay of $5,362.50\",\n", + " ],\n", + " ),\n", + ")\n", + "\n", + "display_eval_results(\"Pay Stub Lookup — Correctness + GoalSuccessRate\", pay_stub_results)" + ] + }, + { + "cell_type": "markdown", + "id": "ec-multi-md", + "metadata": {}, + "source": [ + "### 5d. Multi-Turn: PTO Planning Session (3 turns) + Custom AssertionChecker\n", + "\n", + "For multi-turn sessions, `EvaluationClient` fetches all spans for the session and evaluates\n", + "the complete conversation. The trajectory and assertions apply across all turns.\n", + "\n", + "This scenario also exercises the custom `HRAssertionChecker` evaluator (SESSION level),\n", + "which uses `{actual_tool_trajectory}`, `{expected_tool_trajectory}`, and `{assertions}`\n", + "placeholders. A 3-turn session with distinct tool calls per turn gives the evaluator\n", + "a rich trajectory to compare against the expected sequence." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ec-multi-pto", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:12:01.777633Z", + "iopub.status.busy": "2026-03-31T18:12:01.777461Z", + "iopub.status.idle": "2026-03-31T18:12:06.100420Z", + "shell.execute_reply": "2026-03-31T18:12:06.099869Z" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### PTO Planning — Multi-Turn (3 turns) + Custom AssertionChecker\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.GoalSuccessRate` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", + "| `Builtin.TrajectoryExactOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", + "| `Builtin.TrajectoryInOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", + "| `Builtin.TrajectoryAnyOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", + "| `Builtin.Helpfulness` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", + "| `Builtin.Helpfulness` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", + "| `Builtin.Helpfulness` | N/A | ERR:LogEventMissingException | Session span data is incomplete. 
Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", + "| `HRAssertionChecker_a007e092-zUFaMBBhck` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "pto_planning_results = eval_client.run(\n", + " evaluator_ids=[\n", + " \"Builtin.GoalSuccessRate\",\n", + " \"Builtin.TrajectoryExactOrderMatch\",\n", + " \"Builtin.TrajectoryInOrderMatch\",\n", + " \"Builtin.TrajectoryAnyOrderMatch\",\n", + " \"Builtin.Helpfulness\",\n", + " CUSTOM_ASSERTION_CHECKER_ID, # SESSION (custom): {actual_tool_trajectory} + {expected_tool_trajectory} + {assertions}\n", + " ],\n", + " session_id=session_pto_planning,\n", + " agent_id=AGENT_ID,\n", + " look_back_time=timedelta(hours=2),\n", + " reference_inputs=ReferenceInputs(\n", + " expected_trajectory=[\"get_pto_balance\", \"submit_pto_request\", \"lookup_hr_policy\"],\n", + " assertions=[\n", + " \"Agent correctly reported 10 remaining PTO days for EMP-001 in turn 1\",\n", + " \"Agent submitted a PTO request for December 23-25, 2026 in turn 2\",\n", + " \"Agent correctly stated the 5-day PTO rollover limit in turn 3\",\n", + " ],\n", + " ),\n", + ")\n", + "\n", + "display_eval_results(\"PTO Planning — Multi-Turn (3 turns) + Custom AssertionChecker\", pto_planning_results)" + ] + }, + { + "cell_type": "markdown", + "id": "ec-onboard-md", + "metadata": {}, + "source": [ + "### 5e. Multi-Turn: New Employee Onboarding (4 turns)\n", + "\n", + "This scenario checks that the agent correctly identifies which tool to use for each type of question\n", + "(policy lookup vs. benefits lookup vs. PTO balance check) across a realistic onboarding conversation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "ec-onboard", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:12:06.102151Z", + "iopub.status.busy": "2026-03-31T18:12:06.101988Z", + "iopub.status.idle": "2026-03-31T18:12:09.646290Z", + "shell.execute_reply": "2026-03-31T18:12:09.642775Z" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### New Employee Onboarding — Multi-Turn Session (4 turns)\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.GoalSuccessRate` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 9b7976374cd80134 and name: invoke_agent Strands Agents is missing a corre |\n", + "| `Builtin.TrajectoryAnyOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 9b7976374cd80134 and name: invoke_agent Strands Agents is missing a corre |\n", + "| `Builtin.TrajectoryExactOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. 
Span with ID: 9b7976374cd80134 and name: invoke_agent Strands Agents is missing a corre |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "onboarding_results = eval_client.run(\n", + " evaluator_ids=[\n", + " \"Builtin.GoalSuccessRate\",\n", + " \"Builtin.TrajectoryAnyOrderMatch\",\n", + " \"Builtin.TrajectoryExactOrderMatch\",\n", + " ],\n", + " session_id=session_onboarding,\n", + " agent_id=AGENT_ID,\n", + " look_back_time=timedelta(hours=2),\n", + " reference_inputs=ReferenceInputs(\n", + " expected_trajectory=[\n", + " \"lookup_hr_policy\", # turn 1: remote work policy\n", + " \"lookup_hr_policy\", # turn 2: PTO policy\n", + " \"get_benefits_summary\", # turn 3: life insurance\n", + " \"get_pto_balance\", # turn 4: EMP-042 balance\n", + " ],\n", + " assertions=[\n", + " \"Agent looked up the remote work policy in turn 1 and mentioned 3 days per week\",\n", + " \"Agent looked up the PTO policy in turn 2 and mentioned 15 days for new employees\",\n", + " \"Agent looked up life insurance benefits in turn 3 and mentioned 2x annual salary\",\n", + " \"Agent called get_pto_balance for EMP-042 in turn 4 and reported 13 remaining days\",\n", + " ],\n", + " ),\n", + ")\n", + "\n", + "display_eval_results(\"New Employee Onboarding — Multi-Turn Session (4 turns)\", onboarding_results)" + ] + }, + { + "cell_type": "markdown", + "id": "runner-md", + "metadata": {}, + "source": [ + "## Step 6: OnDemandEvaluationDatasetRunner — Automated Dataset Evaluation\n", + "\n", + "`OnDemandEvaluationDatasetRunner` is the right tool when you have a **test dataset** and want to:\n", + "1. Automatically invoke your agent for each scenario\n", + "2. Collect CloudWatch spans\n", + "3. 
Run evaluators against each scenario's results\n", + "\n", + "This is ideal for regression testing, CI/CD pipelines, and batch evaluation against curated datasets.\n", + "\n", + "### Dataset structure\n", + "\n", + "A dataset consists of **scenarios**, each with one or more **turns**. Optional ground-truth fields:\n", + "- `Turn.expected_response` — per-turn expected answer\n", + "- `PredefinedScenario.expected_trajectory` — ordered list of tool names\n", + "- `PredefinedScenario.assertions` — session-level assertions\n", + "\n", + "### How OnDemandEvaluationDatasetRunner works\n", + "\n", + "```\n", + "For each scenario:\n", + " 1. Create a new session ID\n", + " 2. Call your agent_invoker function for each turn\n", + " 3. Wait for CloudWatch spans to appear (evaluation_delay_seconds)\n", + " 4. Submit spans + ground truth to the evaluation service\n", + " 5. Collect and return results\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "runner-imports", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:12:09.650245Z", + "iopub.status.busy": "2026-03-31T18:12:09.649779Z", + "iopub.status.idle": "2026-03-31T18:12:09.655809Z", + "shell.execute_reply": "2026-03-31T18:12:09.655250Z" + } + }, + "outputs": [], + "source": [ + "from bedrock_agentcore.evaluation import (\n", + " AgentInvokerInput,\n", + " AgentInvokerOutput,\n", + " CloudWatchAgentSpanCollector,\n", + " Dataset,\n", + " EvaluationRunConfig,\n", + " OnDemandEvaluationDatasetRunner,\n", + " EvaluatorConfig,\n", + " Turn,\n", + " PredefinedScenario,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "runner-invoker", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:12:09.657880Z", + "iopub.status.busy": "2026-03-31T18:12:09.657740Z", + "iopub.status.idle": "2026-03-31T18:12:09.663480Z", + "shell.execute_reply": "2026-03-31T18:12:09.662844Z" + } + }, + "outputs": [], + "source": [ + "def 
agent_invoker(invoker_input: AgentInvokerInput) -> AgentInvokerOutput:\n", + " \"\"\"\n", + " Called by OnDemandEvaluationDatasetRunner once per turn. Invoke the HR assistant\n", + " and return the text response.\n", + "\n", + " AgentInvokerInput fields:\n", + " - payload: The turn input (str or dict) from the dataset.\n", + " - session_id: Framework-managed session ID, stable across all turns\n", + " in a scenario. Pass it to your agent for conversation continuity.\n", + " \"\"\"\n", + " payload = invoker_input.payload\n", + " body = {\"prompt\": payload} if isinstance(payload, str) else payload\n", + "\n", + " resp = agentcore_client.invoke_agent_runtime(\n", + " agentRuntimeArn=AGENT_ARN,\n", + " qualifier=\"DEFAULT\",\n", + " runtimeSessionId=invoker_input.session_id,\n", + " payload=json.dumps(body).encode(\"utf-8\"),\n", + " )\n", + "\n", + " raw = resp[\"response\"].read().decode(\"utf-8\")\n", + " parts = []\n", + " for line in raw.splitlines():\n", + " if line.startswith(\"data: \"):\n", + " chunk = line[len(\"data: \"):]\n", + " try:\n", + " chunk = json.loads(chunk)\n", + " except Exception:\n", + " pass\n", + " parts.append(str(chunk))\n", + " return AgentInvokerOutput(agent_output=\"\".join(parts) if parts else raw)" + ] + }, + { + "cell_type": "markdown", + "id": "runner-dataset-md", + "metadata": {}, + "source": [ + "### 6a. Define the Evaluation Dataset\n", + "\n", + "We define scenarios inline. A mix of single-turn and multi-turn scenarios exercises\n", + "different aspects of the agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "runner-dataset", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:12:09.665778Z", + "iopub.status.busy": "2026-03-31T18:12:09.665579Z", + "iopub.status.idle": "2026-03-31T18:12:09.670891Z", + "shell.execute_reply": "2026-03-31T18:12:09.670266Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset contains 5 scenarios.\n" + ] + } + ], + "source": [ + "dataset = Dataset(\n", + " scenarios=[\n", + " # --- Single-turn: PTO balance ---\n", + " PredefinedScenario(\n", + " scenario_id=\"pto-balance-check\",\n", + " turns=[\n", + " Turn(\n", + " input=\"What is the current PTO balance for employee EMP-001?\",\n", + " expected_response=\"Employee EMP-001 has 10 remaining PTO days out of 15 total (5 days used).\",\n", + " )\n", + " ],\n", + " expected_trajectory=[\"get_pto_balance\"],\n", + " assertions=[\n", + " \"Agent called get_pto_balance with employee_id=EMP-001\",\n", + " \"Agent reported 10 remaining PTO days\",\n", + " ],\n", + " ),\n", + "\n", + " # --- Single-turn: HR policy lookup ---\n", + " PredefinedScenario(\n", + " scenario_id=\"pto-policy-lookup\",\n", + " turns=[\n", + " Turn(\n", + " input=\"What is the company PTO policy?\",\n", + " expected_response=\"Full-time employees accrue 15 days of PTO per year. Requests must be submitted at least 2 business days in advance. 
Up to 5 unused days roll over each year.\",\n", + " )\n", + " ],\n", + " expected_trajectory=[\"lookup_hr_policy\"],\n", + " assertions=[\n", + " \"Agent called lookup_hr_policy with topic=pto\",\n", + " \"Agent mentioned the 15-day annual accrual for full-time employees\",\n", + " \"Agent mentioned the 2 business day advance notice requirement\",\n", + " ],\n", + " ),\n", + "\n", + " # --- Single-turn: 401k benefits ---\n", + " PredefinedScenario(\n", + " scenario_id=\"401k-info\",\n", + " turns=[\n", + " Turn(\n", + " input=\"How does the 401k match work?\",\n", + " expected_response=\"The company matches 100% of contributions up to 4% of salary, plus 50% on the next 2%, for a total effective match of up to 5%. The match vests over 3 years.\",\n", + " )\n", + " ],\n", + " expected_trajectory=[\"get_benefits_summary\"],\n", + " assertions=[\n", + " \"Agent called get_benefits_summary with benefit_type=401k\",\n", + " \"Agent correctly described the 4% full match and 50% match on next 2%\",\n", + " \"Agent mentioned the 3-year vesting schedule\",\n", + " ],\n", + " ),\n", + "\n", + " # --- Single-turn: check balance then submit PTO ---\n", + " PredefinedScenario(\n", + " scenario_id=\"check-and-submit-pto\",\n", + " turns=[\n", + " Turn(\n", + " input=\"Check the PTO balance for EMP-002, and if they have at least 2 days, submit a request for 2026-05-26 to 2026-05-27.\",\n", + " expected_response=\"EMP-002 has 3 remaining PTO days. 
PTO request submitted and approved for 2026-05-26 to 2026-05-27.\",\n", + " )\n", + " ],\n", + " expected_trajectory=[\"get_pto_balance\", \"submit_pto_request\"],\n", + " assertions=[\n", + " \"Agent first called get_pto_balance for EMP-002\",\n", + " \"Agent confirmed 3 remaining days is sufficient\",\n", + " \"Agent then called submit_pto_request for the correct dates\",\n", + " ],\n", + " ),\n", + "\n", + " # --- Multi-turn: benefits exploration ---\n", + " PredefinedScenario(\n", + " scenario_id=\"benefits-exploration\",\n", + " turns=[\n", + " Turn(\n", + " input=\"Can you walk me through the health insurance options?\",\n", + " expected_response=\"The company covers 90% of premiums for employee-only coverage. Three plans are available: Blue Shield PPO, Kaiser HMO, and HDHP with HSA.\",\n", + " ),\n", + " Turn(\n", + " input=\"What about dental?\",\n", + " expected_response=\"The dental plan covers 100% of preventive care, 80% of basic restorative care, and 50% of major work, with a $2,000 annual maximum.\",\n", + " ),\n", + " Turn(\n", + " input=\"And how much does the company contribute to the 401k?\",\n", + " expected_response=\"The company matches 100% up to 4% of salary, plus 50% on the next 2%, for a total effective match of up to 5%.\",\n", + " ),\n", + " ],\n", + " expected_trajectory=[\"get_benefits_summary\", \"get_benefits_summary\", \"get_benefits_summary\"],\n", + " assertions=[\n", + " \"Agent called get_benefits_summary three times across the conversation\",\n", + " \"Agent correctly described health, dental, and 401k benefits in their respective turns\",\n", + " \"Agent maintained conversational context across all three turns\",\n", + " ],\n", + " ),\n", + " ]\n", + ")\n", + "\n", + "print(f\"Dataset contains {len(dataset.scenarios)} scenarios.\")" + ] + }, + { + "cell_type": "markdown", + "id": "runner-config-md", + "metadata": {}, + "source": [ + "### 6b. 
Configure and Run OnDemandEvaluationDatasetRunner" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "runner-config", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:12:09.672709Z", + "iopub.status.busy": "2026-03-31T18:12:09.672562Z", + "iopub.status.idle": "2026-03-31T18:12:09.683717Z", + "shell.execute_reply": "2026-03-31T18:12:09.683225Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OnDemandEvaluationDatasetRunner configured. Starting evaluation...\n", + " Scenarios : 5\n", + " Evaluators: 9 (7 built-in + 2 custom)\n", + " Delay : 180s (waiting for CloudWatch ingestion)\n" + ] + } + ], + "source": [ + "# Span collector: polls CloudWatch for OTel spans emitted by the agent\n", + "span_collector = CloudWatchAgentSpanCollector(\n", + " log_group_name=CW_LOG_GROUP,\n", + " region=REGION,\n", + " max_wait_seconds=180,\n", + " poll_interval_seconds=15,\n", + ")\n", + "\n", + "# Evaluator level cache — built-ins + custom evaluators\n", + "EVALUATOR_LEVELS = {\n", + " \"Builtin.GoalSuccessRate\": \"SESSION\",\n", + " \"Builtin.TrajectoryExactOrderMatch\": \"SESSION\",\n", + " \"Builtin.TrajectoryInOrderMatch\": \"SESSION\",\n", + " \"Builtin.TrajectoryAnyOrderMatch\": \"SESSION\",\n", + " \"Builtin.Correctness\": \"TRACE\",\n", + " \"Builtin.Helpfulness\": \"TRACE\",\n", + " \"Builtin.ResponseRelevance\": \"TRACE\",\n", + " \"Builtin.Coherence\": \"TRACE\",\n", + " \"Builtin.InstructionFollowing\": \"TRACE\",\n", + "}\n", + "# Custom evaluators (IDs are runtime values from Step 3b)\n", + "EVALUATOR_LEVELS[CUSTOM_RESPONSE_SIMILARITY_ID] = \"TRACE\"\n", + "EVALUATOR_LEVELS[CUSTOM_ASSERTION_CHECKER_ID] = \"SESSION\"\n", + "\n", + "# Evaluator configuration — mix of built-in and custom evaluators\n", + "config = EvaluationRunConfig(\n", + " evaluator_config=EvaluatorConfig(\n", + " evaluator_ids=[\n", + " \"Builtin.Correctness\", # TRACE — expected_response\n", + " 
\"Builtin.GoalSuccessRate\", # SESSION — assertions\n", + " \"Builtin.TrajectoryExactOrderMatch\", # SESSION — expected_trajectory\n", + " \"Builtin.TrajectoryInOrderMatch\", # SESSION — expected_trajectory\n", + " \"Builtin.TrajectoryAnyOrderMatch\", # SESSION — expected_trajectory\n", + " \"Builtin.Helpfulness\", # TRACE — no ground truth\n", + " \"Builtin.ResponseRelevance\", # TRACE — no ground truth\n", + " CUSTOM_RESPONSE_SIMILARITY_ID, # TRACE (custom) — {assistant_turn} + {expected_response}\n", + " CUSTOM_ASSERTION_CHECKER_ID, # SESSION (custom) — {actual_tool_trajectory} + {assertions}\n", + " ]\n", + " ),\n", + " evaluation_delay_seconds=180,\n", + " max_concurrent_scenarios=3,\n", + ")\n", + "\n", + "runner = OnDemandEvaluationDatasetRunner(region=REGION)\n", + "runner._evaluator_level_cache.update(EVALUATOR_LEVELS)\n", + "\n", + "print(\"OnDemandEvaluationDatasetRunner configured. Starting evaluation...\")\n", + "print(f\" Scenarios : {len(dataset.scenarios)}\")\n", + "print(f\" Evaluators: {len(config.evaluator_config.evaluator_ids)} \"\n", + " f\"(7 built-in + 2 custom)\")\n", + "print(f\" Delay : {config.evaluation_delay_seconds}s (waiting for CloudWatch ingestion)\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "runner-run", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T18:12:09.685148Z", + "iopub.status.busy": "2026-03-31T18:12:09.685047Z", + "iopub.status.idle": "2026-03-31T19:07:11.806861Z", + "shell.execute_reply": "2026-03-31T19:07:11.806186Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Evaluation complete: 5 completed, 0 failed out of 5 scenarios.\n" + ] + } + ], + "source": [ + "# Run the evaluation.\n", + "# OnDemandEvaluationDatasetRunner will:\n", + "# 1. Invoke agent_invoker for each turn in each scenario\n", + "# 2. Wait evaluation_delay_seconds for CloudWatch ingestion\n", + "# 3. Submit spans to the evaluation service\n", + "# 4. 
Return aggregated results\n", + "\n", + "eval_result = runner.run(\n", + " config=config,\n", + " dataset=dataset,\n", + " agent_invoker=agent_invoker,\n", + " span_collector=span_collector,\n", + ")\n", + "\n", + "completed = sum(1 for sr in eval_result.scenario_results if sr.status == \"COMPLETED\")\n", + "failed = sum(1 for sr in eval_result.scenario_results if sr.status == \"FAILED\")\n", + "print(f\"\\nEvaluation complete: {completed} completed, {failed} failed out of {len(eval_result.scenario_results)} scenarios.\")" + ] + }, + { + "cell_type": "markdown", + "id": "runner-results-md", + "metadata": {}, + "source": [ + "### 6c. Inspect Results" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "runner-results", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T19:07:11.818123Z", + "iopub.status.busy": "2026-03-31T19:07:11.817895Z", + "iopub.status.idle": "2026-03-31T19:07:11.835130Z", + "shell.execute_reply": "2026-03-31T19:07:11.834370Z" + } + }, + "outputs": [ + { + "data": { + "text/markdown": [ + "### Scenario: `pto-balance-check`\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.Correctness` | 1.0 | Correct | The agent response provides all the core factual information present in the expected response: employee EMP-001 has 15 total PTO d |\n", + "| `Builtin.GoalSuccessRate` | 1.0 | Yes | The conversation record shows that the agent called the 'get_pto_balance' tool with the parameter 'employee_id': 'EMP-001', which |\n", + "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['get_pto_balance'] matches expected trajectory ['get_pto_balance'] |\n", + "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['get_pto_balance'] found in order within actual ['get_pto_balance'] |\n", + "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['get_pto_balance'] found in actual 
['get_pto_balance'] |\n", + "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is clear and straightforward: to obtain the current PTO balance for employee EMP-001. The assistant successfully r |\n", + "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked for the current PTO balance for employee EMP-001. The LLM response directly addresses this question by providing th |\n", + "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response includes all key facts present in the expected response: the total number of PTO days (15), the number of use |\n", + "| `HRAssertionChecker_a007e092-zUFaMBBhck` | 1.0 | passed | The agent followed the expected tool trajectory by calling 'get_pto_balance'. Additionally, the agent satisfied the assertion that |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Scenario: `pto-policy-lookup`\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.Correctness` | 1.0 | Correct | The agent response includes all the core factual information from the expected response: (1) full-time employees accrue 15 days of |\n", + "| `Builtin.GoalSuccessRate` | 1.0 | Yes | Evaluating the three success assertions: 1. 
**Agent called lookup_hr_policy with topic=pto**: The tool execution history shows th |\n", + "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['lookup_hr_policy'] matches expected trajectory ['lookup_hr_policy'] |\n", + "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['lookup_hr_policy'] found in order within actual ['lookup_hr_policy'] |\n", + "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['lookup_hr_policy'] found in actual ['lookup_hr_policy'] |\n", + "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is clear and straightforward: to learn about the company's PTO policy. The assistant's response directly addresses |\n", + "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked 'What is the company PTO policy?' The LLM response directly addresses this question by providing the complete PTO p |\n", + "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 0.5 | partially_similar | The agent's response includes all the key facts from the expected response: full-time employees accrue 15 days of PTO per year, re |\n", + "| `HRAssertionChecker_a007e092-zUFaMBBhck` | 1.0 | passed | The agent followed the expected tool trajectory by calling 'lookup_hr_policy' with the topic 'pto'. Additionally, the agent mentio |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Scenario: `401k-info`\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.Correctness` | 1.0 | Correct | The agent response accurately conveys all the core factual information from the expected response: (1) 100% match up to 4% of sala |\n", + "| `Builtin.GoalSuccessRate` | 1.0 | Yes | Evaluating the three success assertions: 1. 
\"Agent called get_benefits_summary with benefit_type=401k\" - The tool execution histo |\n", + "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['get_benefits_summary'] matches expected trajectory ['get_benefits_summary'] |\n", + "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['get_benefits_summary'] found in order within actual ['get_benefits_summary'] |\n", + "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['get_benefits_summary'] found in actual ['get_benefits_summary'] |\n", + "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is clearly to understand how the 401k match works at their company. The assistant's response directly addresses th |\n", + "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked 'How does the 401k match work?' The LLM response directly addresses this question by explaining the 401k matching s |\n", + "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response includes all key facts from the expected response, such as the company's matching contribution of 100% up to |\n", + "| `HRAssertionChecker_a007e092-zUFaMBBhck` | 1.0 | passed | The agent followed the expected tool trajectory by calling 'get_benefits_summary'. 
It also satisfied all assertions by correctly d |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Scenario: `check-and-submit-pto`\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.Correctness` | 1.0 | Correct | The agent response contains all the core factual information from the expected response: (1) EMP-002 has 3 remaining PTO days, (2) |\n", + "| `Builtin.GoalSuccessRate` | 1.0 | Yes | The agent successfully completed all three assertions: (1) First called get_pto_balance for EMP-002, which returned 3 remaining da |\n", + "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['get_pto_balance', 'submit_pto_request'] matches expected trajectory ['get_pto_balance', 'submit_p |\n", + "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['get_pto_balance', 'submit_pto_request'] found in order within actual ['get_pto_balance', 'sub |\n", + "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['get_pto_balance', 'submit_pto_request'] found in actual ['get_pto_balance', 'submit_pto_requ |\n", + "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal was clear: check if EMP-002 has at least 2 days of PTO balance, and if so, submit a request for the specified date |\n", + "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user's question asks to: (1) check the PTO balance for EMP-002, and (2) if they have at least 2 days, submit a request for 202 |\n", + "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 0.5 | partially_similar | The agent's response includes all the key information from the expected response: EMP-002 has 3 remaining PTO days, and the PTO re |\n", + "| `HRAssertionChecker_a007e092-zUFaMBBhck` | N/A | ERR:ValueError | No score found in evaluation result |" + ], + "text/plain": [ + "" + ] + }, + "metadata": 
{}, + "output_type": "display_data" + }, + { + "data": { + "text/markdown": [ + "### Scenario: `benefits-exploration`\n", + "\n", + "| Evaluator | Value | Label | Explanation |\n", + "|---|---|---|---|\n", + "| `Builtin.Correctness` | 1.0 | Correct | The agent response contains all the core factual information from the expected response: (1) the company covers 90% of premiums fo |\n", + "| `Builtin.Correctness` | 1.0 | Correct | The agent response contains all the core factual information from the expected response: 100% coverage for preventive care, 80% fo |\n", + "| `Builtin.Correctness` | 1.0 | Correct | The agent response correctly conveys the core factual information about the company's 401k contribution. It states that the compan |\n", + "| `Builtin.GoalSuccessRate` | 1.0 | Yes | Evaluating the three success assertions: 1. **Agent called get_benefits_summary three times across the conversation**: The tool e |\n", + "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['get_benefits_summary', 'get_benefits_summary', 'get_benefits_summary'] matches expected trajector |\n", + "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['get_benefits_summary', 'get_benefits_summary', 'get_benefits_summary'] found in order within |\n", + "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['get_benefits_summary', 'get_benefits_summary', 'get_benefits_summary'] found in actual ['get |\n", + "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is to understand the health insurance options available to them. 
The assistant's response directly addresses this |\n", + "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is to understand health insurance options at Acme Corp, and they've now expanded their inquiry to include dental b |\n", + "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is to understand employee benefits at Acme Corp, specifically asking about 401k company contributions. The assista |\n", + "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked to be walked through the health insurance options. The LLM response directly addresses this question by providing a |\n", + "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user's question is 'What about dental?' which is a follow-up question asking about dental insurance options. The LLM response |\n", + "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user's question is: 'And how much does the company contribute to the 401k?' The LLM response provides information about the 40 |\n", + "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response includes all the key facts mentioned in the expected response. It accurately states the premium coverage perc |\n", + "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response provides a comprehensive summary of the dental insurance options, including coverage details, annual maximum |\n", + "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response provides a detailed summary that includes all the key facts mentioned in the expected response. It correctly |\n", + "| `HRAssertionChecker_a007e092-zUFaMBBhck` | 1.0 | passed | The agent followed the expected tool trajectory exactly as specified, calling 'get_benefits_summary' three times. 
Additionally, th |" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def display_runner_results(eval_result) -> None:\n", + " \"\"\"Display OnDemandEvaluationDatasetRunner results as a markdown table per scenario.\"\"\"\n", + " for sr in eval_result.scenario_results:\n", + " if sr.status == \"FAILED\":\n", + " display(Markdown(f\"**Scenario `{sr.scenario_id}`** — FAILED: {sr.error}\"))\n", + " continue\n", + "\n", + " rows = [\"| Evaluator | Value | Label | Explanation |\",\n", + " \"|---|---|---|---|\"]\n", + " for er in sr.evaluator_results:\n", + " for res in er.results:\n", + " value = str(res.get(\"value\", res.get(\"score\", \"N/A\")))\n", + " lbl = str(res.get(\"label\", res.get(\"rating\", \"\")))\n", + " explanation = (res.get(\"explanation\", \"\") or \"\")[:130].replace(\"\\n\", \" \")\n", + " error_code = res.get(\"errorCode\")\n", + " if error_code:\n", + " lbl = f\"ERR:{error_code}\"\n", + " explanation = (res.get(\"errorMessage\", \"\") or \"\")[:130]\n", + " rows.append(f\"| `{er.evaluator_id[:40]}` | {value} | {lbl} | {explanation} |\")\n", + "\n", + " md = f\"### Scenario: `{sr.scenario_id}`\\n\\n\" + \"\\n\".join(rows)\n", + " display(Markdown(md))\n", + "\n", + "\n", + "display_runner_results(eval_result)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "runner-summary", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T19:07:11.837290Z", + "iopub.status.busy": "2026-03-31T19:07:11.837118Z", + "iopub.status.idle": "2026-03-31T19:07:11.840760Z", + "shell.execute_reply": "2026-03-31T19:07:11.840171Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Evaluator Summary (average score across all scenarios)\n", + "============================================================\n", + " Builtin.Correctness avg=1.00 (n=7)\n", + " Builtin.GoalSuccessRate avg=1.00 (n=5)\n", + " Builtin.Helpfulness avg=0.83 
(n=7)\n", + " Builtin.ResponseRelevance avg=1.00 (n=7)\n", + " Builtin.TrajectoryAnyOrderMatch avg=1.00 (n=5)\n", + " Builtin.TrajectoryExactOrderMatch avg=1.00 (n=5)\n", + " Builtin.TrajectoryInOrderMatch avg=1.00 (n=5)\n", + " HRAssertionChecker_a007e092-zUFaMBBhck avg=1.00 (n=4)\n", + " HRResponseSimilarity_a007e092-v0ojZ8ARHR avg=0.86 (n=7)\n" + ] + } + ], + "source": [ + "# Aggregate summary: average score per evaluator across all scenarios\n", + "from collections import defaultdict\n", + "\n", + "scores_by_evaluator = defaultdict(list)\n", + "for sr in eval_result.scenario_results:\n", + " if sr.status != \"COMPLETED\":\n", + " continue\n", + " for er in sr.evaluator_results:\n", + " for res in er.results:\n", + " if \"value\" in res and res[\"value\"] is not None and not res.get(\"errorCode\"):\n", + " scores_by_evaluator[er.evaluator_id].append(float(res[\"value\"]))\n", + "\n", + "print(\"\\nEvaluator Summary (average score across all scenarios)\")\n", + "print(\"=\" * 60)\n", + "for evaluator_id, scores in sorted(scores_by_evaluator.items()):\n", + " avg = sum(scores) / len(scores)\n", + " print(f\" {evaluator_id:<45} avg={avg:.2f} (n={len(scores)})\")" + ] + }, + { + "cell_type": "markdown", + "id": "save-results-md", + "metadata": {}, + "source": [ + "### 6d. 
Save Results to File" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "save-results", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T19:07:11.842441Z", + "iopub.status.busy": "2026-03-31T19:07:11.842317Z", + "iopub.status.idle": "2026-03-31T19:07:11.851164Z", + "shell.execute_reply": "2026-03-31T19:07:11.850695Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Results saved to: results/groundtruth_eval_20260331_190711.json\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/9h/jk764rms7493dnsp6_d37wqm0000gq/T/ipykernel_96692/3226257259.py:5: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n", + " timestamp = datetime.utcnow().strftime(\"%Y%m%d_%H%M%S\")\n" + ] + } + ], + "source": [ + "import os\n", + "from datetime import datetime\n", + "\n", + "os.makedirs(\"results\", exist_ok=True)\n", + "timestamp = datetime.utcnow().strftime(\"%Y%m%d_%H%M%S\")\n", + "results_path = f\"results/groundtruth_eval_{timestamp}.json\"\n", + "\n", + "with open(results_path, \"w\") as f:\n", + " json.dump(eval_result.model_dump(), f, indent=2, default=str)\n", + "\n", + "print(f\"Results saved to: {results_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cleanup-md", + "metadata": {}, + "source": [ + "## Step 7: Cleanup\n", + "\n", + "Delete the agent runtime endpoint when you're done to avoid ongoing costs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "cleanup", + "metadata": { + "execution": { + "iopub.execute_input": "2026-03-31T19:07:11.852819Z", + "iopub.status.busy": "2026-03-31T19:07:11.852701Z", + "iopub.status.idle": "2026-03-31T19:07:11.855335Z", + "shell.execute_reply": "2026-03-31T19:07:11.854895Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cleanup skipped. Uncomment the cell above to delete the agent runtime.\n" + ] + } + ], + "source": [ + "# Uncomment to delete the agent runtime\n", + "# agent_runtime.delete()\n", + "# print(\"Agent runtime deleted.\")\n", + "\n", + "print(\"Cleanup skipped. Uncomment the cell above to delete the agent runtime.\")" + ] + }, + { + "cell_type": "markdown", + "id": "next-steps-md", + "metadata": {}, + "source": [ + "### Key takeaways\n", + "\n", + "| | EvaluationClient | OnDemandEvaluationDatasetRunner |\n", + "|---|---|---|\n", + "| **When to use** | You have existing sessions | You have a test dataset |\n", + "| **Agent invocation** | Not included | Automatic |\n", + "| **Best for** | Post-hoc analysis, debugging | Regression testing, CI/CD |\n", + "| **Input** | session_id | Dataset of scenarios |\n", + "\n", + "### Built-in evaluator reference\n", + "\n", + "| Evaluator | Level | Ground truth required |\n", + "|---|---|---|\n", + "| `Builtin.Correctness` | TRACE | `expected_response` |\n", + "| `Builtin.GoalSuccessRate` | SESSION | `assertions` |\n", + "| `Builtin.TrajectoryExactOrderMatch` | SESSION | `expected_trajectory` |\n", + "| `Builtin.TrajectoryInOrderMatch` | SESSION | `expected_trajectory` |\n", + "| `Builtin.TrajectoryAnyOrderMatch` | SESSION | `expected_trajectory` |\n", + "| `Builtin.Helpfulness` | TRACE | None |\n", + "| `Builtin.ResponseRelevance` | TRACE | None |\n", + "| `Builtin.Coherence` | TRACE | None |\n", + "\n", + "### Custom evaluator ground truth placeholders\n", + "\n", + "Custom (LLM-as-a-judge) evaluators reference 
ground truth via placeholders in their `instructions`.\n", + "\n", + "| Level | Placeholder | Filled from |\n", + "|---|---|---|\n", + "| TRACE | `{assistant_turn}` | Agent's actual response |\n", + "| TRACE | `{expected_response}` | `ReferenceInputs.expected_response` |\n", + "| TRACE | `{context}` | Session context |\n", + "| SESSION | `{actual_tool_trajectory}` | Tools called by the agent |\n", + "| SESSION | `{expected_tool_trajectory}` | `ReferenceInputs.expected_trajectory` |\n", + "| SESSION | `{assertions}` | `ReferenceInputs.assertions` |\n", + "| SESSION | `{available_tools}` | Tools available to the agent |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py new file mode 100644 index 000000000..f0ecb1b34 --- /dev/null +++ b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py @@ -0,0 +1,268 @@ +""" +HR Assistant Agent — Strands agent deployed on Bedrock AgentCore Runtime. 
+ +Tools (deterministic / mock data for reproducible evaluations): + get_pto_balance — remaining PTO days for an employee + submit_pto_request — request time off + lookup_hr_policy — company policy documents + get_benefits_summary — health, dental, vision, 401k, life insurance details + get_pay_stub — pay stub for a given period +""" + +import logging +import re + +from bedrock_agentcore.runtime import BedrockAgentCoreApp +from strands import Agent, tool +from strands.models import BedrockModel + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +app = BedrockAgentCoreApp() + +# --------------------------------------------------------------------------- +# Mock data +# --------------------------------------------------------------------------- + +_PTO_BALANCES = { + "EMP-001": {"total_days": 15, "used_days": 5, "remaining_days": 10}, + "EMP-002": {"total_days": 15, "used_days": 12, "remaining_days": 3}, + "EMP-042": {"total_days": 20, "used_days": 7, "remaining_days": 13}, +} + +_HR_POLICIES = { + "pto": ( + "PTO Policy: Full-time employees accrue 15 days of PTO per year (20 days after 3 years). " + "PTO requests must be submitted at least 2 business days in advance. " + "Unused PTO up to 5 days rolls over to the next year. " + "PTO cannot be taken in advance of accrual." + ), + "remote_work": ( + "Remote Work Policy: Employees may work remotely up to 3 days per week with manager approval. " + "Core collaboration hours are 10am-3pm local time. " + "A dedicated workspace with reliable internet (25 Mbps+) is required. " + "Employees must be reachable via Slack and email during core hours." + ), + "parental_leave": ( + "Parental Leave Policy: Primary caregivers receive 16 weeks of fully paid parental leave. " + "Secondary caregivers receive 6 weeks of fully paid parental leave. " + "Leave may begin up to 2 weeks before the expected birth or adoption date. " + "Benefits continue unchanged during parental leave." 
+ ), + "code_of_conduct": ( + "Code of Conduct: All employees are expected to treat colleagues, customers, and partners " + "with respect and professionalism. Harassment, discrimination, and retaliation of any kind " + "are strictly prohibited. Violations should be reported to HR or via the anonymous hotline." + ), +} + +_BENEFITS = { + "health": ( + "Health Insurance: The company covers 90% of premiums for employee-only coverage and 75% " + "for family coverage. Plans available: Blue Shield PPO, Kaiser HMO, and HDHP with HSA. " + "Annual deductible: $500 (PPO), $0 (HMO), $1,500 (HDHP). " + "Open enrollment is each November for the following calendar year." + ), + "dental": ( + "Dental Insurance: 100% coverage for preventive care (cleanings, X-rays). " + "80% coverage for basic restorative care (fillings, extractions). " + "50% coverage for major restorative care (crowns, bridges). " + "Annual maximum benefit: $2,000 per person. Orthodontia lifetime maximum: $1,500." + ), + "vision": ( + "Vision Insurance: Annual eye exam covered in full. " + "Frames or contacts allowance: $200 per year. " + "Laser vision correction discount: 15% off at participating providers." + ), + "401k": ( + "401(k) Plan: The company matches 100% of employee contributions up to 4% of salary. " + "An additional 50% match on the next 2% (total effective match up to 5%). " + "Employees are eligible to contribute immediately; company match vests over 3 years. " + "2026 IRS contribution limit: $23,500 (under 50), $31,000 (age 50+)." + ), + "life_insurance": ( + "Life Insurance: Basic life insurance of 2x annual salary provided at no cost. " + "Employees may purchase supplemental coverage up to 5x salary during open enrollment. " + "Accidental death and dismemberment (AD&D) coverage equal to basic life benefit is included." 
+ ), +} + +_PAY_STUBS = { + ("EMP-001", "2025-12"): { + "gross_pay": 8333.33, "federal_tax": 1458.33, "state_tax": 416.67, + "social_security": 516.67, "medicare": 120.83, "health_premium": 125.00, + "401k_contribution": 333.33, "net_pay": 5362.50, "period": "December 2025", + }, + ("EMP-001", "2026-01"): { + "gross_pay": 8333.33, "federal_tax": 1458.33, "state_tax": 416.67, + "social_security": 516.67, "medicare": 120.83, "health_premium": 125.00, + "401k_contribution": 333.33, "net_pay": 5362.50, "period": "January 2026", + }, + ("EMP-042", "2026-01"): { + "gross_pay": 10416.67, "federal_tax": 1875.00, "state_tax": 520.83, + "social_security": 645.83, "medicare": 151.04, "health_premium": 200.00, + "401k_contribution": 416.67, "net_pay": 6607.30, "period": "January 2026", + }, +} + +_PTO_REQUEST_COUNTER = {"n": 0} + + +# --------------------------------------------------------------------------- +# Strands tools +# --------------------------------------------------------------------------- + +@tool +def get_pto_balance(employee_id: str) -> dict: + """ + Return the current PTO balance for an employee. + + Args: + employee_id: Employee identifier (e.g. EMP-001) + + Returns: + Dict with total_days, used_days, and remaining_days. + """ + balance = _PTO_BALANCES.get(employee_id) + if balance: + return {"employee_id": employee_id, **balance} + return {"employee_id": employee_id, "error": f"Employee {employee_id} not found."} + + +@tool +def submit_pto_request( + employee_id: str, + start_date: str, + end_date: str, + reason: str = "Personal time off", +) -> dict: + """ + Submit a PTO request for an employee. + + Args: + employee_id: Employee identifier (e.g. EMP-001) + start_date: First day of leave in YYYY-MM-DD format + end_date: Last day of leave in YYYY-MM-DD format + reason: Optional reason for the request + + Returns: + Dict with request_id, status, and confirmation message. 
+ """ + _PTO_REQUEST_COUNTER["n"] += 1 + request_id = f"PTO-2026-{_PTO_REQUEST_COUNTER['n']:03d}" + return { + "request_id": request_id, + "employee_id": employee_id, + "start_date": start_date, + "end_date": end_date, + "reason": reason, + "status": "APPROVED", + "message": f"PTO request {request_id} approved for {employee_id} from {start_date} to {end_date}.", + } + + +@tool +def lookup_hr_policy(topic: str) -> dict: + """ + Look up a company HR policy document by topic. + + Args: + topic: Policy topic. Supported values: pto, remote_work, parental_leave, code_of_conduct + + Returns: + Dict with topic and policy_text. + """ + key = topic.lower().replace(" ", "_").replace("-", "_") + text = _HR_POLICIES.get(key) + if text: + return {"topic": topic, "policy_text": text} + return {"topic": topic, "error": f"Policy '{topic}' not found. Available: {list(_HR_POLICIES.keys())}"} + + +@tool +def get_benefits_summary(benefit_type: str) -> dict: + """ + Return a summary of a specific employee benefit. + + Args: + benefit_type: Type of benefit. Supported values: health, dental, vision, 401k, life_insurance + + Returns: + Dict with benefit_type and summary text. + """ + key = benefit_type.lower().replace(" ", "_").replace("-", "_") + text = _BENEFITS.get(key) + if text: + return {"benefit_type": benefit_type, "summary": text} + return {"benefit_type": benefit_type, "error": f"Benefit '{benefit_type}' not found. Available: {list(_BENEFITS.keys())}"} + + +@tool +def get_pay_stub(employee_id: str, period: str) -> dict: + """ + Retrieve a pay stub for an employee for a specific pay period. + + Args: + employee_id: Employee identifier (e.g. EMP-001) + period: Pay period in YYYY-MM format (e.g. 2026-01) + + Returns: + Dict with gross pay, deductions, and net pay. 
+ """ + stub = _PAY_STUBS.get((employee_id, period)) + if stub: + return {"employee_id": employee_id, **stub} + return {"employee_id": employee_id, "period": period, "error": f"Pay stub not found for {employee_id} period {period}."} + + +# --------------------------------------------------------------------------- +# Agent +# --------------------------------------------------------------------------- + +SYSTEM_PROMPT = """You are a helpful HR Assistant for Acme Corp. + +You help employees with: +- Checking PTO (paid time off) balances +- Submitting PTO requests +- Looking up HR policies (PTO, remote work, parental leave, code of conduct) +- Understanding employee benefits (health, dental, vision, 401k, life insurance) +- Retrieving pay stub information + +Always use the available tools to answer questions accurately. Do not make up +policy details, benefit amounts, or pay information — look them up. +Be concise, professional, and friendly.""" + +_MODEL = BedrockModel(model_id="us.amazon.nova-lite-v1:0") +_TOOLS = [get_pto_balance, submit_pto_request, lookup_hr_policy, get_benefits_summary, get_pay_stub] + +# Session cache: session_id -> Agent (preserves conversation history across turns) +_SESSION_AGENTS: dict[str, Agent] = {} + + +@app.entrypoint +async def invoke(payload, context): + """Handle an agent invocation from AgentCore Runtime.""" + prompt = payload.get("prompt", "") + session_id = context.session_id + logger.info("Received prompt (session=%s): %s", session_id, prompt[:80]) + + if session_id and session_id in _SESSION_AGENTS: + agent = _SESSION_AGENTS[session_id] + else: + agent = Agent(model=_MODEL, tools=_TOOLS, system_prompt=SYSTEM_PROMPT) + if session_id: + _SESSION_AGENTS[session_id] = agent + + parts = [] + async for event in agent.stream_async(prompt): + if "data" in event: + parts.append(str(event["data"])) + response = "".join(parts) + # Strip inline ... 
blocks so spans contain only the final answer + response = re.sub(r".*?", "", response, flags=re.DOTALL).strip() + return response + + +if __name__ == "__main__": + app.run() diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/requirements.txt b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/requirements.txt new file mode 100644 index 000000000..39492b397 --- /dev/null +++ b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/requirements.txt @@ -0,0 +1,6 @@ +bedrock-agentcore>=1.5.0 +bedrock-agentcore-starter-toolkit>=0.3.0 +boto3>=1.42.0 +strands-agents +strands-agents-tools +aws-opentelemetry-distro From e90762ea607a5945d6121fe60892ef7498c5496d Mon Sep 17 00:00:00 2001 From: Bharathi Srinivasan Date: Tue, 31 Mar 2026 14:51:23 -0700 Subject: [PATCH 2/3] updating README --- .../README.md | 49 +- .../groundtruth_evaluations.ipynb | 1648 +++-------------- 2 files changed, 258 insertions(+), 1439 deletions(-) diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md index 5102077cd..d7af7fe4a 100644 --- a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md +++ b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md @@ -2,7 +2,7 @@ ## Overview -This tutorial demonstrates end-to-end evaluation of an agentic application using Amazon Bedrock AgentCore's two primary evaluation interfaces: **EvaluationClient** and **EvaluationRunner**. Both are used with ground-truth reference inputs to measure factual correctness, goal achievement, and tool-use accuracy. +This tutorial demonstrates end-to-end evaluation of an agentic application using Amazon Bedrock AgentCore's two primary evaluation interfaces: **EvaluationClient** and **OnDemandEvaluationDatasetRunner**. 
Both are used with ground-truth reference inputs to measure factual correctness, goal achievement, and tool-use accuracy. The tutorial uses an **HR Assistant agent** for Acme Corp — a Strands agent that helps employees with PTO management, HR policy lookups, benefits information, and pay stubs. @@ -14,13 +14,13 @@ The tutorial uses an **HR Assistant agent** for Acme Corp — a Strands agent th - How to define an evaluation dataset with `TurnByTurnScenario` and `Turn` - How to run automated dataset evaluations with `EvaluationRunner` - How to interpret built-in evaluator results for trajectory, correctness, and goal-success metrics +- How to create **custom LLM-as-a-judge evaluators** with ground-truth placeholders ## Prerequisites Before running this tutorial, ensure you have: - Python 3.10+ -- Docker running locally (for agent container image build) - AWS credentials with permissions for: - AgentCore Runtime (`bedrock-agentcore:*`) - AgentCore Evaluations (`bedrock-agentcore:Evaluate`) @@ -46,10 +46,11 @@ A single self-contained notebook that walks through the full evaluation workflow |---|---| | 1 | Install dependencies | | 2 | Configure AWS session and region | -| 3 | Deploy the HR Assistant agent to AgentCore Runtime | +| 3a | Deploy the HR Assistant agent to AgentCore Runtime | +| 3b | **Create custom LLM-as-a-judge evaluators** with ground-truth placeholders | | 4 | Invoke the agent to generate sessions with CloudWatch spans | | 5 | **EvaluationClient** — evaluate existing sessions with ground truth | -| 6 | **EvaluationRunner** — automated dataset evaluation | +| 6 | **OnDemandEvaluationDatasetRunner** — automated dataset evaluation | | 7 | Cleanup | ## EvaluationClient vs EvaluationRunner @@ -57,7 +58,6 @@ A single self-contained notebook that walks through the full evaluation workflow | | EvaluationClient | EvaluationRunner | |---|---|---| | **When to use** | You already have recorded sessions | You have a test dataset | -| **Agent invocation** | Not 
included — evaluates existing sessions | Automatic — invokes your agent for every scenario | | **Input** | `session_id` + `agent_id` | `Dataset` of `TurnByTurnScenario` objects | | **Best for** | Post-hoc analysis, debugging, incident investigation | Regression testing, CI/CD pipelines, batch evaluation | @@ -73,7 +73,33 @@ A single self-contained notebook that walks through the full evaluation workflow Evaluators that don't require ground truth (`Builtin.Helpfulness`, `Builtin.ResponseRelevance`) can be included in the same call — each evaluator reads only the fields it needs. -The same fields apply to `TurnByTurnScenario` objects in `EvaluationRunner` datasets. +The same fields apply to `PredefinedScenario` objects in `OnDemandEvaluationDatasetRunner` datasets. + +## Custom Evaluators with Ground Truth + +In addition to built-in evaluators, you can define **custom LLM-as-a-judge evaluators** with +evaluation criteria written in natural language. Custom evaluators support the same ground-truth +fields through **placeholders** that the service substitutes at evaluation time. 
+ +### Placeholder reference + +| Level | Placeholder | Filled from | +|---|---|---| +| TRACE | `{assistant_turn}` | Agent's actual response for that turn | +| TRACE | `{expected_response}` | `ReferenceInputs.expected_response` | +| TRACE | `{context}` | Conversation context preceding the turn | +| SESSION | `{actual_tool_trajectory}` | Tools the agent called during the session | +| SESSION | `{expected_tool_trajectory}` | `ReferenceInputs.expected_trajectory` | +| SESSION | `{assertions}` | `ReferenceInputs.assertions` | +| SESSION | `{available_tools}` | Tools available to the agent | + + +The notebook demonstrates two custom evaluators: + +| Evaluator | Level | Placeholders | Description | +|---|---|---|---| +| `HRResponseSimilarity` | TRACE | `{assistant_turn}`, `{expected_response}` | Scores how closely the agent's response matches the expected answer | +| `HRAssertionChecker` | SESSION | `{actual_tool_trajectory}`, `{expected_tool_trajectory}`, `{assertions}` | Scores whether the agent called the right tools and satisfied all assertions | ## Built-in Evaluators Used @@ -109,14 +135,9 @@ The notebook evaluates five scenarios that cover different evaluation patterns: | Scenario | Turns | Key evaluators | |---|---|---| -| PTO balance check | 1 | Correctness, Helpfulness | -| PTO submission | 1 | GoalSuccessRate, Trajectory, Correctness | +| PTO balance check | 1 | Correctness, Helpfulness, **HRResponseSimilarity** (custom) | +| PTO submission | 1 | GoalSuccessRate, Trajectory, Correctness, **HRResponseSimilarity** (custom) | | Pay stub lookup | 1 | Correctness, GoalSuccessRate | -| PTO planning session | 3 | GoalSuccessRate, TrajectoryExactOrderMatch | +| PTO planning session | 3 | GoalSuccessRate, TrajectoryExactOrderMatch, **HRAssertionChecker** (custom) | | New employee onboarding | 4 | GoalSuccessRate, TrajectoryAnyOrderMatch | -## Next Steps - -- **[programmatic_evaluators/](../programmatic_evaluators/)** — Extend evaluations with Lambda-backed code 
evaluators for deterministic, business-rule-driven scoring. -- Review evaluation results to identify scenarios where the agent calls the wrong tool or returns inaccurate facts. -- Integrate `EvaluationRunner` into your CI/CD pipeline to catch regressions automatically. diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb index cf2388de2..4a7137881 100644 --- a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb +++ b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb @@ -52,7 +52,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "install", "metadata": { "execution": { @@ -62,17 +62,7 @@ "shell.execute_reply": "2026-03-31T18:08:34.577735Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.1.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m26.0.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], + "outputs": [], "source": [ "!pip install -r requirements.txt -q" ] @@ -89,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "setup", "metadata": { "execution": { @@ -99,15 +89,7 @@ "shell.execute_reply": "2026-03-31T18:08:34.719215Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Region : us-east-1\n" - ] - } - ], + "outputs": [], "source": [ "import boto3\n", "import json\n", @@ -117,7 +99,7 @@ "from boto3.session import Session\n", "from IPython.display import display, 
Markdown\n", "\n", - "region = \"us-east-1\"\n", + "region = \"aws_region\" # Add AWS region here \n", "boto_session = Session(region_name=region)\n", "REGION = boto_session.region_name\n", "\n", @@ -145,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "nn72gdo2s4h", "metadata": { "execution": { @@ -155,15 +137,7 @@ "shell.execute_reply": "2026-03-31T18:08:34.727890Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting hr_assistant_agent.py\n" - ] - } - ], + "outputs": [], "source": [ "%%writefile hr_assistant_agent.py\n", "\"\"\"\n", @@ -438,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "deploy", "metadata": { "execution": { @@ -448,585 +422,7 @@ "shell.execute_reply": "2026-03-31T18:09:25.947045Z" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Entrypoint parsed: file=/Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py, bedrock_agentcore_name=hr_assistant_agent\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Memory disabled - agent will be stateless\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Configuring BedrockAgentCore agent: hr_assistant_eval_tutorial\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Memory disabled\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Network mode: PUBLIC\n" - ] - }, - { - "data": { - "text/html": [ - "
📄 Using existing Dockerfile: \n",
-       "/Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05\n",
-       "-groundtruth-based-evalautions/Dockerfile\n",
-       "
\n" - ], - "text/plain": [ - "📄 Using existing Dockerfile: \n", - "\u001b[35m/Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05\u001b[0m\n", - "\u001b[35m-groundtruth-based-evalautions/\u001b[0m\u001b[95mDockerfile\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generated .dockerignore: /Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/.dockerignore\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Keeping 'hr_assistant_eval_tutorial' as default agent\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Bedrock AgentCore configured: /Users/bhrsrini/projects/Agentcore/Policy/amazon-bedrock-agentcore-samples/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/.bedrock_agentcore.yaml\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🚀 Launching Bedrock AgentCore (cloud mode - RECOMMENDED)...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " • Deploy Python code directly to runtime\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " • No Docker required (DEFAULT behavior)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " • Production-ready deployment\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "💡 Deployment options:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " • runtime.launch() → Cloud (current)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " • runtime.launch(local=True) → Local development\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": 
[ - "Memory disabled - skipping memory creation\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Starting CodeBuild ARM64 deployment for agent 'hr_assistant_eval_tutorial' to account 849138760372 (us-east-1)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Generated image tag: 20260331-180836-339\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Setting up AWS resources (ECR repository, execution roles)...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Getting or creating ECR repository for agent: hr_assistant_eval_tutorial\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Configuration complete.\n", - "\n", - "Deploying HR Assistant Agent ...\n", - " This takes ~5 minutes on first run (image build + push + runtime creation).\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ECR repository available: 849138760372.dkr.ecr.us-east-1.amazonaws.com/bedrock-agentcore-hr_assistant_eval_tutorial\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Getting or creating execution role for agent: hr_assistant_eval_tutorial\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using AWS region: us-east-1, account ID: 849138760372\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Role name: AmazonBedrockAgentCoreSDKRuntime-us-east-1-0c8cfc0a5d\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "✅ Reusing existing ECR repository: 849138760372.dkr.ecr.us-east-1.amazonaws.com/bedrock-agentcore-hr_assistant_eval_tutorial\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "✅ Reusing existing execution role: arn:aws:iam::849138760372:role/AmazonBedrockAgentCoreSDKRuntime-us-east-1-0c8cfc0a5d\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Execution 
role available: arn:aws:iam::849138760372:role/AmazonBedrockAgentCoreSDKRuntime-us-east-1-0c8cfc0a5d\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Preparing CodeBuild project and uploading source...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Getting or creating CodeBuild execution role for agent: hr_assistant_eval_tutorial\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Role name: AmazonBedrockAgentCoreSDKCodeBuild-us-east-1-0c8cfc0a5d\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Reusing existing CodeBuild execution role: arn:aws:iam::849138760372:role/AmazonBedrockAgentCoreSDKCodeBuild-us-east-1-0c8cfc0a5d\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using dockerignore.template with 47 patterns for zip filtering\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Uploaded source to S3: hr_assistant_eval_tutorial/source.zip\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Updated CodeBuild project: bedrock-agentcore-hr_assistant_eval_tutorial-builder\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Starting CodeBuild build (this may take several minutes)...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Starting CodeBuild monitoring...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🔄 QUEUED started (total: 0s)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "✅ QUEUED completed in 1.1s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🔄 PROVISIONING started (total: 1s)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "✅ PROVISIONING completed in 6.7s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🔄 DOWNLOAD_SOURCE started (total: 8s)\n" - ] - }, - { 
- "name": "stderr", - "output_type": "stream", - "text": [ - "✅ DOWNLOAD_SOURCE completed in 2.2s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🔄 BUILD started (total: 10s)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "✅ BUILD completed in 16.7s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🔄 POST_BUILD started (total: 27s)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "✅ POST_BUILD completed in 12.3s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🔄 COMPLETED started (total: 39s)\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "✅ COMPLETED completed in 1.1s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🎉 CodeBuild completed successfully in 0m 40s\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "CodeBuild completed successfully\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "CodeBuild project configuration saved\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Deploying to Bedrock AgentCore...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Agent created/updated: arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Observability is enabled, configuring observability components...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "CloudWatch Logs resource policy already configured\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "X-Ray trace destination already configured\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "X-Ray indexing rule already configured\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"Transaction Search already fully configured\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ObservabilityDeliveryManager initialized for region: us-east-1, account: 849138760372\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "✅ Logs auto-created by AWS for runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "✅ Traces delivery enabled for runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Observability enabled for runtime/hr_assistant_eval_tutorial-xfZ3yiH356 - logs: True, traces: True\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "✅ X-Ray traces delivery enabled for agent hr_assistant_eval_tutorial-xfZ3yiH356\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🔍 GenAI Observability Dashboard:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#gen-ai-observability/agent-core\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Polling for endpoint to be ready...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Agent endpoint: arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356/runtime-endpoint/DEFAULT\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Deployment completed successfully - Agent: arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Built with CodeBuild: bedrock-agentcore-hr_assistant_eval_tutorial-builder:8b4e4d86-70e7-445e-9948-b4a88c7f518a\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Deployed to cloud: 
arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ECR image: 849138760372.dkr.ecr.us-east-1.amazonaws.com/bedrock-agentcore-hr_assistant_eval_tutorial:20260331-180836-339\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "🔍 Agent logs available at:\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT --log-stream-name-prefix \"2026/03/31/\\[runtime-logs]\"\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT --log-stream-names \"otel-rt-logs\"\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "💡 Tail logs with: aws logs tail /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT --log-stream-name-prefix \"2026/03/31/\\[runtime-logs]\" --follow\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "💡 Or view recent logs: aws logs tail /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT --log-stream-name-prefix \"2026/03/31/\\[runtime-logs]\" --since 1h\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Launch complete.\n", - " agent_id : hr_assistant_eval_tutorial-xfZ3yiH356\n", - " agent_arn : arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n" - ] - } - ], + "outputs": [], "source": [ "from bedrock_agentcore_starter_toolkit import Runtime\n", "\n", @@ -1057,7 +453,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "wait-deploy", "metadata": { "execution": { @@ -1067,31 +463,7 @@ "shell.execute_reply": "2026-03-31T18:09:26.423531Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", 
- "text": [ - "Waiting for agent to reach READY status ...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Retrieved Bedrock AgentCore status for: hr_assistant_eval_tutorial\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " [ 0s] status = READY\n", - "\n", - "Agent is READY. Proceeding.\n" - ] - } - ], + "outputs": [], "source": [ "import time\n", "\n", @@ -1127,7 +499,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "agent-config", "metadata": { "execution": { @@ -1137,17 +509,7 @@ "shell.execute_reply": "2026-03-31T18:09:26.430128Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "AGENT_ID : hr_assistant_eval_tutorial-xfZ3yiH356\n", - "AGENT_ARN : arn:aws:bedrock-agentcore:us-east-1:849138760372:runtime/hr_assistant_eval_tutorial-xfZ3yiH356\n", - "CW_LOG_GROUP : /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT\n" - ] - } - ], + "outputs": [], "source": [ "AGENT_ID = _launch.agent_id\n", "AGENT_ARN = _launch.agent_arn\n", @@ -1162,7 +524,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "store-agent", "metadata": { "execution": { @@ -1172,18 +534,7 @@ "shell.execute_reply": "2026-03-31T18:09:26.436137Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'AGENT_ID' (str)\n", - "Stored 'AGENT_ARN' (str)\n", - "Stored 'CW_LOG_GROUP' (str)\n", - "Stored 'REGION' (str)\n" - ] - } - ], + "outputs": [], "source": [ "# Persist agent info for the programmatic_evaluators notebook\n", "%store AGENT_ID\n", @@ -1194,13 +545,167 @@ }, { "cell_type": "markdown", - "id": "a6r2v510ko", + "id": "91312ba2", + "metadata": {}, + "source": [ + "## Step 3: Invoke the Agent to Generate Sessions\n", + "\n", + "Before we can evaluate, we need agent sessions with CloudWatch spans. 
We'll invoke the agent\n", + "for several scenarios and record the session IDs for use with `EvaluationClient`.\n", + "\n", + "Each session corresponds to one evaluation scenario." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fb84d7e", "metadata": {}, + "outputs": [], "source": [ - "## Step 3b: Create Custom (LLM-as-a-Judge) Evaluators\n", + "def invoke_agent(prompt: str, session_id: str) -> str:\n", + " \"\"\"Send a single prompt to the HR assistant and return its text response.\"\"\"\n", + " resp = agentcore_client.invoke_agent_runtime(\n", + " agentRuntimeArn=AGENT_ARN,\n", + " qualifier=\"DEFAULT\",\n", + " runtimeSessionId=session_id,\n", + " payload=json.dumps({\"prompt\": prompt}).encode(\"utf-8\"),\n", + " )\n", + " raw = resp[\"response\"].read().decode(\"utf-8\")\n", + " parts = []\n", + " for line in raw.splitlines():\n", + " if line.startswith(\"data: \"):\n", + " chunk = line[len(\"data: \"):]\n", + " try:\n", + " chunk = json.loads(chunk)\n", + " except Exception:\n", + " pass\n", + " parts.append(str(chunk))\n", + " return \"\".join(parts) if parts else raw\n", + "\n", + "\n", + "def run_session(turns: list[str], session_prefix: str) -> str:\n", + " \"\"\"Invoke a multi-turn session and return its session ID.\"\"\"\n", + " session_id = f\"{session_prefix}-{uuid.uuid4()}\"\n", + " print(f\"Session: {session_id}\")\n", + " for turn_input in turns:\n", + " print(f\" > {turn_input[:70]}\")\n", + " response = invoke_agent(turn_input, session_id)\n", + " print(f\" < {response[:100]}\")\n", + " return session_id" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a7ca4e6c", + "metadata": {}, + "outputs": [], + "source": [ + "# --- Single-turn sessions ---\n", + "\n", + "print(\"=== Single-Turn Sessions ===\")\n", + "\n", + "session_pto_balance = run_session(\n", + " [\"What is the current PTO balance for employee EMP-001?\"],\n", + " \"pto-balance-check\"\n", + ")\n", + "\n", + "session_submit_pto = 
run_session(\n", + " [\"Please submit a PTO request for employee EMP-001 from 2026-04-14 to 2026-04-16 for a family vacation.\"],\n", + " \"submit-pto-request\"\n", + ")\n", + "\n", + "session_pay_stub = run_session(\n", + " [\"Can you pull up the January 2026 pay stub for employee EMP-001?\"],\n", + " \"pay-stub-lookup\"\n", + ")\n", + "\n", + "print(\"\\nSingle-turn sessions created.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c603619", + "metadata": {}, + "outputs": [], + "source": [ + "# --- Multi-turn session: PTO planning ---\n", + "\n", + "print(\"=== Multi-Turn Session: PTO Planning ===\")\n", + "\n", + "session_pto_planning = run_session(\n", + " [\n", + " \"How many PTO days do I have left? My employee ID is EMP-001.\",\n", + " \"Great. I'd like to take December 23 to December 25 off. Please submit a request.\",\n", + " \"Remind me — what is the policy on rolling over unused PTO?\",\n", + " ],\n", + " \"pto-planning-session\"\n", + ")\n", + "\n", + "print(\"\\nMulti-turn session created.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42ff04a5", + "metadata": {}, + "outputs": [], + "source": [ + "# --- Multi-turn session: New employee onboarding ---\n", + "\n", + "print(\"=== Multi-Turn Session: New Employee Onboarding ===\")\n", + "\n", + "session_onboarding = run_session(\n", + " [\n", + " \"I just joined the company. What is the remote work policy?\",\n", + " \"How much PTO do I get as a new employee?\",\n", + " \"What life insurance benefit does the company provide?\",\n", + " \"Can you check the current PTO balance for employee EMP-042?\",\n", + " ],\n", + " \"new-employee-onboarding\"\n", + ")\n", + "\n", + "print(\"\\nAll sessions created. 
Waiting 60s for CloudWatch log ingestion...\")\n", + "time.sleep(60)\n", + "print(\"Ready to evaluate.\")" + ] + }, + { + "cell_type": "markdown", + "id": "eval-client-md", + "metadata": {}, + "source": [ + "## Step 5: EvaluationClient — Evaluate Existing Sessions\n", + "\n", + "`EvaluationClient` is the right tool when you **already have agent sessions** logged in CloudWatch and you want to test them against your ground truth in an ad-hoc manner.\n", + "It looks up the agent's spans for a given `session_id` and runs evaluators against them. For these evaluations, you can pass in an expected response, assertions, and an expected trajectory. You can use built-in evaluators as well as custom evaluators.\n", + "\n", + "### Ground-Truth Reference Inputs\n", + "\n", + "`ReferenceInputs` lets you supply optional ground truth:\n", + "\n", + "| Field | Evaluators that use it | Description |\n", + "|---|---|---|\n", + "| `expected_response` | `Builtin.Correctness` | The ideal response text |\n", + "| `expected_trajectory` | `Builtin.TrajectoryExactOrderMatch`, `Builtin.TrajectoryInOrderMatch`, `Builtin.TrajectoryAnyOrderMatch` | Ordered list of tool names |\n", + "| `assertions` | `Builtin.GoalSuccessRate` | Free-text assertions the session should satisfy |\n", + "\n", + "Evaluators that don't need ground truth (`Helpfulness`, `ResponseRelevance`) can be included in the same call.\n", + "Each evaluator only reads the fields it needs." + ] + }, + { + "cell_type": "markdown", + "id": "4d583593", + "metadata": {}, + "source": [ + "## Create Custom (LLM-as-a-Judge) Evaluators\n", + "\n", + "In addition to built-in evaluators, you can define your own evaluation criteria using\n", - "**LLM-as-a-judge custom evaluators**. These accept natural language instructions that\n", + "**LLM-as-a-Judge custom evaluators**. 
These accept natural language instructions that\n", "can reference **ground truth placeholders** automatically substituted at evaluation time.\n", "\n", "### Ground truth placeholders\n", @@ -1225,45 +730,10 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "76hyptexblj", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-31T18:09:26.438149Z", - "iopub.status.busy": "2026-03-31T18:09:26.438051Z", - "iopub.status.idle": "2026-03-31T18:09:27.825403Z", - "shell.execute_reply": "2026-03-31T18:09:27.824620Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Creating HRResponseSimilarity (TRACE) ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " evaluatorId : HRResponseSimilarity_a007e092-v0ojZ8ARHR\n", - "\n", - "Creating HRAssertionChecker (SESSION) ...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " evaluatorId : HRAssertionChecker_a007e092-zUFaMBBhck\n", - "\n", - "Custom evaluators ready:\n", - " HRResponseSimilarity (TRACE) : HRResponseSimilarity_a007e092-v0ojZ8ARHR\n", - " HRAssertionChecker (SESSION) : HRAssertionChecker_a007e092-zUFaMBBhck\n" - ] - } - ], + "execution_count": null, + "id": "99775e89", + "metadata": {}, + "outputs": [], "source": [ "import uuid\n", "\n", @@ -1377,314 +847,9 @@ "print(f\" HRAssertionChecker (SESSION) : {CUSTOM_ASSERTION_CHECKER_ID}\")" ] }, - { - "cell_type": "markdown", - "id": "invoke-md", - "metadata": {}, - "source": [ - "## Step 4: Invoke the Agent to Generate Sessions\n", - "\n", - "Before we can evaluate, we need agent sessions with CloudWatch spans. We'll invoke the agent\n", - "for several scenarios and record the session IDs for use with `EvaluationClient`.\n", - "\n", - "Each session corresponds to one evaluation scenario." 
- ] - }, { "cell_type": "code", - "execution_count": 9, - "id": "invoke-helper", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-31T18:09:27.827457Z", - "iopub.status.busy": "2026-03-31T18:09:27.827332Z", - "iopub.status.idle": "2026-03-31T18:09:27.831125Z", - "shell.execute_reply": "2026-03-31T18:09:27.830586Z" - } - }, - "outputs": [], - "source": [ - "def invoke_agent(prompt: str, session_id: str) -> str:\n", - " \"\"\"Send a single prompt to the HR assistant and return its text response.\"\"\"\n", - " resp = agentcore_client.invoke_agent_runtime(\n", - " agentRuntimeArn=AGENT_ARN,\n", - " qualifier=\"DEFAULT\",\n", - " runtimeSessionId=session_id,\n", - " payload=json.dumps({\"prompt\": prompt}).encode(\"utf-8\"),\n", - " )\n", - " raw = resp[\"response\"].read().decode(\"utf-8\")\n", - " parts = []\n", - " for line in raw.splitlines():\n", - " if line.startswith(\"data: \"):\n", - " chunk = line[len(\"data: \"):]\n", - " try:\n", - " chunk = json.loads(chunk)\n", - " except Exception:\n", - " pass\n", - " parts.append(str(chunk))\n", - " return \"\".join(parts) if parts else raw\n", - "\n", - "\n", - "def run_session(turns: list[str], session_prefix: str) -> str:\n", - " \"\"\"Invoke a multi-turn session and return its session ID.\"\"\"\n", - " session_id = f\"{session_prefix}-{uuid.uuid4()}\"\n", - " print(f\"Session: {session_id}\")\n", - " for turn_input in turns:\n", - " print(f\" > {turn_input[:70]}\")\n", - " response = invoke_agent(turn_input, session_id)\n", - " print(f\" < {response[:100]}\")\n", - " return session_id" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "invoke-single", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-31T18:09:27.832388Z", - "iopub.status.busy": "2026-03-31T18:09:27.832293Z", - "iopub.status.idle": "2026-03-31T18:10:06.856305Z", - "shell.execute_reply": "2026-03-31T18:10:06.855331Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - 
"text": [ - "=== Single-Turn Sessions ===\n", - "Session: pto-balance-check-6174aaad-8137-428b-8b54-64aa650cbedf\n", - " > What is the current PTO balance for employee EMP-001?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"The current PTO balance for employee EMP-001 is 10 remaining days. They have a total of 15 days all\n", - "Session: submit-pto-request-3b5a7027-eedb-427b-ab71-357f5ea55393\n", - " > Please submit a PTO request for employee EMP-001 from 2026-04-14 to 20\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"Hi there! Your PTO request from 2026-04-14 to 2026-04-16 for a family vacation has been successfull\n", - "Session: pay-stub-lookup-739b4301-7fed-4486-922f-fae9b33cca3a\n", - " > Can you pull up the January 2026 pay stub for employee EMP-001?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"Here is the pay stub for employee EMP-001 for January 2026:\\n\\n- Gross Pay: $8,333.33\\n- Federal Ta\n", - "\n", - "Single-turn sessions created.\n" - ] - } - ], - "source": [ - "# --- Single-turn sessions ---\n", - "\n", - "print(\"=== Single-Turn Sessions ===\")\n", - "\n", - "session_pto_balance = run_session(\n", - " [\"What is the current PTO balance for employee EMP-001?\"],\n", - " \"pto-balance-check\"\n", - ")\n", - "\n", - "session_submit_pto = run_session(\n", - " [\"Please submit a PTO request for employee EMP-001 from 2026-04-14 to 2026-04-16 for a family vacation.\"],\n", - " \"submit-pto-request\"\n", - ")\n", - "\n", - "session_pay_stub = run_session(\n", - " [\"Can you pull up the January 2026 pay stub for employee EMP-001?\"],\n", - " \"pay-stub-lookup\"\n", - ")\n", - "\n", - "print(\"\\nSingle-turn sessions created.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "invoke-multi", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-31T18:10:06.861600Z", - "iopub.status.busy": 
"2026-03-31T18:10:06.861400Z", - "iopub.status.idle": "2026-03-31T18:10:13.485540Z", - "shell.execute_reply": "2026-03-31T18:10:13.485047Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "=== Multi-Turn Session: PTO Planning ===\n", - "Session: pto-planning-session-4d42b1e2-b0c3-43c5-9298-05ead1cf4822\n", - " > How many PTO days do I have left? My employee ID is EMP-001.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"Hi there! Based on your employee ID, you have 10 PTO days remaining. You've used 5 days out of your\n", - " > Great. I'd like to take December 23 to December 25 off. Please submit \n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"Your PTO request for December 23 to December 25 has been approved. Your request ID is PTO-2026-001.\n", - " > Remind me — what is the policy on rolling over unused PTO?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"According to Acme Corp's PTO policy, unused PTO up to 5 days rolls over to the next year. This mean\n", - "\n", - "Multi-turn session created.\n" - ] - } - ], - "source": [ - "# --- Multi-turn session: PTO planning ---\n", - "\n", - "print(\"=== Multi-Turn Session: PTO Planning ===\")\n", - "\n", - "session_pto_planning = run_session(\n", - " [\n", - " \"How many PTO days do I have left? My employee ID is EMP-001.\",\n", - " \"Great. I'd like to take December 23 to December 25 off. 
Please submit a request.\",\n", - " \"Remind me — what is the policy on rolling over unused PTO?\",\n", - " ],\n", - " \"pto-planning-session\"\n", - ")\n", - "\n", - "print(\"\\nMulti-turn session created.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "invoke-onboard", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-31T18:10:13.487685Z", - "iopub.status.busy": "2026-03-31T18:10:13.487505Z", - "iopub.status.idle": "2026-03-31T18:11:21.679433Z", - "shell.execute_reply": "2026-03-31T18:11:21.678254Z" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "=== Multi-Turn Session: New Employee Onboarding ===\n", - "Session: new-employee-onboarding-1b08d214-9233-4050-a550-095962aa53f3\n", - " > I just joined the company. What is the remote work policy?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"Here is the remote work policy for Acme Corp:\\n\\n**Remote Work Policy**\\n- Employees may work remot\n", - " > How much PTO do I get as a new employee?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"It appears there was an error because the employee ID \\\"EMP-NEW\\\" does not exist in the system. Cou\n", - " > What life insurance benefit does the company provide?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"Here is the summary of the life insurance benefit provided by Acme Corp:\\n\\n**Life Insurance**\\n- B\n", - " > Can you check the current PTO balance for employee EMP-042?\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - " < \"The current PTO balance for employee EMP-042 is:\\n\\n- **Total days**: 20\\n- **Used days**: 7\\n- **R\n", - "\n", - "All sessions created. 
Waiting 60s for CloudWatch log ingestion...\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ready to evaluate.\n" - ] - } - ], - "source": [ - "# --- Multi-turn session: New employee onboarding ---\n", - "\n", - "print(\"=== Multi-Turn Session: New Employee Onboarding ===\")\n", - "\n", - "session_onboarding = run_session(\n", - " [\n", - " \"I just joined the company. What is the remote work policy?\",\n", - " \"How much PTO do I get as a new employee?\",\n", - " \"What life insurance benefit does the company provide?\",\n", - " \"Can you check the current PTO balance for employee EMP-042?\",\n", - " ],\n", - " \"new-employee-onboarding\"\n", - ")\n", - "\n", - "print(\"\\nAll sessions created. Waiting 60s for CloudWatch log ingestion...\")\n", - "time.sleep(60)\n", - "print(\"Ready to evaluate.\")" - ] - }, - { - "cell_type": "markdown", - "id": "eval-client-md", - "metadata": {}, - "source": [ - "## Step 5: EvaluationClient — Evaluate Existing Sessions\n", - "\n", - "`EvaluationClient` is the right tool when you **already have agent sessions** recorded in CloudWatch.\n", - "It looks up the agent's spans for a given `session_id` and runs evaluators against them.\n", - "No agent re-invocation occurs.\n", - "\n", - "### Ground-Truth Reference Inputs\n", - "\n", - "`ReferenceInputs` lets you supply optional ground truth:\n", - "\n", - "| Field | Evaluators that use it | Description |\n", - "|---|---|---|\n", - "| `expected_response` | `Builtin.Correctness` | The ideal response text |\n", - "| `expected_trajectory` | `Builtin.TrajectoryExactOrderMatch`, `Builtin.TrajectoryInOrderMatch`, `Builtin.TrajectoryAnyOrderMatch` | Ordered list of tool names |\n", - "| `assertions` | `Builtin.GoalSuccessRate` | Free-text assertions the session should satisfy |\n", - "\n", - "Evaluators that don't need ground truth (`Helpfulness`, `ResponseRelevance`) can be included in the same call.\n", - "Each evaluator only reads the fields it needs." 
- ] - }, - { - "cell_type": "code", - "execution_count": 13, + "execution_count": null, "id": "eval-client-init", "metadata": { "execution": { @@ -1694,28 +859,18 @@ "shell.execute_reply": "2026-03-31T18:11:21.746884Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "EvaluationClient initialised (region=us-east-1)\n", - " HRResponseSimilarity_a007e092-v0ojZ8ARHR → TRACE (custom: HRResponseSimilarity)\n", - " HRAssertionChecker_a007e092-zUFaMBBhck → SESSION (custom: HRAssertionChecker)\n" - ] - } - ], + "outputs": [], "source": [ "from bedrock_agentcore.evaluation import EvaluationClient, ReferenceInputs\n", "\n", "eval_client = EvaluationClient(region_name=REGION)\n", "\n", - "# Seed the evaluator level cache with custom evaluator IDs so the client\n", - "# doesn't need to call get_evaluator for them (BUG-001 workaround).\n", - "eval_client._evaluator_level_cache.update({\n", - " CUSTOM_RESPONSE_SIMILARITY_ID: \"TRACE\",\n", - " CUSTOM_ASSERTION_CHECKER_ID: \"SESSION\",\n", - "})\n", + "# # Set the evaluator level cache with custom evaluator IDs so the client\n", + "# # doesn't need to call get_evaluator for them\n", + "# eval_client._evaluator_level_cache.update({\n", + "# CUSTOM_RESPONSE_SIMILARITY_ID: \"TRACE\",\n", + "# CUSTOM_ASSERTION_CHECKER_ID: \"SESSION\",\n", + "# })\n", "\n", "print(f\"EvaluationClient initialised (region={REGION})\")\n", "print(f\" {CUSTOM_RESPONSE_SIMILARITY_ID} → TRACE (custom: HRResponseSimilarity)\")\n", @@ -1724,7 +879,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "print-helper", "metadata": { "execution": { @@ -1736,6 +891,7 @@ }, "outputs": [], "source": [ + "# Helper function for printing\n", "def display_eval_results(label: str, results: list) -> None:\n", " \"\"\"Pretty-print EvaluationClient results as a markdown table.\"\"\"\n", " rows = [\"| Evaluator | Value | Label | Explanation |\",\n", @@ -1772,7 +928,7 @@ }, { "cell_type": "code", - 
"execution_count": 15, + "execution_count": null, "id": "ec-pto-balance", "metadata": { "execution": { @@ -1782,31 +938,11 @@ "shell.execute_reply": "2026-03-31T18:11:43.659651Z" } }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### PTO Balance — Correctness + Quality + Custom ResponseSimilarity\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.Correctness` | 1.0 | Correct | The agent response provides all the core factual information present in the expected response: EMP-001 has 10 remaining |\n", - "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is clear and straightforward: to obtain the current PTO balance for employee EMP-001. The assistant's re |\n", - "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked for the current PTO balance for employee EMP-001. The tool output provided detailed information showing: |\n", - "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response and the expected response convey the same key facts: Employee EMP-001 has 10 remaining PTO days out |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "pto_balance_results = eval_client.run(\n", " evaluator_ids=[\n", - " \"Builtin.Correctness\", # TRACE: built-in factual accuracy\n", + " \"Builtin.Correctness\", # TRACE: compares with provided expected response\n", " \"Builtin.Helpfulness\", # TRACE: no ground truth needed\n", " \"Builtin.ResponseRelevance\", # TRACE: no ground truth needed\n", " CUSTOM_RESPONSE_SIMILARITY_ID, # TRACE: custom — uses {assistant_turn} + {expected_response}\n", @@ -1837,7 +973,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "ec-submit-pto", "metadata": { "execution": { @@ -1847,28 +983,7 @@ "shell.execute_reply": "2026-03-31T18:11:58.887479Z" } }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### PTO 
Submission — Built-in + Custom ResponseSimilarity\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.GoalSuccessRate` | 1.0 | Yes | The agent successfully completed all three success assertions: 1) The tool execution history shows the agent called `sub |\n", - "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['submit_pto_request'] matches expected trajectory ['submit_pto_request'] |\n", - "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['submit_pto_request'] found in actual ['submit_pto_request'] |\n", - "| `Builtin.Correctness` | 1.0 | Correct | The agent response correctly conveys all the core factual information from the expected response: (1) PTO request was su |\n", - "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response includes all key facts from the expected response: the employee ID (EMP-001), the start and end dat |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "submit_pto_results = eval_client.run(\n", " evaluator_ids=[\n", @@ -1908,7 +1023,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "ec-pay-stub", "metadata": { "execution": { @@ -1918,25 +1033,7 @@ "shell.execute_reply": "2026-03-31T18:12:01.773612Z" } }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### Pay Stub Lookup — Correctness + GoalSuccessRate\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.Correctness` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 90453e5cd76a35d1 and name: invoke_agent Strands Agents is missing a corre |\n", - "| `Builtin.GoalSuccessRate` | N/A | ERR:LogEventMissingException | Session span data is incomplete. 
Span with ID: 90453e5cd76a35d1 and name: invoke_agent Strands Agents is missing a corre |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "pay_stub_results = eval_client.run(\n", " evaluator_ids=[\n", @@ -1977,7 +1074,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "ec-multi-pto", "metadata": { "execution": { @@ -1987,31 +1084,7 @@ "shell.execute_reply": "2026-03-31T18:12:06.099869Z" } }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### PTO Planning — Multi-Turn (3 turns) + Custom AssertionChecker\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.GoalSuccessRate` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", - "| `Builtin.TrajectoryExactOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", - "| `Builtin.TrajectoryInOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", - "| `Builtin.TrajectoryAnyOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", - "| `Builtin.Helpfulness` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", - "| `Builtin.Helpfulness` | N/A | ERR:LogEventMissingException | Session span data is incomplete. 
Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", - "| `Builtin.Helpfulness` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |\n", - "| `HRAssertionChecker_a007e092-zUFaMBBhck` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 49b4e5ca7bfadb3b and name: invoke_agent Strands Agents is missing a corre |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "pto_planning_results = eval_client.run(\n", " evaluator_ids=[\n", @@ -2038,78 +1111,6 @@ "display_eval_results(\"PTO Planning — Multi-Turn (3 turns) + Custom AssertionChecker\", pto_planning_results)" ] }, - { - "cell_type": "markdown", - "id": "ec-onboard-md", - "metadata": {}, - "source": [ - "### 5e. Multi-Turn: New Employee Onboarding (4 turns)\n", - "\n", - "This scenario checks that the agent correctly identifies which tool to use for each type of question\n", - "(policy lookup vs. benefits lookup vs. PTO balance check) across a realistic onboarding conversation." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "ec-onboard", - "metadata": { - "execution": { - "iopub.execute_input": "2026-03-31T18:12:06.102151Z", - "iopub.status.busy": "2026-03-31T18:12:06.101988Z", - "iopub.status.idle": "2026-03-31T18:12:09.646290Z", - "shell.execute_reply": "2026-03-31T18:12:09.642775Z" - } - }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### New Employee Onboarding — Multi-Turn Session (4 turns)\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.GoalSuccessRate` | N/A | ERR:LogEventMissingException | Session span data is incomplete. 
Span with ID: 9b7976374cd80134 and name: invoke_agent Strands Agents is missing a corre |\n", - "| `Builtin.TrajectoryAnyOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 9b7976374cd80134 and name: invoke_agent Strands Agents is missing a corre |\n", - "| `Builtin.TrajectoryExactOrderMatch` | N/A | ERR:LogEventMissingException | Session span data is incomplete. Span with ID: 9b7976374cd80134 and name: invoke_agent Strands Agents is missing a corre |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "onboarding_results = eval_client.run(\n", - " evaluator_ids=[\n", - " \"Builtin.GoalSuccessRate\",\n", - " \"Builtin.TrajectoryAnyOrderMatch\",\n", - " \"Builtin.TrajectoryExactOrderMatch\",\n", - " ],\n", - " session_id=session_onboarding,\n", - " agent_id=AGENT_ID,\n", - " look_back_time=timedelta(hours=2),\n", - " reference_inputs=ReferenceInputs(\n", - " expected_trajectory=[\n", - " \"lookup_hr_policy\", # turn 1: remote work policy\n", - " \"lookup_hr_policy\", # turn 2: PTO policy\n", - " \"get_benefits_summary\", # turn 3: life insurance\n", - " \"get_pto_balance\", # turn 4: EMP-042 balance\n", - " ],\n", - " assertions=[\n", - " \"Agent looked up the remote work policy in turn 1 and mentioned 3 days per week\",\n", - " \"Agent looked up the PTO policy in turn 2 and mentioned 15 days for new employees\",\n", - " \"Agent looked up life insurance benefits in turn 3 and mentioned 2x annual salary\",\n", - " \"Agent called get_pto_balance for EMP-042 in turn 4 and reported 13 remaining days\",\n", - " ],\n", - " ),\n", - ")\n", - "\n", - "display_eval_results(\"New Employee Onboarding — Multi-Turn Session (4 turns)\", onboarding_results)" - ] - }, { "cell_type": "markdown", "id": "runner-md", @@ -2128,8 +1129,8 @@ "\n", "A dataset consists of **scenarios**, each with one or more **turns**. 
Optional ground-truth fields:\n", "- `Turn.expected_response` — per-turn expected answer\n", - "- `TurnByTurnScenario.expected_trajectory` — ordered list of tool names\n", - "- `TurnByTurnScenario.assertions` — session-level assertions\n", + "- `PreDefinedScenario.expected_trajectory` — ordered list of tool names\n", + "- `PreDefinedScenario.assertions` — session-level assertions\n", "\n", "### How OnDemandEvaluationDatasetRunner works\n", "\n", @@ -2145,7 +1146,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "id": "runner-imports", "metadata": { "execution": { @@ -2172,7 +1173,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "runner-invoker", "metadata": { "execution": { @@ -2230,7 +1231,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "id": "runner-dataset", "metadata": { "execution": { @@ -2240,15 +1241,7 @@ "shell.execute_reply": "2026-03-31T18:12:09.670266Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset contains 5 scenarios.\n" - ] - } - ], + "outputs": [], "source": [ "dataset = Dataset(\n", " scenarios=[\n", @@ -2359,7 +1352,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "id": "runner-config", "metadata": { "execution": { @@ -2369,18 +1362,7 @@ "shell.execute_reply": "2026-03-31T18:12:09.683225Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OnDemandEvaluationDatasetRunner configured. 
Starting evaluation...\n", - " Scenarios : 5\n", - " Evaluators: 9 (7 built-in + 2 custom)\n", - " Delay : 180s (waiting for CloudWatch ingestion)\n" - ] - } - ], + "outputs": [], "source": [ "# Span collector: polls CloudWatch for OTel spans emitted by the agent\n", "span_collector = CloudWatchAgentSpanCollector(\n", @@ -2397,12 +1379,8 @@ " \"Builtin.TrajectoryInOrderMatch\": \"SESSION\",\n", " \"Builtin.TrajectoryAnyOrderMatch\": \"SESSION\",\n", " \"Builtin.Correctness\": \"TRACE\",\n", - " \"Builtin.Helpfulness\": \"TRACE\",\n", - " \"Builtin.ResponseRelevance\": \"TRACE\",\n", - " \"Builtin.Coherence\": \"TRACE\",\n", - " \"Builtin.InstructionFollowing\": \"TRACE\",\n", "}\n", - "# Custom evaluators (IDs are runtime values from Step 3b)\n", + "# Custom evaluators (These are the custom evaluators we created for HR Response Similarity and HRAssertionChecker)\n", "EVALUATOR_LEVELS[CUSTOM_RESPONSE_SIMILARITY_ID] = \"TRACE\"\n", "EVALUATOR_LEVELS[CUSTOM_ASSERTION_CHECKER_ID] = \"SESSION\"\n", "\n", @@ -2414,9 +1392,7 @@ " \"Builtin.GoalSuccessRate\", # SESSION — assertions\n", " \"Builtin.TrajectoryExactOrderMatch\", # SESSION — expected_trajectory\n", " \"Builtin.TrajectoryInOrderMatch\", # SESSION — expected_trajectory\n", - " \"Builtin.TrajectoryAnyOrderMatch\", # SESSION — expected_trajectory\n", - " \"Builtin.Helpfulness\", # TRACE — no ground truth\n", - " \"Builtin.ResponseRelevance\", # TRACE — no ground truth\n", + " \"Builtin.TrajectoryAnyOrderMatch\", # SESSION — expected_trajectory \n", " CUSTOM_RESPONSE_SIMILARITY_ID, # TRACE (custom) — {assistant_turn} + {expected_response}\n", " CUSTOM_ASSERTION_CHECKER_ID, # SESSION (custom) — {actual_tool_trajectory} + {assertions}\n", " ]\n", @@ -2437,7 +1413,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "id": "runner-run", "metadata": { "execution": { @@ -2447,16 +1423,7 @@ "shell.execute_reply": "2026-03-31T19:07:11.806186Z" } }, - "outputs": [ - { - "name": "stdout", - 
"output_type": "stream", - "text": [ - "\n", - "Evaluation complete: 5 completed, 0 failed out of 5 scenarios.\n" - ] - } - ], + "outputs": [], "source": [ "# Run the evaluation.\n", "# OnDemandEvaluationDatasetRunner will:\n", @@ -2487,7 +1454,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "id": "runner-results", "metadata": { "execution": { @@ -2497,136 +1464,7 @@ "shell.execute_reply": "2026-03-31T19:07:11.834370Z" } }, - "outputs": [ - { - "data": { - "text/markdown": [ - "### Scenario: `pto-balance-check`\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.Correctness` | 1.0 | Correct | The agent response provides all the core factual information present in the expected response: employee EMP-001 has 15 total PTO d |\n", - "| `Builtin.GoalSuccessRate` | 1.0 | Yes | The conversation record shows that the agent called the 'get_pto_balance' tool with the parameter 'employee_id': 'EMP-001', which |\n", - "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['get_pto_balance'] matches expected trajectory ['get_pto_balance'] |\n", - "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['get_pto_balance'] found in order within actual ['get_pto_balance'] |\n", - "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['get_pto_balance'] found in actual ['get_pto_balance'] |\n", - "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is clear and straightforward: to obtain the current PTO balance for employee EMP-001. The assistant successfully r |\n", - "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked for the current PTO balance for employee EMP-001. 
The LLM response directly addresses this question by providing th |\n", - "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response includes all key facts present in the expected response: the total number of PTO days (15), the number of use |\n", - "| `HRAssertionChecker_a007e092-zUFaMBBhck` | 1.0 | passed | The agent followed the expected tool trajectory by calling 'get_pto_balance'. Additionally, the agent satisfied the assertion that |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Scenario: `pto-policy-lookup`\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.Correctness` | 1.0 | Correct | The agent response includes all the core factual information from the expected response: (1) full-time employees accrue 15 days of |\n", - "| `Builtin.GoalSuccessRate` | 1.0 | Yes | Evaluating the three success assertions: 1. **Agent called lookup_hr_policy with topic=pto**: The tool execution history shows th |\n", - "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['lookup_hr_policy'] matches expected trajectory ['lookup_hr_policy'] |\n", - "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['lookup_hr_policy'] found in order within actual ['lookup_hr_policy'] |\n", - "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['lookup_hr_policy'] found in actual ['lookup_hr_policy'] |\n", - "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is clear and straightforward: to learn about the company's PTO policy. The assistant's response directly addresses |\n", - "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked 'What is the company PTO policy?' 
The LLM response directly addresses this question by providing the complete PTO p |\n", - "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 0.5 | partially_similar | The agent's response includes all the key facts from the expected response: full-time employees accrue 15 days of PTO per year, re |\n", - "| `HRAssertionChecker_a007e092-zUFaMBBhck` | 1.0 | passed | The agent followed the expected tool trajectory by calling 'lookup_hr_policy' with the topic 'pto'. Additionally, the agent mentio |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Scenario: `401k-info`\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.Correctness` | 1.0 | Correct | The agent response accurately conveys all the core factual information from the expected response: (1) 100% match up to 4% of sala |\n", - "| `Builtin.GoalSuccessRate` | 1.0 | Yes | Evaluating the three success assertions: 1. \"Agent called get_benefits_summary with benefit_type=401k\" - The tool execution histo |\n", - "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['get_benefits_summary'] matches expected trajectory ['get_benefits_summary'] |\n", - "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['get_benefits_summary'] found in order within actual ['get_benefits_summary'] |\n", - "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['get_benefits_summary'] found in actual ['get_benefits_summary'] |\n", - "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is clearly to understand how the 401k match works at their company. The assistant's response directly addresses th |\n", - "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked 'How does the 401k match work?' 
The LLM response directly addresses this question by explaining the 401k matching s |\n", - "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response includes all key facts from the expected response, such as the company's matching contribution of 100% up to |\n", - "| `HRAssertionChecker_a007e092-zUFaMBBhck` | 1.0 | passed | The agent followed the expected tool trajectory by calling 'get_benefits_summary'. It also satisfied all assertions by correctly d |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Scenario: `check-and-submit-pto`\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.Correctness` | 1.0 | Correct | The agent response contains all the core factual information from the expected response: (1) EMP-002 has 3 remaining PTO days, (2) |\n", - "| `Builtin.GoalSuccessRate` | 1.0 | Yes | The agent successfully completed all three assertions: (1) First called get_pto_balance for EMP-002, which returned 3 remaining da |\n", - "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['get_pto_balance', 'submit_pto_request'] matches expected trajectory ['get_pto_balance', 'submit_p |\n", - "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['get_pto_balance', 'submit_pto_request'] found in order within actual ['get_pto_balance', 'sub |\n", - "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['get_pto_balance', 'submit_pto_request'] found in actual ['get_pto_balance', 'submit_pto_requ |\n", - "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal was clear: check if EMP-002 has at least 2 days of PTO balance, and if so, submit a request for the specified date |\n", - "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user's question asks to: (1) check the PTO balance for 
EMP-002, and (2) if they have at least 2 days, submit a request for 202 |\n", - "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 0.5 | partially_similar | The agent's response includes all the key information from the expected response: EMP-002 has 3 remaining PTO days, and the PTO re |\n", - "| `HRAssertionChecker_a007e092-zUFaMBBhck` | N/A | ERR:ValueError | No score found in evaluation result |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/markdown": [ - "### Scenario: `benefits-exploration`\n", - "\n", - "| Evaluator | Value | Label | Explanation |\n", - "|---|---|---|---|\n", - "| `Builtin.Correctness` | 1.0 | Correct | The agent response contains all the core factual information from the expected response: (1) the company covers 90% of premiums fo |\n", - "| `Builtin.Correctness` | 1.0 | Correct | The agent response contains all the core factual information from the expected response: 100% coverage for preventive care, 80% fo |\n", - "| `Builtin.Correctness` | 1.0 | Correct | The agent response correctly conveys the core factual information about the company's 401k contribution. It states that the compan |\n", - "| `Builtin.GoalSuccessRate` | 1.0 | Yes | Evaluating the three success assertions: 1. 
**Agent called get_benefits_summary three times across the conversation**: The tool e |\n", - "| `Builtin.TrajectoryExactOrderMatch` | 1.0 | Yes | Exact match: Actual trajectory ['get_benefits_summary', 'get_benefits_summary', 'get_benefits_summary'] matches expected trajector |\n", - "| `Builtin.TrajectoryInOrderMatch` | 1.0 | Yes | In-order match: All expected tools ['get_benefits_summary', 'get_benefits_summary', 'get_benefits_summary'] found in order within |\n", - "| `Builtin.TrajectoryAnyOrderMatch` | 1.0 | Yes | Any-order match: All expected tools ['get_benefits_summary', 'get_benefits_summary', 'get_benefits_summary'] found in actual ['get |\n", - "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is to understand the health insurance options available to them. The assistant's response directly addresses this |\n", - "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is to understand health insurance options at Acme Corp, and they've now expanded their inquiry to include dental b |\n", - "| `Builtin.Helpfulness` | 0.83 | Very Helpful | The user's goal is to understand employee benefits at Acme Corp, specifically asking about 401k company contributions. The assista |\n", - "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user asked to be walked through the health insurance options. The LLM response directly addresses this question by providing a |\n", - "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user's question is 'What about dental?' which is a follow-up question asking about dental insurance options. The LLM response |\n", - "| `Builtin.ResponseRelevance` | 1.0 | Completely Yes | The user's question is: 'And how much does the company contribute to the 401k?' The LLM response provides information about the 40 |\n", - "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response includes all the key facts mentioned in the expected response. 
It accurately states the premium coverage perc |\n", - "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response provides a comprehensive summary of the dental insurance options, including coverage details, annual maximum |\n", - "| `HRResponseSimilarity_a007e092-v0ojZ8ARHR` | 1.0 | highly_similar | The agent's response provides a detailed summary that includes all the key facts mentioned in the expected response. It correctly |\n", - "| `HRAssertionChecker_a007e092-zUFaMBBhck` | 1.0 | passed | The agent followed the expected tool trajectory exactly as specified, calling 'get_benefits_summary' three times. Additionally, th |" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "def display_runner_results(eval_result) -> None:\n", " \"\"\"Display OnDemandEvaluationDatasetRunner results as a markdown table per scenario.\"\"\"\n", @@ -2657,7 +1495,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "id": "runner-summary", "metadata": { "execution": { @@ -2667,26 +1505,7 @@ "shell.execute_reply": "2026-03-31T19:07:11.840171Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Evaluator Summary (average score across all scenarios)\n", - "============================================================\n", - " Builtin.Correctness avg=1.00 (n=7)\n", - " Builtin.GoalSuccessRate avg=1.00 (n=5)\n", - " Builtin.Helpfulness avg=0.83 (n=7)\n", - " Builtin.ResponseRelevance avg=1.00 (n=7)\n", - " Builtin.TrajectoryAnyOrderMatch avg=1.00 (n=5)\n", - " Builtin.TrajectoryExactOrderMatch avg=1.00 (n=5)\n", - " Builtin.TrajectoryInOrderMatch avg=1.00 (n=5)\n", - " HRAssertionChecker_a007e092-zUFaMBBhck avg=1.00 (n=4)\n", - " HRResponseSimilarity_a007e092-v0ojZ8ARHR avg=0.86 (n=7)\n" - ] - } - ], + "outputs": [], "source": [ "# Aggregate summary: average score per evaluator across all scenarios\n", "from 
collections import defaultdict\n", @@ -2717,7 +1536,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "save-results", "metadata": { "execution": { @@ -2727,23 +1546,7 @@ "shell.execute_reply": "2026-03-31T19:07:11.850695Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Results saved to: results/groundtruth_eval_20260331_190711.json\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/var/folders/9h/jk764rms7493dnsp6_d37wqm0000gq/T/ipykernel_96692/3226257259.py:5: DeprecationWarning: datetime.datetime.utcnow() is deprecated and scheduled for removal in a future version. Use timezone-aware objects to represent datetimes in UTC: datetime.datetime.now(datetime.UTC).\n", - " timestamp = datetime.utcnow().strftime(\"%Y%m%d_%H%M%S\")\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "from datetime import datetime\n", @@ -2770,7 +1573,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "id": "cleanup", "metadata": { "execution": { @@ -2780,15 +1583,7 @@ "shell.execute_reply": "2026-03-31T19:07:11.854895Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cleanup skipped. 
Uncomment the cell above to delete the agent runtime.\n" - ] - } - ], + "outputs": [], "source": [ "# Uncomment to delete the agent runtime\n", "# agent_runtime.delete()\n", @@ -2807,7 +1602,6 @@ "| | EvaluationClient | OnDemandEvaluationDatasetRunner |\n", "|---|---|---|\n", "| **When to use** | You have existing sessions | You have a test dataset |\n", - "| **Agent invocation** | Not included | Automatic |\n", "| **Best for** | Post-hoc analysis, debugging | Regression testing, CI/CD |\n", "| **Input** | session_id | Dataset of scenarios |\n", "\n", @@ -2820,9 +1614,7 @@ "| `Builtin.TrajectoryExactOrderMatch` | SESSION | `expected_trajectory` |\n", "| `Builtin.TrajectoryInOrderMatch` | SESSION | `expected_trajectory` |\n", "| `Builtin.TrajectoryAnyOrderMatch` | SESSION | `expected_trajectory` |\n", - "| `Builtin.Helpfulness` | TRACE | None |\n", - "| `Builtin.ResponseRelevance` | TRACE | None |\n", - "| `Builtin.Coherence` | TRACE | None |\n", + "\n", "\n", "### Custom evaluator ground truth placeholders\n", "\n", @@ -2838,11 +1630,17 @@ "| SESSION | `{assertions}` | `ReferenceInputs.assertions` |\n", "| SESSION | `{available_tools}` | Tools available to the agent |" ] + }, + { + "cell_type": "markdown", + "id": "1932ba98", + "metadata": {}, + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", "language": "python", "name": "python3" }, From bd11377036d408603e7be01d192f9761f72e208d Mon Sep 17 00:00:00 2001 From: Bharathi Srinivasan Date: Tue, 31 Mar 2026 15:26:35 -0700 Subject: [PATCH 3/3] drop .py script, agent script is created at notebook runtime --- .../.gitignore | 5 + .../README.md | 416 ++++++++++++++---- .../groundtruth_evaluations.ipynb | 283 +----------- .../hr_assistant_agent.py | 268 ----------- 4 files changed, 337 insertions(+), 635 deletions(-) create mode 100644 01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/.gitignore delete mode 100644 
01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/.gitignore b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/.gitignore new file mode 100644 index 000000000..b17e1aefb --- /dev/null +++ b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/.gitignore @@ -0,0 +1,5 @@ +# Generated by the %%writefile cell in groundtruth_evaluations.ipynb +hr_assistant_agent.py + +# Generated by bedrock-agentcore-starter-toolkit on first deploy +.bedrock_agentcore.yaml diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md index d7af7fe4a..c4a34dd64 100644 --- a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md +++ b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/README.md @@ -1,89 +1,233 @@ -# Ground Truth Evaluations — EvaluationClient and EvaluationRunner +# Ground Truth Evaluations with Custom Evaluators -## Overview +## Introduction -This tutorial demonstrates end-to-end evaluation of an agentic application using Amazon Bedrock AgentCore's two primary evaluation interfaces: **EvaluationClient** and **OnDemandEvaluationDatasetRunner**. Both are used with ground-truth reference inputs to measure factual correctness, goal achievement, and tool-use accuracy. +This tutorial demonstrates end-to-end evaluation of an agentic application using +[**Amazon Bedrock AgentCore Evaluations**](https://docs.aws.amazon.com/bedrock-agentcore/latest/devguide/evaluations.html) with ground-truth reference inputs. 
It covers +the two primary evaluation interfaces — `EvaluationClient` and +`OnDemandEvaluationDatasetRunner` — and shows how to create **custom LLM-as-a-judge +evaluators** that use ground-truth placeholders to tailor scoring criteria to your +application domain. -The tutorial uses an **HR Assistant agent** for Acme Corp — a Strands agent that helps employees with PTO management, HR policy lookups, benefits information, and pay stubs. +The tutorial deploys an **HR Assistant agent** for Acme Corp — a +[Strands Agents](https://strandsagents.com/) application that helps employees with PTO +management, HR policy lookups, benefits information, and pay stub retrieval. Its tools +return deterministic mock data, making evaluation results fully reproducible. -## What You'll Learn +### Key concepts covered -- How to deploy a Strands agent to AgentCore Runtime -- When to use `EvaluationClient` vs `EvaluationRunner` -- How to evaluate existing sessions with ground-truth `ReferenceInputs` -- How to define an evaluation dataset with `TurnByTurnScenario` and `Turn` -- How to run automated dataset evaluations with `EvaluationRunner` -- How to interpret built-in evaluator results for trajectory, correctness, and goal-success metrics -- How to create **custom LLM-as-a-judge evaluators** with ground-truth placeholders +| Concept | Description | +|---|---| +| `EvaluationClient` | Evaluate specific existing CloudWatch sessions against ground-truth references | +| `OnDemandEvaluationDatasetRunner` | Define a test dataset, auto-invoke the agent per scenario, and evaluate the results | +| `ReferenceInputs` | Supply `expected_response`, `expected_trajectory`, and `assertions` as ground truth | +| Custom evaluators | Create LLM-as-a-judge evaluators with domain-specific instructions and ground-truth placeholders | + + +> **Further reading** +> - [Ground-truth evaluations — custom 
evaluators](https://docs.aws.amazon.com/bedrock-agentcore/latest/devguide/ground-truth-evaluations.html#gt-custom-evaluators) +> - [Dataset-based evaluations](https://docs.aws.amazon.com/bedrock-agentcore/latest/devguide/dataset-evaluations.html) + +--- + +## Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ Tutorial Notebook (groundtruth_evaluations.ipynb) │ +│ │ +│ Step 1 ──► bedrock-agentcore-starter-toolkit │ +│ │ CodeBuild builds image, pushes to ECR │ +│ └──► AgentCore Runtime (HR Assistant Agent) │ +│ │ invoke_agent_runtime() │ +│ Step 2 ──► bedrock-agentcore-control ──► Custom Evaluators │ +│ create_evaluator() │ +│ │ +│ Step 3 ──► AgentCore Runtime (generate sessions) │ +│ │ OTel spans ──► CloudWatch Logs │ +│ │ +│ Step 4 ──► EvaluationClient.run() │ +│ │ CloudWatchAgentSpanCollector reads spans │ +│ └──► Evaluate API ──► Built-in + Custom Evaluators │ +│ └──► Scores & Explanations │ +│ │ +│ Step 5 ──► OnDemandEvaluationDatasetRunner.run() │ +│ │ Invokes agent per scenario │ +│ │ Waits for CloudWatch ingestion │ +│ └──► Evaluate API ──► Built-in + Custom Evaluators │ +│ └──► Per-scenario Results │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +**Component roles** + +| Component | Role | +|---|---| +| AgentCore Runtime | Hosts the containerised HR Assistant, emits OTel spans to CloudWatch | +| CloudWatch Logs | Stores session spans; queried by `CloudWatchAgentSpanCollector` | +| `bedrock-agentcore-control` | Control plane — creates custom evaluators and agent runtimes | +| Evaluate API (`bedrock-agentcore`) | Data plane — scores sessions against evaluator definitions | +| Starter Toolkit | Builds the Docker image via CodeBuild and registers the runtime; no local Docker required | + +--- ## Prerequisites -Before running this tutorial, ensure you have: +- **Python 3.10+** with the packages in `requirements.txt` +- **AWS credentials** configured (e.g. 
via `aws configure` or environment variables) with + permissions for: + - `bedrock-agentcore:*` — invoke agent runtime and call Evaluate API + - `bedrock-agentcore:CreateAgentRuntime`, `UpdateAgentRuntime`, + `GetAgentRuntime`, `CreateEvaluator` — deploy agent and register evaluators + - `logs:FilterLogEvents`, `logs:DescribeLogGroups`, `logs:StartQuery`, + `logs:GetQueryResults` — read CloudWatch spans + - `ecr:GetAuthorizationToken`, `ecr:BatchCheckLayerAvailability`, + `ecr:InitiateLayerUpload`, `ecr:PutImage` — push container image + - `codebuild:StartBuild`, `codebuild:BatchGetBuilds` — image build via CodeBuild + - `iam:CreateRole`, `iam:AttachRolePolicy`, `iam:PassRole` — auto-create execution roles + - `s3:PutObject`, `s3:GetObject` — CodeBuild source upload +- **No local Docker required** — the starter toolkit builds the container image via + AWS CodeBuild -- Python 3.10+ -- AWS credentials with permissions for: - - AgentCore Runtime (`bedrock-agentcore:*`) - - AgentCore Evaluations (`bedrock-agentcore:Evaluate`) - - CloudWatch Logs (`logs:FilterLogEvents`, `logs:DescribeLogGroups`) - - ECR (`ecr:*`) - - IAM (for auto-creating the agent execution role) +Install dependencies: -## Files +```bash +pip install -r requirements.txt +``` -| File | Description | -|---|---| -| `groundtruth_evaluations.ipynb` | Main tutorial notebook (standalone, end-to-end) | -| `hr_assistant_agent.py` | HR Assistant Strands agent deployed to AgentCore Runtime | -| `requirements.txt` | Python dependencies for the agent container | +--- -## Tutorial Notebook +## Usage -### [groundtruth_evaluations.ipynb](groundtruth_evaluations.ipynb) +### Run the notebook -A single self-contained notebook that walks through the full evaluation workflow in 7 steps: +Open and run [`groundtruth_evaluations.ipynb`](groundtruth_evaluations.ipynb) top-to-bottom.
+The notebook is safe to re-run — re-running it updates the existing agent runtime and +creates fresh custom evaluators with a unique suffix to avoid naming conflicts. -| Step | Description | -|---|---| -| 1 | Install dependencies | -| 2 | Configure AWS session and region | -| 3a | Deploy the HR Assistant agent to AgentCore Runtime | -| 3b | **Create custom LLM-as-a-judge evaluators** with ground-truth placeholders | -| 4 | Invoke the agent to generate sessions with CloudWatch spans | -| 5 | **EvaluationClient** — evaluate existing sessions with ground truth | -| 6 | **OnDemandEvaluationDatasetRunner** — automated dataset evaluation | -| 7 | Cleanup | - -## EvaluationClient vs EvaluationRunner - -| | EvaluationClient | EvaluationRunner | -|---|---|---| -| **When to use** | You already have recorded sessions | You have a test dataset | -| **Input** | `session_id` + `agent_id` | `Dataset` of `TurnByTurnScenario` objects | -| **Best for** | Post-hoc analysis, debugging, incident investigation | Regression testing, CI/CD pipelines, batch evaluation | +```bash +jupyter notebook groundtruth_evaluations.ipynb +``` -## Ground-Truth Reference Inputs +Or execute non-interactively: -`ReferenceInputs` supplies optional ground truth to `EvaluationClient`.
Each field is consumed by specific evaluators: +```bash +jupyter nbconvert --to notebook --execute --inplace groundtruth_evaluations.ipynb +``` -| Field | Evaluators that use it | Description | +### Notebook walkthrough + +| Step | Cell(s) | What happens | +|---|---|---| +| **1 — Install** | `install` | Installs `bedrock-agentcore`, `strands-agents`, and other dependencies | +| **2 — Configure** | `setup` | Creates a boto3 session and sets `REGION` | +| **3a — Deploy agent** | `nn72gdo2s4h`, `deploy`, `wait-deploy`, `agent-config` | Writes `hr_assistant_agent.py`, builds image via CodeBuild, creates/updates the AgentCore Runtime, polls until `READY` | +| **3b — Create evaluators** | `76hyptexblj` | Creates `HRResponseSimilarity` (TRACE) and `HRAssertionChecker` (SESSION) custom evaluators via `bedrock-agentcore-control` | +| **4 — Invoke agent** | `invoke-single`, `invoke-multi`, `invoke-onboard` | Runs 5 sessions (single- and multi-turn), waits 60 s for CloudWatch ingestion | +| **5 — EvaluationClient** | `ec-*` | Evaluates each session by session ID using built-in and custom evaluators | +| **6 — DatasetRunner** | `runner-*` | Defines a 5-scenario dataset, invokes the agent per scenario, waits 180 s, evaluates all scenarios | +| **7 — Cleanup** | `cleanup` | (Commented out) Deletes the agent runtime | + +### Using `EvaluationClient` directly + +```python +from bedrock_agentcore.evaluation import EvaluationClient, ReferenceInputs +from datetime import timedelta + +ec = EvaluationClient(region_name="us-east-1") + +results = ec.run( + evaluator_ids=["Builtin.Correctness", "Builtin.GoalSuccessRate", MY_CUSTOM_EVAL_ID], + session_id="<session-id>", + agent_id="<agent-id>", + look_back_time=timedelta(hours=2), + reference_inputs=ReferenceInputs( + expected_response="Employee EMP-001 has 10 remaining PTO days.", + assertions=["Agent called get_pto_balance", "Agent reported 10 remaining days"], + expected_trajectory=["get_pto_balance"], + ), +) +``` + +### Using
`OnDemandEvaluationDatasetRunner` directly + +```python +from bedrock_agentcore.evaluation import ( + Dataset, PredefinedScenario, Turn, + EvaluationRunConfig, EvaluatorConfig, + OnDemandEvaluationDatasetRunner, + CloudWatchAgentSpanCollector, +) + +dataset = Dataset(scenarios=[ + PredefinedScenario( + scenario_id="pto-check", + turns=[Turn( + input="What is the PTO balance for EMP-001?", + expected_response="EMP-001 has 10 remaining PTO days.", + )], + expected_trajectory=["get_pto_balance"], + assertions=["Agent reported 10 remaining PTO days"], + ), +]) + +runner = OnDemandEvaluationDatasetRunner(region="us-east-1") +result = runner.run( + config=EvaluationRunConfig( + evaluator_config=EvaluatorConfig(evaluator_ids=["Builtin.Correctness"]), + evaluation_delay_seconds=180, + ), + dataset=dataset, + agent_invoker=my_invoker_fn, + span_collector=CloudWatchAgentSpanCollector(log_group_name=CW_LOG_GROUP, region="us-east-1"), +) +``` + +--- + +## Sample Prompts + +The following prompts are used in the notebook. They can also be sent directly to a +deployed HR Assistant to generate sessions for evaluation. 
+ +### Single-turn + +| Prompt | Expected tool | Expected outcome | |---|---|---| -| `expected_response` | `Builtin.Correctness` | The ideal response text for semantic comparison | -| `expected_trajectory` | `Builtin.TrajectoryExactOrderMatch`, `Builtin.TrajectoryInOrderMatch`, `Builtin.TrajectoryAnyOrderMatch` | Ordered list of tool names the agent should call | -| `assertions` | `Builtin.GoalSuccessRate` | Free-text assertions the session should satisfy | +| `What is the current PTO balance for employee EMP-001?` | `get_pto_balance` | 10 remaining days (15 total, 5 used) | +| `Please submit a PTO request for EMP-001 from 2026-04-14 to 2026-04-16 for a family vacation.` | `submit_pto_request` | Approved, request ID `PTO-2026-001` | +| `Can you pull up the January 2026 pay stub for employee EMP-001?` | `get_pay_stub` | Gross $8,333.33, net $5,362.50 | +| `What is the company PTO policy?` | `lookup_hr_policy` | 15 days/year, 2-day advance notice, 5-day rollover | +| `How does the 401k match work?` | `get_benefits_summary` | 100% match up to 4%, 50% on next 2%, 3-year vesting | +| `Check the PTO balance for EMP-002 and if they have at least 2 days, submit a request for 2026-05-26 to 2026-05-27.` | `get_pto_balance` → `submit_pto_request` | 3 days remaining → request approved | -Evaluators that don't require ground truth (`Builtin.Helpfulness`, `Builtin.ResponseRelevance`) can be included in the same call — each evaluator reads only the fields it needs. +### Multi-turn -The same fields apply to `PredefinedScenario` objects in `OnDemandEvaluationDatasetRunner` datasets. +**PTO planning (3 turns)** +1. `How many PTO days do I have left? My employee ID is EMP-001.` +2. `Great. I'd like to take December 23 to December 25 off. Please submit a request.` +3. `Remind me — what is the policy on rolling over unused PTO?` + +Expected trajectory: `get_pto_balance` → `submit_pto_request` → `lookup_hr_policy` + +**New employee onboarding (4 turns)** +1. `I just joined the company. 
What is the remote work policy?` +2. `How much PTO do I get as a new employee?` +3. `What life insurance benefit does the company provide?` +4. `Can you check the current PTO balance for employee EMP-042?` + +Expected trajectory: `lookup_hr_policy` → `lookup_hr_policy` → `get_benefits_summary` → `get_pto_balance` + +--- ## Custom Evaluators with Ground Truth -In addition to built-in evaluators, you can define **custom LLM-as-a-judge evaluators** with -evaluation criteria written in natural language. Custom evaluators support the same ground-truth -fields through **placeholders** that the service substitutes at evaluation time. +Custom evaluators let you define evaluation criteria in natural language. The service +substitutes **ground-truth placeholders** from `ReferenceInputs` before scoring. ### Placeholder reference -| Level | Placeholder | Filled from | +| Level | Placeholder | Populated from | |---|---|---| | TRACE | `{assistant_turn}` | Agent's actual response for that turn | | TRACE | `{expected_response}` | `ReferenceInputs.expected_response` | @@ -93,51 +237,149 @@ fields through **placeholders** that the service substitutes at evaluation time. | SESSION | `{assertions}` | `ReferenceInputs.assertions` | | SESSION | `{available_tools}` | Tools available to the agent | +### Creating a custom evaluator + +```python +import boto3, uuid + +cp = boto3.client("bedrock-agentcore-control", region_name="us-east-1") + +# Trace-level: response similarity using ground-truth placeholders +result = cp.create_evaluator( + evaluatorName=f"ResponseSimilarity_{uuid.uuid4().hex[:8]}", + level="TRACE", + evaluatorConfig={ + "llmAsAJudge": { + "instructions": ( + "Compare the agent's response with the expected response.\n" + "Agent response: {assistant_turn}\n" + "Expected response: {expected_response}\n\n" + "Rate how closely the responses match on a scale of 0 to 1." 
+ ), + "ratingScale": { + "numerical": [ + {"value": 0.0, "label": "not_similar", + "definition": "Response is factually different from expected."}, + {"value": 0.5, "label": "partially_similar", + "definition": "Response partially matches expected."}, + {"value": 1.0, "label": "highly_similar", + "definition": "Response is semantically equivalent to expected."}, + ] + }, + "modelConfig": { + "bedrockEvaluatorModelConfig": { + "modelId": "us.amazon.nova-lite-v1:0", + "inferenceConfig": {"maxTokens": 512}, + } + }, + } + }, +) +custom_evaluator_id = result["evaluatorId"] +``` + +Pass `custom_evaluator_id` to `EvaluationClient.run()` or `EvaluatorConfig` like any +built-in evaluator ID. Seed the level cache to avoid an extra `get_evaluator` lookup: + +```python +eval_client._evaluator_level_cache[custom_evaluator_id] = "TRACE" +``` + +### Custom evaluators in this tutorial + +| Evaluator | Level | Placeholders used | Where used | +|---|---|---|---| +| `HRResponseSimilarity` | TRACE | `{assistant_turn}`, `{expected_response}` | EvaluationClient (Steps 5a, 5b), DatasetRunner (Step 6) | +| `HRAssertionChecker` | SESSION | `{actual_tool_trajectory}`, `{expected_tool_trajectory}`, `{assertions}` | EvaluationClient (Step 5d, multi-turn), DatasetRunner (Step 6) | -The notebook demonstrates two custom evaluators: +> **Note:** SESSION-level custom evaluators require a session with multiple tool calls to +> extract a meaningful trajectory. They are used on multi-turn sessions in Step 5d and on +> all DatasetRunner scenarios in Step 6, where a 180-second ingestion delay ensures span +> data is complete before evaluation. 
-| Evaluator | Level | Placeholders | Description | -|---|---|---|---| -| `HRResponseSimilarity` | TRACE | `{assistant_turn}`, `{expected_response}` | Scores how closely the agent's response matches the expected answer | -| `HRAssertionChecker` | SESSION | `{actual_tool_trajectory}`, `{expected_tool_trajectory}`, `{assertions}` | Scores whether the agent called the right tools and satisfied all assertions | +--- -## Built-in Evaluators Used +## Built-in Evaluators -| Evaluator | Level | Ground Truth | +| Evaluator | Level | Ground truth required | |---|---|---| | `Builtin.Correctness` | TRACE | `expected_response` | +| `Builtin.Helpfulness` | TRACE | None | +| `Builtin.ResponseRelevance` | TRACE | None | | `Builtin.GoalSuccessRate` | SESSION | `assertions` | | `Builtin.TrajectoryExactOrderMatch` | SESSION | `expected_trajectory` | | `Builtin.TrajectoryInOrderMatch` | SESSION | `expected_trajectory` | | `Builtin.TrajectoryAnyOrderMatch` | SESSION | `expected_trajectory` | -| `Builtin.Helpfulness` | TRACE | None | -| `Builtin.ResponseRelevance` | TRACE | None | **Evaluation levels:** -- **TRACE** — evaluated once per agent response (one result per conversational turn) -- **SESSION** — evaluated once per conversation (one result per scenario) +- **TRACE** — one result per conversational turn (agent response) +- **SESSION** — one result per complete conversation -## The HR Assistant Agent +--- -The agent is built with the [Strands Agents SDK](https://strandsagents.com/) and deployed on AgentCore Runtime. 
It exposes five tools backed by deterministic mock data, making evaluations fully reproducible: +## Files -| Tool | Description | +| File | Description | |---|---| -| `get_pto_balance` | Returns remaining PTO days for an employee | -| `submit_pto_request` | Submits a time-off request | -| `lookup_hr_policy` | Looks up PTO, remote work, parental leave, or code-of-conduct policies | -| `get_benefits_summary` | Returns health, dental, vision, 401k, or life insurance details | -| `get_pay_stub` | Retrieves gross and net pay for a given employee and period | +| `groundtruth_evaluations.ipynb` | Main tutorial notebook — self-contained, end-to-end | +| `requirements.txt` | Python dependencies installed into the agent container | -## Evaluation Scenarios +`hr_assistant_agent.py` and `.bedrock_agentcore.yaml` are generated at runtime (by the `%%writefile` notebook cell and the starter toolkit respectively). -The notebook evaluates five scenarios that cover different evaluation patterns: +--- -| Scenario | Turns | Key evaluators | -|---|---|---| -| PTO balance check | 1 | Correctness, Helpfulness, **HRResponseSimilarity** (custom) | -| PTO submission | 1 | GoalSuccessRate, Trajectory, Correctness, **HRResponseSimilarity** (custom) | -| Pay stub lookup | 1 | Correctness, GoalSuccessRate | -| PTO planning session | 3 | GoalSuccessRate, TrajectoryExactOrderMatch, **HRAssertionChecker** (custom) | -| New employee onboarding | 4 | GoalSuccessRate, TrajectoryAnyOrderMatch | +## Clean Up + +### Delete the agent runtime + +Uncomment and run the cleanup cell in the notebook: + +```python +agentcore_runtime.delete() +``` + +Or via the AWS CLI: + +```bash +aws bedrock-agentcore-control delete-agent-runtime \ + --agent-runtime-id hr_assistant_eval_tutorial-xfZ3yiH356 \ + --region us-east-1 +``` + +### Delete custom evaluators + +```python +import boto3 + +cp = boto3.client("bedrock-agentcore-control", region_name="us-east-1") +for evaluator_id in [
CUSTOM_ASSERTION_CHECKER_ID]: + cp.delete_evaluator(evaluatorId=evaluator_id) + print(f"Deleted {evaluator_id}") +``` + +### Delete the ECR repository + +```bash +aws ecr delete-repository \ + --repository-name bedrock-agentcore-hr_assistant_eval_tutorial \ + --region us-east-1 \ + --force +``` + +### Delete CloudWatch log group + +```bash +aws logs delete-log-group \ + --log-group-name /aws/bedrock-agentcore/runtimes/hr_assistant_eval_tutorial-xfZ3yiH356-DEFAULT \ + --region us-east-1 +``` + +--- + +## Additional Resources +- [Ground-truth evaluations — custom evaluators](https://docs.aws.amazon.com/bedrock-agentcore/latest/devguide/ground-truth-evaluations.html#gt-custom-evaluators) +- [Dataset-based evaluations](https://docs.aws.amazon.com/bedrock-agentcore/latest/devguide/dataset-evaluations.html) +- [Amazon Bedrock AgentCore Developer Guide](https://docs.aws.amazon.com/bedrock-agentcore/latest/devguide/) +- [Strands Agents SDK](https://strandsagents.com/) +- [Build reliable AI agents with Amazon Bedrock AgentCore Evaluations](https://aws.amazon.com/blogs/machine-learning/build-reliable-ai-agents-with-amazon-bedrock-agentcore-evaluations/) diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb index 4a7137881..2b08cc7c6 100644 --- a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb +++ b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/groundtruth_evaluations.ipynb @@ -138,277 +138,7 @@ } }, "outputs": [], - "source": [ - "%%writefile hr_assistant_agent.py\n", - "\"\"\"\n", - "HR Assistant Agent — Strands agent deployed on Bedrock AgentCore Runtime.\n", - "\n", - "Tools (deterministic / mock data for reproducible evaluations):\n", - " get_pto_balance — remaining PTO days for an employee\n", - " submit_pto_request — request time 
off\n", - " lookup_hr_policy — company policy documents\n", - " get_benefits_summary — health, dental, vision, 401k, life insurance details\n", - " get_pay_stub — pay stub for a given period\n", - "\"\"\"\n", - "\n", - "import logging\n", - "import re\n", - "\n", - "from bedrock_agentcore.runtime import BedrockAgentCoreApp\n", - "from strands import Agent, tool\n", - "from strands.models import BedrockModel\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "logger = logging.getLogger(__name__)\n", - "\n", - "app = BedrockAgentCoreApp()\n", - "\n", - "# ---------------------------------------------------------------------------\n", - "# Mock data\n", - "# ---------------------------------------------------------------------------\n", - "\n", - "_PTO_BALANCES = {\n", - " \"EMP-001\": {\"total_days\": 15, \"used_days\": 5, \"remaining_days\": 10},\n", - " \"EMP-002\": {\"total_days\": 15, \"used_days\": 12, \"remaining_days\": 3},\n", - " \"EMP-042\": {\"total_days\": 20, \"used_days\": 7, \"remaining_days\": 13},\n", - "}\n", - "\n", - "_HR_POLICIES = {\n", - " \"pto\": (\n", - " \"PTO Policy: Full-time employees accrue 15 days of PTO per year (20 days after 3 years). \"\n", - " \"PTO requests must be submitted at least 2 business days in advance. \"\n", - " \"Unused PTO up to 5 days rolls over to the next year. \"\n", - " \"PTO cannot be taken in advance of accrual.\"\n", - " ),\n", - " \"remote_work\": (\n", - " \"Remote Work Policy: Employees may work remotely up to 3 days per week with manager approval. \"\n", - " \"Core collaboration hours are 10am-3pm local time. \"\n", - " \"A dedicated workspace with reliable internet (25 Mbps+) is required. \"\n", - " \"Employees must be reachable via Slack and email during core hours.\"\n", - " ),\n", - " \"parental_leave\": (\n", - " \"Parental Leave Policy: Primary caregivers receive 16 weeks of fully paid parental leave. \"\n", - " \"Secondary caregivers receive 6 weeks of fully paid parental leave. 
\"\n", - " \"Leave may begin up to 2 weeks before the expected birth or adoption date. \"\n", - " \"Benefits continue unchanged during parental leave.\"\n", - " ),\n", - " \"code_of_conduct\": (\n", - " \"Code of Conduct: All employees are expected to treat colleagues, customers, and partners \"\n", - " \"with respect and professionalism. Harassment, discrimination, and retaliation of any kind \"\n", - " \"are strictly prohibited. Violations should be reported to HR or via the anonymous hotline.\"\n", - " ),\n", - "}\n", - "\n", - "_BENEFITS = {\n", - " \"health\": (\n", - " \"Health Insurance: The company covers 90% of premiums for employee-only coverage and 75% \"\n", - " \"for family coverage. Plans available: Blue Shield PPO, Kaiser HMO, and HDHP with HSA. \"\n", - " \"Annual deductible: $500 (PPO), $0 (HMO), $1,500 (HDHP). \"\n", - " \"Open enrollment is each November for the following calendar year.\"\n", - " ),\n", - " \"dental\": (\n", - " \"Dental Insurance: 100% coverage for preventive care (cleanings, X-rays). \"\n", - " \"80% coverage for basic restorative care (fillings, extractions). \"\n", - " \"50% coverage for major restorative care (crowns, bridges). \"\n", - " \"Annual maximum benefit: $2,000 per person. Orthodontia lifetime maximum: $1,500.\"\n", - " ),\n", - " \"vision\": (\n", - " \"Vision Insurance: Annual eye exam covered in full. \"\n", - " \"Frames or contacts allowance: $200 per year. \"\n", - " \"Laser vision correction discount: 15% off at participating providers.\"\n", - " ),\n", - " \"401k\": (\n", - " \"401(k) Plan: The company matches 100% of employee contributions up to 4% of salary. \"\n", - " \"An additional 50% match on the next 2% (total effective match up to 5%). \"\n", - " \"Employees are eligible to contribute immediately; company match vests over 3 years. 
\"\n", - " \"2026 IRS contribution limit: $23,500 (under 50), $31,000 (age 50+).\"\n", - " ),\n", - " \"life_insurance\": (\n", - " \"Life Insurance: Basic life insurance of 2x annual salary provided at no cost. \"\n", - " \"Employees may purchase supplemental coverage up to 5x salary during open enrollment. \"\n", - " \"Accidental death and dismemberment (AD&D) coverage equal to basic life benefit is included.\"\n", - " ),\n", - "}\n", - "\n", - "_PAY_STUBS = {\n", - " (\"EMP-001\", \"2025-12\"): {\n", - " \"gross_pay\": 8333.33, \"federal_tax\": 1458.33, \"state_tax\": 416.67,\n", - " \"social_security\": 516.67, \"medicare\": 120.83, \"health_premium\": 125.00,\n", - " \"401k_contribution\": 333.33, \"net_pay\": 5362.50, \"period\": \"December 2025\",\n", - " },\n", - " (\"EMP-001\", \"2026-01\"): {\n", - " \"gross_pay\": 8333.33, \"federal_tax\": 1458.33, \"state_tax\": 416.67,\n", - " \"social_security\": 516.67, \"medicare\": 120.83, \"health_premium\": 125.00,\n", - " \"401k_contribution\": 333.33, \"net_pay\": 5362.50, \"period\": \"January 2026\",\n", - " },\n", - " (\"EMP-042\", \"2026-01\"): {\n", - " \"gross_pay\": 10416.67, \"federal_tax\": 1875.00, \"state_tax\": 520.83,\n", - " \"social_security\": 645.83, \"medicare\": 151.04, \"health_premium\": 200.00,\n", - " \"401k_contribution\": 416.67, \"net_pay\": 6607.30, \"period\": \"January 2026\",\n", - " },\n", - "}\n", - "\n", - "_PTO_REQUEST_COUNTER = {\"n\": 0}\n", - "\n", - "\n", - "# ---------------------------------------------------------------------------\n", - "# Strands tools\n", - "# ---------------------------------------------------------------------------\n", - "\n", - "@tool\n", - "def get_pto_balance(employee_id: str) -> dict:\n", - " \"\"\"\n", - " Return the current PTO balance for an employee.\n", - "\n", - " Args:\n", - " employee_id: Employee identifier (e.g. 
EMP-001)\n", - "\n", - " Returns:\n", - " Dict with total_days, used_days, and remaining_days.\n", - " \"\"\"\n", - " balance = _PTO_BALANCES.get(employee_id)\n", - " if balance:\n", - " return {\"employee_id\": employee_id, **balance}\n", - " return {\"employee_id\": employee_id, \"error\": f\"Employee {employee_id} not found.\"}\n", - "\n", - "\n", - "@tool\n", - "def submit_pto_request(\n", - " employee_id: str,\n", - " start_date: str,\n", - " end_date: str,\n", - " reason: str = \"Personal time off\",\n", - ") -> dict:\n", - " \"\"\"\n", - " Submit a PTO request for an employee.\n", - "\n", - " Args:\n", - " employee_id: Employee identifier (e.g. EMP-001)\n", - " start_date: First day of leave in YYYY-MM-DD format\n", - " end_date: Last day of leave in YYYY-MM-DD format\n", - " reason: Optional reason for the request\n", - "\n", - " Returns:\n", - " Dict with request_id, status, and confirmation message.\n", - " \"\"\"\n", - " _PTO_REQUEST_COUNTER[\"n\"] += 1\n", - " request_id = f\"PTO-2026-{_PTO_REQUEST_COUNTER['n']:03d}\"\n", - " return {\n", - " \"request_id\": request_id,\n", - " \"employee_id\": employee_id,\n", - " \"start_date\": start_date,\n", - " \"end_date\": end_date,\n", - " \"reason\": reason,\n", - " \"status\": \"APPROVED\",\n", - " \"message\": f\"PTO request {request_id} approved for {employee_id} from {start_date} to {end_date}.\",\n", - " }\n", - "\n", - "\n", - "@tool\n", - "def lookup_hr_policy(topic: str) -> dict:\n", - " \"\"\"\n", - " Look up a company HR policy document by topic.\n", - "\n", - " Args:\n", - " topic: Policy topic. Supported values: pto, remote_work, parental_leave, code_of_conduct\n", - "\n", - " Returns:\n", - " Dict with topic and policy_text.\n", - " \"\"\"\n", - " key = topic.lower().replace(\" \", \"_\").replace(\"-\", \"_\")\n", - " text = _HR_POLICIES.get(key)\n", - " if text:\n", - " return {\"topic\": topic, \"policy_text\": text}\n", - " return {\"topic\": topic, \"error\": f\"Policy '{topic}' not found. 
Available: {list(_HR_POLICIES.keys())}\"}\n", - "\n", - "\n", - "@tool\n", - "def get_benefits_summary(benefit_type: str) -> dict:\n", - " \"\"\"\n", - " Return a summary of a specific employee benefit.\n", - "\n", - " Args:\n", - " benefit_type: Type of benefit. Supported values: health, dental, vision, 401k, life_insurance\n", - "\n", - " Returns:\n", - " Dict with benefit_type and summary text.\n", - " \"\"\"\n", - " key = benefit_type.lower().replace(\" \", \"_\").replace(\"-\", \"_\")\n", - " text = _BENEFITS.get(key)\n", - " if text:\n", - " return {\"benefit_type\": benefit_type, \"summary\": text}\n", - " return {\"benefit_type\": benefit_type, \"error\": f\"Benefit '{benefit_type}' not found. Available: {list(_BENEFITS.keys())}\"}\n", - "\n", - "\n", - "@tool\n", - "def get_pay_stub(employee_id: str, period: str) -> dict:\n", - " \"\"\"\n", - " Retrieve a pay stub for an employee for a specific pay period.\n", - "\n", - " Args:\n", - " employee_id: Employee identifier (e.g. EMP-001)\n", - " period: Pay period in YYYY-MM format (e.g. 
2026-01)\n", - "\n", - " Returns:\n", - " Dict with gross pay, deductions, and net pay.\n", - " \"\"\"\n", - " stub = _PAY_STUBS.get((employee_id, period))\n", - " if stub:\n", - " return {\"employee_id\": employee_id, **stub}\n", - " return {\"employee_id\": employee_id, \"period\": period, \"error\": f\"Pay stub not found for {employee_id} period {period}.\"}\n", - "\n", - "\n", - "# ---------------------------------------------------------------------------\n", - "# Agent\n", - "# ---------------------------------------------------------------------------\n", - "\n", - "SYSTEM_PROMPT = \"\"\"You are a helpful HR Assistant for Acme Corp.\n", - "\n", - "You help employees with:\n", - "- Checking PTO (paid time off) balances\n", - "- Submitting PTO requests\n", - "- Looking up HR policies (PTO, remote work, parental leave, code of conduct)\n", - "- Understanding employee benefits (health, dental, vision, 401k, life insurance)\n", - "- Retrieving pay stub information\n", - "\n", - "Always use the available tools to answer questions accurately. 
Do not make up\n", - "policy details, benefit amounts, or pay information — look them up.\n", - "Be concise, professional, and friendly.\"\"\"\n", - "\n", - "_MODEL = BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\")\n", - "_TOOLS = [get_pto_balance, submit_pto_request, lookup_hr_policy, get_benefits_summary, get_pay_stub]\n", - "\n", - "# Session cache: session_id -> Agent (preserves conversation history across turns)\n", - "_SESSION_AGENTS: dict[str, Agent] = {}\n", - "\n", - "\n", - "@app.entrypoint\n", - "async def invoke(payload, context):\n", - " \"\"\"Handle an agent invocation from AgentCore Runtime.\"\"\"\n", - " prompt = payload.get(\"prompt\", \"\")\n", - " session_id = context.session_id\n", - " logger.info(\"Received prompt (session=%s): %s\", session_id, prompt[:80])\n", - "\n", - " if session_id and session_id in _SESSION_AGENTS:\n", - " agent = _SESSION_AGENTS[session_id]\n", - " else:\n", - " agent = Agent(model=_MODEL, tools=_TOOLS, system_prompt=SYSTEM_PROMPT)\n", - " if session_id:\n", - " _SESSION_AGENTS[session_id] = agent\n", - "\n", - " parts = []\n", - " async for event in agent.stream_async(prompt):\n", - " if \"data\" in event:\n", - " parts.append(str(event[\"data\"]))\n", - " response = \"\".join(parts)\n", - " # Strip inline ... 
blocks so spans contain only the final answer\n", - " response = re.sub(r\".*?\", \"\", response, flags=re.DOTALL).strip()\n", - " return response\n", - "\n", - "\n", - "if __name__ == \"__main__\":\n", - " app.run()" - ] + "source": "%%writefile hr_assistant_agent.py\n\"\"\"\nHR Assistant Agent — Strands agent deployed on Bedrock AgentCore Runtime.\n\nTools (deterministic / mock data for reproducible evaluations):\n get_pto_balance — remaining PTO days for an employee\n submit_pto_request — request time off\n lookup_hr_policy — company policy documents\n get_benefits_summary — health, dental, vision, 401k, life insurance details\n get_pay_stub — pay stub for a given period\n\"\"\"\n\nimport logging\nimport re\n\nfrom bedrock_agentcore.runtime import BedrockAgentCoreApp\nfrom strands import Agent, tool\nfrom strands.models import BedrockModel\n\nlogging.basicConfig(level=logging.INFO)\nlogger = logging.getLogger(__name__)\n\napp = BedrockAgentCoreApp()\n\n# ---------------------------------------------------------------------------\n# Mock data\n# ---------------------------------------------------------------------------\n\n_PTO_BALANCES = {\n \"EMP-001\": {\"total_days\": 15, \"used_days\": 5, \"remaining_days\": 10},\n \"EMP-002\": {\"total_days\": 15, \"used_days\": 12, \"remaining_days\": 3},\n \"EMP-042\": {\"total_days\": 20, \"used_days\": 7, \"remaining_days\": 13},\n}\n\n_HR_POLICIES = {\n \"pto\": (\n \"PTO Policy: Full-time employees accrue 15 days of PTO per year (20 days after 3 years). \"\n \"PTO requests must be submitted at least 2 business days in advance. \"\n \"Unused PTO up to 5 days rolls over to the next year. \"\n \"PTO cannot be taken in advance of accrual.\"\n ),\n \"remote_work\": (\n \"Remote Work Policy: Employees may work remotely up to 3 days per week with manager approval. \"\n \"Core collaboration hours are 10am-3pm local time. \"\n \"A dedicated workspace with reliable internet (25 Mbps+) is required. 
\"\n \"Employees must be reachable via Slack and email during core hours.\"\n ),\n \"parental_leave\": (\n \"Parental Leave Policy: Primary caregivers receive 16 weeks of fully paid parental leave. \"\n \"Secondary caregivers receive 6 weeks of fully paid parental leave. \"\n \"Leave may begin up to 2 weeks before the expected birth or adoption date. \"\n \"Benefits continue unchanged during parental leave.\"\n ),\n \"code_of_conduct\": (\n \"Code of Conduct: All employees are expected to treat colleagues, customers, and partners \"\n \"with respect and professionalism. Harassment, discrimination, and retaliation of any kind \"\n \"are strictly prohibited. Violations should be reported to HR or via the anonymous hotline.\"\n ),\n}\n\n_BENEFITS = {\n \"health\": (\n \"Health Insurance: The company covers 90% of premiums for employee-only coverage and 75% \"\n \"for family coverage. Plans available: Blue Shield PPO, Kaiser HMO, and HDHP with HSA. \"\n \"Annual deductible: $500 (PPO), $0 (HMO), $1,500 (HDHP). \"\n \"Open enrollment is each November for the following calendar year.\"\n ),\n \"dental\": (\n \"Dental Insurance: 100% coverage for preventive care (cleanings, X-rays). \"\n \"80% coverage for basic restorative care (fillings, extractions). \"\n \"50% coverage for major restorative care (crowns, bridges). \"\n \"Annual maximum benefit: $2,000 per person. Orthodontia lifetime maximum: $1,500.\"\n ),\n \"vision\": (\n \"Vision Insurance: Annual eye exam covered in full. \"\n \"Frames or contacts allowance: $200 per year. \"\n \"Laser vision correction discount: 15% off at participating providers.\"\n ),\n \"401k\": (\n \"401(k) Plan: The company matches 100% of employee contributions up to 4% of salary. \"\n \"An additional 50% match on the next 2% (total effective match up to 5%). \"\n \"Employees are eligible to contribute immediately; company match vests over 3 years. 
\"\n \"2026 IRS contribution limit: $23,500 (under 50), $31,000 (age 50+).\"\n ),\n \"life_insurance\": (\n \"Life Insurance: Basic life insurance of 2x annual salary provided at no cost. \"\n \"Employees may purchase supplemental coverage up to 5x salary during open enrollment. \"\n \"Accidental death and dismemberment (AD&D) coverage equal to basic life benefit is included.\"\n ),\n}\n\n_PAY_STUBS = {\n (\"EMP-001\", \"2025-12\"): {\n \"gross_pay\": 8333.33,\n \"federal_tax\": 1458.33,\n \"state_tax\": 416.67,\n \"social_security\": 516.67,\n \"medicare\": 120.83,\n \"health_premium\": 125.00,\n \"401k_contribution\": 333.33,\n \"net_pay\": 5362.50,\n \"period\": \"December 2025\",\n },\n (\"EMP-001\", \"2026-01\"): {\n \"gross_pay\": 8333.33,\n \"federal_tax\": 1458.33,\n \"state_tax\": 416.67,\n \"social_security\": 516.67,\n \"medicare\": 120.83,\n \"health_premium\": 125.00,\n \"401k_contribution\": 333.33,\n \"net_pay\": 5362.50,\n \"period\": \"January 2026\",\n },\n (\"EMP-042\", \"2026-01\"): {\n \"gross_pay\": 10416.67,\n \"federal_tax\": 1875.00,\n \"state_tax\": 520.83,\n \"social_security\": 645.83,\n \"medicare\": 151.04,\n \"health_premium\": 200.00,\n \"401k_contribution\": 416.67,\n \"net_pay\": 6607.30,\n \"period\": \"January 2026\",\n },\n}\n\n_PTO_REQUEST_COUNTER = {\"n\": 0}\n\n\n# ---------------------------------------------------------------------------\n# Strands tools\n# ---------------------------------------------------------------------------\n\n\n@tool\ndef get_pto_balance(employee_id: str) -> dict:\n \"\"\"\n Return the current PTO balance for an employee.\n\n Args:\n employee_id: Employee identifier (e.g. 
EMP-001)\n\n Returns:\n Dict with total_days, used_days, and remaining_days.\n \"\"\"\n balance = _PTO_BALANCES.get(employee_id)\n if balance:\n return {\"employee_id\": employee_id, **balance}\n return {\"employee_id\": employee_id, \"error\": f\"Employee {employee_id} not found.\"}\n\n\n@tool\ndef submit_pto_request(\n employee_id: str,\n start_date: str,\n end_date: str,\n reason: str = \"Personal time off\",\n) -> dict:\n \"\"\"\n Submit a PTO request for an employee.\n\n Args:\n employee_id: Employee identifier (e.g. EMP-001)\n start_date: First day of leave in YYYY-MM-DD format\n end_date: Last day of leave in YYYY-MM-DD format\n reason: Optional reason for the request\n\n Returns:\n Dict with request_id, status, and confirmation message.\n \"\"\"\n _PTO_REQUEST_COUNTER[\"n\"] += 1\n request_id = f\"PTO-2026-{_PTO_REQUEST_COUNTER['n']:03d}\"\n return {\n \"request_id\": request_id,\n \"employee_id\": employee_id,\n \"start_date\": start_date,\n \"end_date\": end_date,\n \"reason\": reason,\n \"status\": \"APPROVED\",\n \"message\": f\"PTO request {request_id} approved for {employee_id} from {start_date} to {end_date}.\",\n }\n\n\n@tool\ndef lookup_hr_policy(topic: str) -> dict:\n \"\"\"\n Look up a company HR policy document by topic.\n\n Args:\n topic: Policy topic. Supported values: pto, remote_work, parental_leave, code_of_conduct\n\n Returns:\n Dict with topic and policy_text.\n \"\"\"\n key = topic.lower().replace(\" \", \"_\").replace(\"-\", \"_\")\n text = _HR_POLICIES.get(key)\n if text:\n return {\"topic\": topic, \"policy_text\": text}\n return {\n \"topic\": topic,\n \"error\": f\"Policy '{topic}' not found. Available: {list(_HR_POLICIES.keys())}\",\n }\n\n\n@tool\ndef get_benefits_summary(benefit_type: str) -> dict:\n \"\"\"\n Return a summary of a specific employee benefit.\n\n Args:\n benefit_type: Type of benefit. 
Supported values: health, dental, vision, 401k, life_insurance\n\n Returns:\n Dict with benefit_type and summary text.\n \"\"\"\n key = benefit_type.lower().replace(\" \", \"_\").replace(\"-\", \"_\")\n text = _BENEFITS.get(key)\n if text:\n return {\"benefit_type\": benefit_type, \"summary\": text}\n return {\n \"benefit_type\": benefit_type,\n \"error\": f\"Benefit '{benefit_type}' not found. Available: {list(_BENEFITS.keys())}\",\n }\n\n\n@tool\ndef get_pay_stub(employee_id: str, period: str) -> dict:\n \"\"\"\n Retrieve a pay stub for an employee for a specific pay period.\n\n Args:\n employee_id: Employee identifier (e.g. EMP-001)\n period: Pay period in YYYY-MM format (e.g. 2026-01)\n\n Returns:\n Dict with gross pay, deductions, and net pay.\n \"\"\"\n stub = _PAY_STUBS.get((employee_id, period))\n if stub:\n return {\"employee_id\": employee_id, **stub}\n return {\n \"employee_id\": employee_id,\n \"period\": period,\n \"error\": f\"Pay stub not found for {employee_id} period {period}.\",\n }\n\n\n# ---------------------------------------------------------------------------\n# Agent\n# ---------------------------------------------------------------------------\n\nSYSTEM_PROMPT = \"\"\"You are a helpful HR Assistant for Acme Corp.\n\nYou help employees with:\n- Checking PTO (paid time off) balances\n- Submitting PTO requests\n- Looking up HR policies (PTO, remote work, parental leave, code of conduct)\n- Understanding employee benefits (health, dental, vision, 401k, life insurance)\n- Retrieving pay stub information\n\nAlways use the available tools to answer questions accurately. 
Do not make up\npolicy details, benefit amounts, or pay information — look them up.\nBe concise, professional, and friendly.\"\"\"\n\n_MODEL = BedrockModel(model_id=\"us.amazon.nova-lite-v1:0\")\n_TOOLS = [\n get_pto_balance,\n submit_pto_request,\n lookup_hr_policy,\n get_benefits_summary,\n get_pay_stub,\n]\n\n# Session cache: session_id -> Agent (preserves conversation history across turns)\n_SESSION_AGENTS: dict[str, Agent] = {}\n\n\n@app.entrypoint\nasync def invoke(payload, context):\n \"\"\"Handle an agent invocation from AgentCore Runtime.\"\"\"\n prompt = payload.get(\"prompt\", \"\")\n session_id = context.session_id\n logger.info(\"Received prompt (session=%s): %s\", session_id, prompt[:80])\n\n if session_id and session_id in _SESSION_AGENTS:\n agent = _SESSION_AGENTS[session_id]\n else:\n agent = Agent(model=_MODEL, tools=_TOOLS, system_prompt=SYSTEM_PROMPT)\n if session_id:\n _SESSION_AGENTS[session_id] = agent\n\n parts = []\n async for event in agent.stream_async(prompt):\n if \"data\" in event:\n parts.append(str(event[\"data\"]))\n response = \"\".join(parts)\n # Strip inline ... 
blocks so spans contain only the final answer\n response = re.sub(\n r\".*?\", \"\", response, flags=re.DOTALL\n ).strip()\n return response\n\n\nif __name__ == \"__main__\":\n app.run()" }, { "cell_type": "code", @@ -536,7 +266,7 @@ }, "outputs": [], "source": [ - "# Persist agent info for the programmatic_evaluators notebook\n", + "# Persist agent info\n", "%store AGENT_ID\n", "%store AGENT_ARN\n", "%store CW_LOG_GROUP\n", @@ -865,13 +595,6 @@ "\n", "eval_client = EvaluationClient(region_name=REGION)\n", "\n", - "# # Set the evaluator level cache with custom evaluator IDs so the client\n", - "# # doesn't need to call get_evaluator for them\n", - "# eval_client._evaluator_level_cache.update({\n", - "# CUSTOM_RESPONSE_SIMILARITY_ID: \"TRACE\",\n", - "# CUSTOM_ASSERTION_CHECKER_ID: \"SESSION\",\n", - "# })\n", - "\n", "print(f\"EvaluationClient initialised (region={REGION})\")\n", "print(f\" {CUSTOM_RESPONSE_SIMILARITY_ID} → TRACE (custom: HRResponseSimilarity)\")\n", "print(f\" {CUSTOM_ASSERTION_CHECKER_ID} → SESSION (custom: HRAssertionChecker)\")" @@ -1659,4 +1382,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file diff --git a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py b/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py deleted file mode 100644 index f0ecb1b34..000000000 --- a/01-tutorials/07-AgentCore-evaluations/05-groundtruth-based-evalautions/hr_assistant_agent.py +++ /dev/null @@ -1,268 +0,0 @@ -""" -HR Assistant Agent — Strands agent deployed on Bedrock AgentCore Runtime. 
- -Tools (deterministic / mock data for reproducible evaluations): - get_pto_balance — remaining PTO days for an employee - submit_pto_request — request time off - lookup_hr_policy — company policy documents - get_benefits_summary — health, dental, vision, 401k, life insurance details - get_pay_stub — pay stub for a given period -""" - -import logging -import re - -from bedrock_agentcore.runtime import BedrockAgentCoreApp -from strands import Agent, tool -from strands.models import BedrockModel - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -app = BedrockAgentCoreApp() - -# --------------------------------------------------------------------------- -# Mock data -# --------------------------------------------------------------------------- - -_PTO_BALANCES = { - "EMP-001": {"total_days": 15, "used_days": 5, "remaining_days": 10}, - "EMP-002": {"total_days": 15, "used_days": 12, "remaining_days": 3}, - "EMP-042": {"total_days": 20, "used_days": 7, "remaining_days": 13}, -} - -_HR_POLICIES = { - "pto": ( - "PTO Policy: Full-time employees accrue 15 days of PTO per year (20 days after 3 years). " - "PTO requests must be submitted at least 2 business days in advance. " - "Unused PTO up to 5 days rolls over to the next year. " - "PTO cannot be taken in advance of accrual." - ), - "remote_work": ( - "Remote Work Policy: Employees may work remotely up to 3 days per week with manager approval. " - "Core collaboration hours are 10am-3pm local time. " - "A dedicated workspace with reliable internet (25 Mbps+) is required. " - "Employees must be reachable via Slack and email during core hours." - ), - "parental_leave": ( - "Parental Leave Policy: Primary caregivers receive 16 weeks of fully paid parental leave. " - "Secondary caregivers receive 6 weeks of fully paid parental leave. " - "Leave may begin up to 2 weeks before the expected birth or adoption date. " - "Benefits continue unchanged during parental leave." 
- ), - "code_of_conduct": ( - "Code of Conduct: All employees are expected to treat colleagues, customers, and partners " - "with respect and professionalism. Harassment, discrimination, and retaliation of any kind " - "are strictly prohibited. Violations should be reported to HR or via the anonymous hotline." - ), -} - -_BENEFITS = { - "health": ( - "Health Insurance: The company covers 90% of premiums for employee-only coverage and 75% " - "for family coverage. Plans available: Blue Shield PPO, Kaiser HMO, and HDHP with HSA. " - "Annual deductible: $500 (PPO), $0 (HMO), $1,500 (HDHP). " - "Open enrollment is each November for the following calendar year." - ), - "dental": ( - "Dental Insurance: 100% coverage for preventive care (cleanings, X-rays). " - "80% coverage for basic restorative care (fillings, extractions). " - "50% coverage for major restorative care (crowns, bridges). " - "Annual maximum benefit: $2,000 per person. Orthodontia lifetime maximum: $1,500." - ), - "vision": ( - "Vision Insurance: Annual eye exam covered in full. " - "Frames or contacts allowance: $200 per year. " - "Laser vision correction discount: 15% off at participating providers." - ), - "401k": ( - "401(k) Plan: The company matches 100% of employee contributions up to 4% of salary. " - "An additional 50% match on the next 2% (total effective match up to 5%). " - "Employees are eligible to contribute immediately; company match vests over 3 years. " - "2026 IRS contribution limit: $23,500 (under 50), $31,000 (age 50+)." - ), - "life_insurance": ( - "Life Insurance: Basic life insurance of 2x annual salary provided at no cost. " - "Employees may purchase supplemental coverage up to 5x salary during open enrollment. " - "Accidental death and dismemberment (AD&D) coverage equal to basic life benefit is included." 
- ), -} - -_PAY_STUBS = { - ("EMP-001", "2025-12"): { - "gross_pay": 8333.33, "federal_tax": 1458.33, "state_tax": 416.67, - "social_security": 516.67, "medicare": 120.83, "health_premium": 125.00, - "401k_contribution": 333.33, "net_pay": 5362.50, "period": "December 2025", - }, - ("EMP-001", "2026-01"): { - "gross_pay": 8333.33, "federal_tax": 1458.33, "state_tax": 416.67, - "social_security": 516.67, "medicare": 120.83, "health_premium": 125.00, - "401k_contribution": 333.33, "net_pay": 5362.50, "period": "January 2026", - }, - ("EMP-042", "2026-01"): { - "gross_pay": 10416.67, "federal_tax": 1875.00, "state_tax": 520.83, - "social_security": 645.83, "medicare": 151.04, "health_premium": 200.00, - "401k_contribution": 416.67, "net_pay": 6607.30, "period": "January 2026", - }, -} - -_PTO_REQUEST_COUNTER = {"n": 0} - - -# --------------------------------------------------------------------------- -# Strands tools -# --------------------------------------------------------------------------- - -@tool -def get_pto_balance(employee_id: str) -> dict: - """ - Return the current PTO balance for an employee. - - Args: - employee_id: Employee identifier (e.g. EMP-001) - - Returns: - Dict with total_days, used_days, and remaining_days. - """ - balance = _PTO_BALANCES.get(employee_id) - if balance: - return {"employee_id": employee_id, **balance} - return {"employee_id": employee_id, "error": f"Employee {employee_id} not found."} - - -@tool -def submit_pto_request( - employee_id: str, - start_date: str, - end_date: str, - reason: str = "Personal time off", -) -> dict: - """ - Submit a PTO request for an employee. - - Args: - employee_id: Employee identifier (e.g. EMP-001) - start_date: First day of leave in YYYY-MM-DD format - end_date: Last day of leave in YYYY-MM-DD format - reason: Optional reason for the request - - Returns: - Dict with request_id, status, and confirmation message. 
- """ - _PTO_REQUEST_COUNTER["n"] += 1 - request_id = f"PTO-2026-{_PTO_REQUEST_COUNTER['n']:03d}" - return { - "request_id": request_id, - "employee_id": employee_id, - "start_date": start_date, - "end_date": end_date, - "reason": reason, - "status": "APPROVED", - "message": f"PTO request {request_id} approved for {employee_id} from {start_date} to {end_date}.", - } - - -@tool -def lookup_hr_policy(topic: str) -> dict: - """ - Look up a company HR policy document by topic. - - Args: - topic: Policy topic. Supported values: pto, remote_work, parental_leave, code_of_conduct - - Returns: - Dict with topic and policy_text. - """ - key = topic.lower().replace(" ", "_").replace("-", "_") - text = _HR_POLICIES.get(key) - if text: - return {"topic": topic, "policy_text": text} - return {"topic": topic, "error": f"Policy '{topic}' not found. Available: {list(_HR_POLICIES.keys())}"} - - -@tool -def get_benefits_summary(benefit_type: str) -> dict: - """ - Return a summary of a specific employee benefit. - - Args: - benefit_type: Type of benefit. Supported values: health, dental, vision, 401k, life_insurance - - Returns: - Dict with benefit_type and summary text. - """ - key = benefit_type.lower().replace(" ", "_").replace("-", "_") - text = _BENEFITS.get(key) - if text: - return {"benefit_type": benefit_type, "summary": text} - return {"benefit_type": benefit_type, "error": f"Benefit '{benefit_type}' not found. Available: {list(_BENEFITS.keys())}"} - - -@tool -def get_pay_stub(employee_id: str, period: str) -> dict: - """ - Retrieve a pay stub for an employee for a specific pay period. - - Args: - employee_id: Employee identifier (e.g. EMP-001) - period: Pay period in YYYY-MM format (e.g. 2026-01) - - Returns: - Dict with gross pay, deductions, and net pay. 
- """ - stub = _PAY_STUBS.get((employee_id, period)) - if stub: - return {"employee_id": employee_id, **stub} - return {"employee_id": employee_id, "period": period, "error": f"Pay stub not found for {employee_id} period {period}."} - - -# --------------------------------------------------------------------------- -# Agent -# --------------------------------------------------------------------------- - -SYSTEM_PROMPT = """You are a helpful HR Assistant for Acme Corp. - -You help employees with: -- Checking PTO (paid time off) balances -- Submitting PTO requests -- Looking up HR policies (PTO, remote work, parental leave, code of conduct) -- Understanding employee benefits (health, dental, vision, 401k, life insurance) -- Retrieving pay stub information - -Always use the available tools to answer questions accurately. Do not make up -policy details, benefit amounts, or pay information — look them up. -Be concise, professional, and friendly.""" - -_MODEL = BedrockModel(model_id="us.amazon.nova-lite-v1:0") -_TOOLS = [get_pto_balance, submit_pto_request, lookup_hr_policy, get_benefits_summary, get_pay_stub] - -# Session cache: session_id -> Agent (preserves conversation history across turns) -_SESSION_AGENTS: dict[str, Agent] = {} - - -@app.entrypoint -async def invoke(payload, context): - """Handle an agent invocation from AgentCore Runtime.""" - prompt = payload.get("prompt", "") - session_id = context.session_id - logger.info("Received prompt (session=%s): %s", session_id, prompt[:80]) - - if session_id and session_id in _SESSION_AGENTS: - agent = _SESSION_AGENTS[session_id] - else: - agent = Agent(model=_MODEL, tools=_TOOLS, system_prompt=SYSTEM_PROMPT) - if session_id: - _SESSION_AGENTS[session_id] = agent - - parts = [] - async for event in agent.stream_async(prompt): - if "data" in event: - parts.append(str(event["data"])) - response = "".join(parts) - # Strip inline ... 
blocks so spans contain only the final answer - response = re.sub(r".*?", "", response, flags=re.DOTALL).strip() - return response - - -if __name__ == "__main__": - app.run()