diff --git a/beeai/Containerfile b/beeai/Containerfile
index 379affcf..a81f7927 100644
--- a/beeai/Containerfile
+++ b/beeai/Containerfile
@@ -28,6 +28,7 @@ RUN pip3 install --no-cache-dir \
     beeai-framework[mcp,duckduckgo]==0.1.31 \
     openinference-instrumentation-beeai \
     arize-phoenix-otel \
+    deepeval \
     && cd /usr/local/lib/python3.13/site-packages \
     && patch -p2 -i /tmp/beeai-gemini.patch \
     && patch -p2 -i /tmp/beeai-gemini-malformed-function-call.patch \
diff --git a/beeai/agents/backport_agent.py b/beeai/agents/backport_agent.py
index c4a72db3..a956ae8a 100644
--- a/beeai/agents/backport_agent.py
+++ b/beeai/agents/backport_agent.py
@@ -144,7 +144,7 @@ def prompt(self) -> str:
           6. {{ backport_git_steps }}
         """
 
-    async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
+    async def run_with_schema(self, input: TInputSchema, capture_raw_response: bool = False) -> TOutputSchema:
         async with mcp_tools(
             os.getenv("MCP_GATEWAY_URL"),
             filter=lambda t: t
@@ -153,7 +153,7 @@ async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
             tools = self._tools.copy()
             try:
                 self._tools.extend(gateway_tools)
-                return await self._run_with_schema(input)
+                return await self._run_with_schema(input, capture_raw_response=capture_raw_response)
             finally:
                 self._tools = tools  # disassociate removed tools from requirements
 
diff --git a/beeai/agents/base_agent.py b/beeai/agents/base_agent.py
index 1ebe7b72..1bc08bb4 100644
--- a/beeai/agents/base_agent.py
+++ b/beeai/agents/base_agent.py
@@ -14,6 +14,8 @@ class BaseAgent(RequirementAgent, ABC):
+    last_raw_response: RequirementAgentRunOutput | None = None
+
     @property
     @abstractmethod
     def input_schema(self) -> type[TInputSchema]: ...
@@ -32,7 +34,9 @@ def _render_prompt(self, input: TInputSchema) -> str:
         )
         return template.render(input)
 
-    async def _run_with_schema(self, input: TInputSchema) -> TOutputSchema:
+    async def _run_with_schema(
+        self, input: TInputSchema, capture_raw_response: bool = False
+    ) -> TOutputSchema:
         max_retries_per_step = int(os.getenv("BEEAI_MAX_RETRIES_PER_STEP", 5))
         total_max_retries = int(os.getenv("BEEAI_TOTAL_MAX_RETRIES", 10))
         max_iterations = int(os.getenv("BEEAI_MAX_ITERATIONS", 100))
@@ -46,10 +50,14 @@ async def _run_with_schema(self, input: TInputSchema) -> TOutputSchema:
                 max_iterations=max_iterations,
             ),
         )
+        if capture_raw_response:
+            self.last_raw_response = response
         return self.output_schema.model_validate_json(response.result.text)
 
-    async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
-        return await self._run_with_schema(input)
+    async def run_with_schema(
+        self, input: TInputSchema, capture_raw_response: bool = False
+    ) -> TOutputSchema:
+        return await self._run_with_schema(input, capture_raw_response)
 
 
 if os.getenv("LITELLM_DEBUG"):
@@ -58,4 +66,5 @@ async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
     import beeai_framework.adapters.litellm.chat
     import beeai_framework.adapters.litellm.embedding
     from beeai_framework.adapters.litellm.utils import litellm_debug
 
+    litellm_debug(True)
diff --git a/beeai/agents/rebase_agent.py b/beeai/agents/rebase_agent.py
index 48cc215c..8daeb471 100644
--- a/beeai/agents/rebase_agent.py
+++ b/beeai/agents/rebase_agent.py
@@ -173,7 +173,7 @@ def prompt(self) -> str:
           - Any validation issues found with rpmlint
         """
 
-    async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
+    async def run_with_schema(self, input: TInputSchema, capture_raw_response: bool = False) -> TOutputSchema:
         async with mcp_tools(
             os.getenv("MCP_GATEWAY_URL"),
             filter=lambda t: t
@@ -182,7 +182,7 @@ async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
             tools = self._tools.copy()
             try:
                 self._tools.extend(gateway_tools)
-                return await self._run_with_schema(input)
+                return await self._run_with_schema(input, capture_raw_response=capture_raw_response)
             finally:
                 self._tools = tools  # disassociate removed tools from requirements
 
diff --git a/beeai/agents/tests/_utils.py b/beeai/agents/tests/_utils.py
new file mode 100644
index 00000000..47351091
--- /dev/null
+++ b/beeai/agents/tests/_utils.py
@@ -0,0 +1,166 @@
+# Copyright 2025 © BeeAI a Series of LF Projects, LLC
+# SPDX-License-Identifier: Apache-2.0
+
+import asyncio
+import os
+from collections.abc import Awaitable, Callable
+from pathlib import Path
+from typing import TypeVar
+
+import pytest
+from deepeval import evaluate
+from deepeval.dataset import EvaluationDataset, Golden
+from deepeval.evaluate import DisplayConfig
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase
+from deepeval.test_run.test_run import TestRunResultDisplay
+from rich.console import Console, Group
+from rich.panel import Panel
+from rich.table import Table
+
+from beeai_framework.agents import AnyAgent
+
+ROOT_CACHE_DIR = f"/tmp/.cache"
+Path(ROOT_CACHE_DIR).mkdir(parents=True, exist_ok=True)
+
+
+T = TypeVar("T", bound=AnyAgent)
+
+
+async def create_dataset(
+    *,
+    name: str,
+    agent_factory: Callable[[], T],
+    agent_run: Callable[[T, LLMTestCase], Awaitable[None]],
+    goldens: list[Golden],
+    cache: bool | None = None,
+) -> EvaluationDataset:
+    dataset = EvaluationDataset()
+
+    cache_dir = Path(f"{ROOT_CACHE_DIR}/{name}")
+    if cache is None:
+        cache = os.getenv("EVAL_CACHE_DATASET", "").lower() == "true"
+
+    # Reuse cached test cases when dataset caching is enabled; otherwise run the agent to generate them.
+    if cache and cache_dir.exists():
+        for file_path in cache_dir.glob("*.json"):
+            dataset.add_test_cases_from_json_file(
+                file_path=str(file_path.absolute().resolve()),
+                input_key_name="input",
+                actual_output_key_name="actual_output",
+                expected_output_key_name="expected_output",
+                context_key_name="context",
+                tools_called_key_name="tools_called",
+                expected_tools_key_name="expected_tools",
+                retrieval_context_key_name="retrieval_context",
+            )
+    else:
+
+        async def process_golden(golden: Golden) -> LLMTestCase:
+            agent = agent_factory()
+            case = LLMTestCase(
+                input=golden.input,
+                expected_tools=golden.expected_tools,
+                actual_output="",
+                expected_output=golden.expected_output,
+                comments=golden.comments,
+                context=golden.context,
+                tools_called=golden.tools_called,
+                retrieval_context=golden.retrieval_context,
+                additional_metadata=golden.additional_metadata,
+            )
+            await agent_run(agent, case)
+            return case
+
+        for test_case in await asyncio.gather(*[process_golden(golden) for golden in goldens], return_exceptions=False):
+            dataset.add_test_case(test_case)
+
+        if cache:
+            dataset.save_as(file_type="json", directory=str(cache_dir.absolute()), include_test_cases=True)
+
+    for case in dataset.test_cases:
+        case.name = f"{name} - {case.input[0:128].strip()}"  # type: ignore
+
+    return dataset
+
+
+def evaluate_dataset(
+    dataset: EvaluationDataset, metrics: list[BaseMetric], display_mode: TestRunResultDisplay | None = None
+) -> None:
+    console = Console()
+    console.print("[bold green]Evaluating dataset[/bold green]")
+
+    if display_mode is None:
+        display_mode = TestRunResultDisplay(os.environ.get("EVAL_DISPLAY_MODE", "all"))
+
+    output = evaluate(
+        test_cases=dataset.test_cases,  # type: ignore
+        metrics=metrics,
+        display_config=DisplayConfig(
+            show_indicator=False, print_results=False, verbose_mode=False, display_option=None
+        ),
+    )
+
+    # Calculate pass/fail counts
+    total = len(output.test_results)
+    passed = sum(
+        bool(test_result.metrics_data) and all(md.success for md in (test_result.metrics_data or []))
+        for test_result in output.test_results
+    )
+    failed = total - passed
+
+    # Print summary table
+    summary_table = Table(title="Test Results Summary", show_header=True, header_style="bold cyan")
+    summary_table.add_column("Total", justify="right")
+    summary_table.add_column("Passed", justify="right", style="green")
+    summary_table.add_column("Failed", justify="right", style="red")
+    summary_table.add_row(str(total), str(passed), str(failed))
+    console.print(summary_table)
+
+    for test_result in output.test_results:
+        if display_mode != TestRunResultDisplay.ALL and (
+            (display_mode == TestRunResultDisplay.FAILING and test_result.success)
+            or (display_mode == TestRunResultDisplay.PASSING and not test_result.success)
+        ):
+            continue
+
+        # Info Table
+        info_table = Table(show_header=False, box=None, pad_edge=False)
+        info_table.add_row("Input", str(test_result.input))
+        info_table.add_row("Expected Output", str(test_result.expected_output))
+        info_table.add_row("Actual Output", str(test_result.actual_output))
+
+        # Metrics Table
+        metrics_table = Table(title="Metrics", show_header=True, header_style="bold magenta")
+        metrics_table.add_column("Metric")
+        metrics_table.add_column("Success")
+        metrics_table.add_column("Score")
+        metrics_table.add_column("Threshold")
+        metrics_table.add_column("Reason")
+        metrics_table.add_column("Error")
+        # metrics_table.add_column("Verbose Log")
+
+        for metric_data in test_result.metrics_data or []:
+            metrics_table.add_row(
+                str(metric_data.name),
+                str(metric_data.success),
+                str(metric_data.score),
+                str(metric_data.threshold),
+                str(metric_data.reason),
+                str(metric_data.error) if metric_data.error else "",
+                # str(metric_data.verbose_logs),
+            )
+
+        # Print the panel with info and metrics table
+        console.print(
+            Panel(
+                Group(info_table, metrics_table),
+                title=f"[bold blue]{test_result.name}[/bold blue]",
+                border_style="blue",
+            )
+        )
+
+    # Gather failed tests
+    if failed:
+        pytest.fail(f"{failed}/{total} tests failed. See the summary table above for more details.", pytrace=False)
+    else:
+        assert 1 == 1
diff --git a/beeai/agents/tests/model.py b/beeai/agents/tests/model.py
new file mode 100644
index 00000000..76308345
--- /dev/null
+++ b/beeai/agents/tests/model.py
@@ -0,0 +1,59 @@
+# Copyright 2025 © BeeAI a Series of LF Projects, LLC
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from typing import Any, TypeVar
+
+from deepeval.key_handler import KEY_FILE_HANDLER, KeyValues
+from deepeval.models import DeepEvalBaseLLM
+from dotenv import load_dotenv
+from pydantic import BaseModel
+
+from beeai_framework.backend import ChatModel, ChatModelParameters
+from beeai_framework.backend.constants import ProviderName
+from beeai_framework.backend.message import UserMessage
+from beeai_framework.middleware.trajectory import GlobalTrajectoryMiddleware
+from beeai_framework.utils import ModelLike
+
+TSchema = TypeVar("TSchema", bound=BaseModel)
+
+
+load_dotenv()
+
+
+class DeepEvalLLM(DeepEvalBaseLLM):
+    def __init__(self, model: ChatModel, *args: Any, **kwargs: Any) -> None:
+        self._model = model
+        super().__init__(model.model_id, *args, **kwargs)
+
+    def load_model(self, *args: Any, **kwargs: Any) -> None:
+        return None
+
+    def generate(self, prompt: str, schema: BaseModel | None = None) -> str:
+        raise NotImplementedError()
+
+    async def a_generate(self, prompt: str, schema: TSchema | None = None) -> str:
+        input_msg = UserMessage(prompt)
+        response = await self._model.create(
+            messages=[input_msg],
+            response_format=schema.model_json_schema(mode="serialization") if schema is not None else None,
+            stream=False,
+            temperature=0,
+        ).middleware(
+            GlobalTrajectoryMiddleware(
+                pretty=True, exclude_none=True, enabled=os.environ.get("EVAL_LOG_LLM_CALLS", "").lower() == "true"
+            )
+        )
+        text = response.get_text_content()
+        return schema.model_validate_json(text) if schema else text  # type: ignore
+
+    def get_model_name(self) -> str:
+        return f"{self._model.model_id} ({self._model.provider_id})"
+
+    @staticmethod
+    def from_name(
+        name: str | ProviderName | None = None, options: ModelLike[ChatModelParameters] | None = None, **kwargs: Any
+    ) -> "DeepEvalLLM":
+        name = name or KEY_FILE_HANDLER.fetch_data(KeyValues.LOCAL_MODEL_NAME)
+        model = ChatModel.from_name(name, options, **kwargs)
+        return DeepEvalLLM(model)
diff --git a/beeai/agents/tests/test_triage_agent.py b/beeai/agents/tests/test_triage_agent.py
new file mode 100644
index 00000000..0d41bd18
--- /dev/null
+++ b/beeai/agents/tests/test_triage_agent.py
@@ -0,0 +1,97 @@
+import os
+
+import pytest
+
+from beeai_framework.agents.experimental.utils._tool import FinalAnswerTool
+from beeai_framework.tools.think import ThinkTool
+from beeai_framework.utils.strings import to_json
+
+from deepeval import assert_test
+from deepeval.dataset import Golden
+from deepeval.metrics import BaseMetric, GEval
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
+
+from base_agent import BaseAgent
+from observability import setup_observability
+from triage_agent import TriageAgent, InputSchema, OutputSchema, Resolution, NoActionData
+
+from model import DeepEvalLLM
+from _utils import create_dataset, evaluate_dataset
+
+
+async def run_agent(agent: BaseAgent, test_case: LLMTestCase) -> None:
+    await agent.run_with_schema(
+        agent.input_schema.model_validate_json(test_case.input), capture_raw_response=True
+    )
+    response = agent.last_raw_response
+    test_case.tools_called = []
+    test_case.actual_output = response.answer.text
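+    # Convert the agent's intermediate steps into deepeval ToolCall records so the
+    # metrics can compare them against the golden `expected_tools`.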
+    for index, step in enumerate(response.state.steps):
+        if not step.tool or isinstance(step.tool, FinalAnswerTool):
+            continue
+        # the ThinkTool step directly preceding a tool call provides its reasoning
+        prev_step = response.state.steps[index - 1] if index > 0 else None
+        test_case.tools_called.append(
+            ToolCall(
+                name=step.tool.name,
+                description=step.tool.description,
+                input_parameters=step.input,
+                output=step.output.get_text_content(),
+                reasoning=(
+                    to_json(prev_step.input, indent=2, sort_keys=False)
+                    if prev_step and isinstance(prev_step.tool, ThinkTool)
+                    else None
+                ),
+            )
+        )
+
+
+@pytest.mark.asyncio
+async def test_triage():
+    setup_observability(os.getenv("COLLECTOR_ENDPOINT"))
+
+    dataset = await create_dataset(
+        name="Triage",
+        agent_factory=lambda: TriageAgent(),
+        agent_run=run_agent,
+        goldens=[
+            Golden(
+                input=InputSchema(issue="RHEL-12345").model_dump_json(),
+                expected_output=OutputSchema(
+                    resolution=Resolution.NO_ACTION,
+                    data=NoActionData(reasoning="The issue is not a fixable bug", jira_issue="RHEL-12345"),
+                ).model_dump_json(),
+                expected_tools=[
+                    # ToolCall(
+                    #     name="get_jira_details",
+                    #     reasoning="TODO",
+                    #     input={"issue_key": "RHEL-12345"},
+                    #     output="TODO",
+                    # ),
+                ],
+            )
+        ],
+    )
+
+    correctness_metric = GEval(
+        name="Correctness",
+        criteria="\n - ".join(
+            [
+                "Reasoning must be factually equal to the expected one",
+                "`jira_issue` in the output must match `issue` in the input",
+            ]
+        ),
+        evaluation_params=[
+            LLMTestCaseParams.INPUT,
+            LLMTestCaseParams.ACTUAL_OUTPUT,
+            LLMTestCaseParams.EXPECTED_OUTPUT,
+            LLMTestCaseParams.TOOLS_CALLED,
+            LLMTestCaseParams.EXPECTED_TOOLS,
+        ],
+        verbose_mode=True,
+        model=DeepEvalLLM.from_name(os.getenv("CHAT_MODEL")),
+        threshold=0.65,
+    )
+    metrics: list[BaseMetric] = [correctness_metric]
+    evaluate_dataset(dataset, metrics)
diff --git a/beeai/agents/triage_agent.py b/beeai/agents/triage_agent.py
index 555d9fd9..bb68a095 100644
--- a/beeai/agents/triage_agent.py
+++ b/beeai/agents/triage_agent.py
@@ -258,14 +258,14 @@ def prompt(self) -> str:
           e.g., "The request is for a new feature ('add dark mode') which is not appropriate for a bugfix update in RHEL."]
         """
 
-    async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
+    async def run_with_schema(self, input: TInputSchema, capture_raw_response: bool = False) -> TOutputSchema:
         async with mcp_tools(
             os.getenv("MCP_GATEWAY_URL"), filter=lambda t: t == "get_jira_details"
         ) as gateway_tools:
             tools = self._tools.copy()
             try:
                 self._tools.extend(gateway_tools)
-                return await self._run_with_schema(input)
+                return await self._run_with_schema(input, capture_raw_response=capture_raw_response)
             finally:
                 self._tools = tools  # disassociate removed tools from requirements
 
diff --git a/beeai/beeai-instrumentation.patch b/beeai/beeai-instrumentation.patch
index dd7ec978..389050c8 100644
--- a/beeai/beeai-instrumentation.patch
+++ b/beeai/beeai-instrumentation.patch
@@ -42,7 +42,7 @@ index 21da0aa2..c32ae843 100644
 
  from beeai_framework.agents.react.agent import ReActAgent
 diff --git a/python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/middleware.py b/python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/middleware.py
-index f8150f04..24a0781e 100644
+index f8150f04..b94c04ff 100644
 --- a/python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/middleware.py
 +++ b/python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/middleware.py
 @@ -19,6 +19,8 @@ from importlib.metadata import PackageNotFoundError, version
@@ -68,15 +68,15 @@ index f8150f04..24a0781e 100644
 +                    for m in requirement_agent_typed_data.state.memory.messages
 +                ]
 +                if (
-+                    hasattr(requirement_agent_typed_data.state, "result")
-+                    and requirement_agent_typed_data.state.result is not None
++                    hasattr(requirement_agent_typed_data.state, "answer")
++                    and requirement_agent_typed_data.state.answer is not None
 +                ):
-+                    result_role = requirement_agent_typed_data.state.result.role
++                    result_role = requirement_agent_typed_data.state.answer.role
 +                    generated_message = {
 +                        "role": result_role.value
 +                        if hasattr(result_role, "value")
 +                        else result_role,
-+                        "text": requirement_agent_typed_data.state.result.text,
++                        "text": requirement_agent_typed_data.state.answer.text,
 +                    }
              except Exception as e:
                  logger.error("Instrumentation error: failed to extract success message", exc_info=e)