diff --git a/beeai/Containerfile b/beeai/Containerfile
index 379affcf..a81f7927 100644
--- a/beeai/Containerfile
+++ b/beeai/Containerfile
@@ -28,6 +28,7 @@ RUN pip3 install --no-cache-dir \
     beeai-framework[mcp,duckduckgo]==0.1.31 \
     openinference-instrumentation-beeai \
     arize-phoenix-otel \
+    deepeval \
     && cd /usr/local/lib/python3.13/site-packages \
     && patch -p2 -i /tmp/beeai-gemini.patch \
     && patch -p2 -i /tmp/beeai-gemini-malformed-function-call.patch \
diff --git a/beeai/agents/backport_agent.py b/beeai/agents/backport_agent.py
index c4a72db3..a956ae8a 100644
--- a/beeai/agents/backport_agent.py
+++ b/beeai/agents/backport_agent.py
@@ -144,7 +144,7 @@ def prompt(self) -> str:
           6. {{ backport_git_steps }}
         """
 
-    async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
+    async def run_with_schema(self, input: TInputSchema, capture_raw_response: bool = False) -> TOutputSchema:
         async with mcp_tools(
             os.getenv("MCP_GATEWAY_URL"),
             filter=lambda t: t
@@ -153,7 +153,7 @@ async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
             tools = self._tools.copy()
             try:
                 self._tools.extend(gateway_tools)
-                return await self._run_with_schema(input)
+                return await self._run_with_schema(input, capture_raw_response=capture_raw_response)
             finally:
                 self._tools = tools  # disassociate removed tools from requirements
 
diff --git a/beeai/agents/base_agent.py b/beeai/agents/base_agent.py
index 1ebe7b72..1bc08bb4 100644
--- a/beeai/agents/base_agent.py
+++ b/beeai/agents/base_agent.py
@@ -14,6 +14,8 @@ class BaseAgent(RequirementAgent, ABC):
+    last_raw_response: RequirementAgentRunOutput | None = None
+
     @property
     @abstractmethod
     def input_schema(self) -> type[TInputSchema]: ...
@@ -32,7 +34,9 @@ def _render_prompt(self, input: TInputSchema) -> str:
         )
         return template.render(input)
 
-    async def _run_with_schema(self, input: TInputSchema) -> TOutputSchema:
+    async def _run_with_schema(
+        self, input: TInputSchema, capture_raw_response: bool = False
+    ) -> TOutputSchema:
         max_retries_per_step = int(os.getenv("BEEAI_MAX_RETRIES_PER_STEP", 5))
         total_max_retries = int(os.getenv("BEEAI_TOTAL_MAX_RETRIES", 10))
         max_iterations = int(os.getenv("BEEAI_MAX_ITERATIONS", 100))
@@ -46,10 +50,14 @@ async def _run_with_schema(self, input: TInputSchema) -> TOutputSchema:
                 max_iterations=max_iterations,
             ),
         )
+        if capture_raw_response:
+            self.last_raw_response = response
         return self.output_schema.model_validate_json(response.result.text)
 
-    async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
-        return await self._run_with_schema(input)
+    async def run_with_schema(
+        self, input: TInputSchema, capture_raw_response: bool = False
+    ) -> TOutputSchema:
+        return await self._run_with_schema(input, capture_raw_response)
 
 
 if os.getenv("LITELLM_DEBUG"):
@@ -58,4 +66,5 @@ async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
     import beeai_framework.adapters.litellm.chat
     import beeai_framework.adapters.litellm.embedding
     from beeai_framework.adapters.litellm.utils import litellm_debug
 
+    litellm_debug(True)
diff --git a/beeai/agents/rebase_agent.py b/beeai/agents/rebase_agent.py
index 48cc215c..8daeb471 100644
--- a/beeai/agents/rebase_agent.py
+++ b/beeai/agents/rebase_agent.py
@@ -173,7 +173,7 @@ def prompt(self) -> str:
           - Any validation issues found with rpmlint
         """
 
-    async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
+    async def run_with_schema(self, input: TInputSchema, capture_raw_response: bool = False) -> TOutputSchema:
         async with mcp_tools(
             os.getenv("MCP_GATEWAY_URL"),
             filter=lambda t: t
@@ -182,7 +182,7 @@ async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
             tools = self._tools.copy()
             try:
                 self._tools.extend(gateway_tools)
-                return await self._run_with_schema(input)
+                return await self._run_with_schema(input, capture_raw_response=capture_raw_response)
             finally:
                 self._tools = tools  # disassociate removed tools from requirements
 
diff --git a/beeai/agents/tests/_utils.py b/beeai/agents/tests/_utils.py
new file mode 100644
index 00000000..47351091
--- /dev/null
+++ b/beeai/agents/tests/_utils.py
@@ -0,0 +1,166 @@
+# Copyright 2025 © BeeAI a Series of LF Projects, LLC
+# SPDX-License-Identifier: Apache-2.0
+
+import asyncio
+import os
+from collections.abc import Awaitable, Callable
+from pathlib import Path
+from typing import TypeVar
+
+import pytest
+from deepeval import evaluate
+from deepeval.dataset import EvaluationDataset, Golden
+from deepeval.evaluate import DisplayConfig
+from deepeval.metrics import BaseMetric
+from deepeval.test_case import LLMTestCase
+from deepeval.test_run.test_run import TestRunResultDisplay
+from rich.console import Console, Group
+from rich.panel import Panel
+from rich.table import Table
+
+from beeai_framework.agents import AnyAgent
+
+ROOT_CACHE_DIR = f"/tmp/.cache"
+Path(ROOT_CACHE_DIR).mkdir(parents=True, exist_ok=True)
+
+
+T = TypeVar("T", bound=AnyAgent)
+
+
+async def create_dataset(
+    *,
+    name: str,
+    agent_factory: Callable[[], T],
+    agent_run: Callable[[T, LLMTestCase], Awaitable[None]],
+    goldens: list[Golden],
+    cache: bool | None = None,
+) -> EvaluationDataset:
+    dataset = EvaluationDataset()
+
+    cache_dir = Path(f"{ROOT_CACHE_DIR}/{name}")
+    if cache is None:
+        cache = os.getenv("EVAL_CACHE_DATASET", "").lower() == "true"
+
+    # Reuse cached test cases when dataset caching is enabled; otherwise run the agent to generate them.
+    if cache and cache_dir.exists():
+        for file_path in cache_dir.glob("*.json"):
+            dataset.add_test_cases_from_json_file(
+                file_path=str(file_path.absolute().resolve()),
+                input_key_name="input",
+                actual_output_key_name="actual_output",
+                expected_output_key_name="expected_output",
+                context_key_name="context",
+                tools_called_key_name="tools_called",
+                expected_tools_key_name="expected_tools",
+                retrieval_context_key_name="retrieval_context",
+            )
+    else:
+
+        async def process_golden(golden: Golden) -> LLMTestCase:
+            agent = agent_factory()
+            case = LLMTestCase(
+                input=golden.input,
+                expected_tools=golden.expected_tools,
+                actual_output="",
+                expected_output=golden.expected_output,
+                comments=golden.comments,
+                context=golden.context,
+                tools_called=golden.tools_called,
+                retrieval_context=golden.retrieval_context,
+                additional_metadata=golden.additional_metadata,
+            )
+            await agent_run(agent, case)
+            return case
+
+        for test_case in await asyncio.gather(*[process_golden(golden) for golden in goldens], return_exceptions=False):
+            dataset.add_test_case(test_case)
+
+        if cache:
+            dataset.save_as(file_type="json", directory=str(cache_dir.absolute()), include_test_cases=True)
+
+    for case in dataset.test_cases:
+        case.name = f"{name} - {case.input[0:128].strip()}"  # type: ignore
+
+    return dataset
+
+
+def evaluate_dataset(
+    dataset: EvaluationDataset, metrics: list[BaseMetric], display_mode: TestRunResultDisplay | None = None
+) -> None:
+    console = Console()
+    console.print("[bold green]Evaluating dataset[/bold green]")
+
+    if display_mode is None:
+        display_mode = TestRunResultDisplay(os.environ.get("EVAL_DISPLAY_MODE", "all"))
+
+    output = evaluate(
+        test_cases=dataset.test_cases,  # type: ignore
+        metrics=metrics,
+        display_config=DisplayConfig(
+            show_indicator=False, print_results=False, verbose_mode=False, display_option=None
+        ),
+    )
+
+    # Calculate pass/fail counts
+    total = len(output.test_results)
+    passed = sum(
+        bool(test_result.metrics_data) and all(md.success for md in (test_result.metrics_data or []))
+        for test_result in output.test_results
+    )
+    failed = total - passed
+
+    # Print summary table
+    summary_table = Table(title="Test Results Summary", show_header=True, header_style="bold cyan")
+    summary_table.add_column("Total", justify="right")
+    summary_table.add_column("Passed", justify="right", style="green")
+    summary_table.add_column("Failed", justify="right", style="red")
+    summary_table.add_row(str(total), str(passed), str(failed))
+    console.print(summary_table)
+
+    for test_result in output.test_results:
+        if display_mode != TestRunResultDisplay.ALL and (
+            (display_mode == TestRunResultDisplay.FAILING and test_result.success)
+            or (display_mode == TestRunResultDisplay.PASSING and not test_result.success)
+        ):
+            continue
+
+        # Info Table
+        info_table = Table(show_header=False, box=None, pad_edge=False)
+        info_table.add_row("Input", str(test_result.input))
+        info_table.add_row("Expected Output", str(test_result.expected_output))
+        info_table.add_row("Actual Output", str(test_result.actual_output))
+
+        # Metrics Table
+        metrics_table = Table(title="Metrics", show_header=True, header_style="bold magenta")
+        metrics_table.add_column("Metric")
+        metrics_table.add_column("Success")
+        metrics_table.add_column("Score")
+        metrics_table.add_column("Threshold")
+        metrics_table.add_column("Reason")
+        metrics_table.add_column("Error")
+        # metrics_table.add_column("Verbose Log")
+
+        for metric_data in test_result.metrics_data or []:
+            metrics_table.add_row(
+                str(metric_data.name),
+                str(metric_data.success),
+                str(metric_data.score),
+                str(metric_data.threshold),
+                str(metric_data.reason),
+                str(metric_data.error) if metric_data.error else "",
+                # str(metric_data.verbose_logs),
+            )
+
+        # Print the panel with info and metrics table
+        console.print(
+            Panel(
+                Group(info_table, metrics_table),
+                title=f"[bold blue]{test_result.name}[/bold blue]",
+                border_style="blue",
+            )
+        )
+
+    # Gather failed tests
+    if failed:
+        pytest.fail(f"{failed}/{total} tests failed. See the summary table above for more details.", pytrace=False)
+    else:
+        assert 1 == 1
diff --git a/beeai/agents/tests/model.py b/beeai/agents/tests/model.py
new file mode 100644
index 00000000..76308345
--- /dev/null
+++ b/beeai/agents/tests/model.py
@@ -0,0 +1,59 @@
+# Copyright 2025 © BeeAI a Series of LF Projects, LLC
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+from typing import Any, TypeVar
+
+from deepeval.key_handler import KEY_FILE_HANDLER, KeyValues
+from deepeval.models import DeepEvalBaseLLM
+from dotenv import load_dotenv
+from pydantic import BaseModel
+
+from beeai_framework.backend import ChatModel, ChatModelParameters
+from beeai_framework.backend.constants import ProviderName
+from beeai_framework.backend.message import UserMessage
+from beeai_framework.middleware.trajectory import GlobalTrajectoryMiddleware
+from beeai_framework.utils import ModelLike
+
+TSchema = TypeVar("TSchema", bound=BaseModel)
+
+
+load_dotenv()
+
+
+class DeepEvalLLM(DeepEvalBaseLLM):
+    def __init__(self, model: ChatModel, *args: Any, **kwargs: Any) -> None:
+        self._model = model
+        super().__init__(model.model_id, *args, **kwargs)
+
+    def load_model(self, *args: Any, **kwargs: Any) -> None:
+        return None
+
+    def generate(self, prompt: str, schema: BaseModel | None = None) -> str:
+        raise NotImplementedError()
+
+    async def a_generate(self, prompt: str, schema: TSchema | None = None) -> str:
+        input_msg = UserMessage(prompt)
+        response = await self._model.create(
+            messages=[input_msg],
+            response_format=schema.model_json_schema(mode="serialization") if schema is not None else None,
+            stream=False,
+            temperature=0,
+        ).middleware(
+            GlobalTrajectoryMiddleware(
+                pretty=True, exclude_none=True, enabled=os.environ.get("EVAL_LOG_LLM_CALLS", "").lower() == "true"
+            )
+        )
+        text = response.get_text_content()
+        return schema.model_validate_json(text) if schema else text  # type: ignore
+
+    def get_model_name(self) -> str:
+        return f"{self._model.model_id} ({self._model.provider_id})"
+
+    @staticmethod
+    def from_name(
+        name: str | ProviderName | None = None, options: ModelLike[ChatModelParameters] | None = None, **kwargs: Any
+    ) -> "DeepEvalLLM":
+        name = name or KEY_FILE_HANDLER.fetch_data(KeyValues.LOCAL_MODEL_NAME)
+        model = ChatModel.from_name(name, options, **kwargs)
+        return DeepEvalLLM(model)
diff --git a/beeai/agents/tests/test_triage_agent.py b/beeai/agents/tests/test_triage_agent.py
new file mode 100644
index 00000000..0d41bd18
--- /dev/null
+++ b/beeai/agents/tests/test_triage_agent.py
@@ -0,0 +1,97 @@
+import os
+
+import pytest
+
+from beeai_framework.agents.experimental.utils._tool import FinalAnswerTool
+from beeai_framework.tools.think import ThinkTool
+from beeai_framework.utils.strings import to_json
+
+from deepeval import assert_test
+from deepeval.dataset import Golden
+from deepeval.metrics import BaseMetric, GEval
+from deepeval.test_case import LLMTestCase, LLMTestCaseParams, ToolCall
+
+from base_agent import BaseAgent
+from observability import setup_observability
+from triage_agent import TriageAgent, InputSchema, OutputSchema, Resolution, NoActionData
+
+from model import DeepEvalLLM
+from _utils import create_dataset, evaluate_dataset
+
+
+async def run_agent(agent: BaseAgent, test_case: LLMTestCase) -> None:
+    await agent.run_with_schema(
+        agent.input_schema.model_validate_json(test_case.input), capture_raw_response=True
+    )
+    response = agent.last_raw_response
+    test_case.tools_called = []
+    test_case.actual_output = response.answer.text
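+    # Convert the agent's intermediate steps into deepeval ToolCall records so the
+    # metrics can compare them against the golden `expected_tools`.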
+    for index, step in enumerate(response.state.steps):
+        if not step.tool or isinstance(step.tool, FinalAnswerTool):
+            continue
+        # the ThinkTool step directly preceding a tool call provides its reasoning
+        prev_step = response.state.steps[index - 1] if index > 0 else None
+        test_case.tools_called.append(
+            ToolCall(
+                name=step.tool.name,
+                description=step.tool.description,
+                input_parameters=step.input,
+                output=step.output.get_text_content(),
+                reasoning=(
+                    to_json(prev_step.input, indent=2, sort_keys=False)
+                    if prev_step and isinstance(prev_step.tool, ThinkTool)
+                    else None
+                ),
+            )
+        )
+
+
+@pytest.mark.asyncio
+async def test_triage():
+    setup_observability(os.getenv("COLLECTOR_ENDPOINT"))
+
+    dataset = await create_dataset(
+        name="Triage",
+        agent_factory=lambda: TriageAgent(),
+        agent_run=run_agent,
+        goldens=[
+            Golden(
+                input=InputSchema(issue="RHEL-12345").model_dump_json(),
+                expected_output=OutputSchema(
+                    resolution=Resolution.NO_ACTION,
+                    data=NoActionData(reasoning="The issue is not a fixable bug", jira_issue="RHEL-12345"),
+                ).model_dump_json(),
+                expected_tools=[
+                    # ToolCall(
+                    #     name="get_jira_details",
+                    #     reasoning="TODO",
+                    #     input={"issue_key": "RHEL-12345"},
+                    #     output="TODO",
+                    # ),
+                ],
+            )
+        ],
+    )
+
+    correctness_metric = GEval(
+        name="Correctness",
+        criteria="\n - ".join(
+            [
+                "Reasoning must be factually equal to the expected one",
+                "`jira_issue` in the output must match `issue` in the input",
+            ]
+        ),
+        evaluation_params=[
+            LLMTestCaseParams.INPUT,
+            LLMTestCaseParams.ACTUAL_OUTPUT,
+            LLMTestCaseParams.EXPECTED_OUTPUT,
+            LLMTestCaseParams.TOOLS_CALLED,
+            LLMTestCaseParams.EXPECTED_TOOLS,
+        ],
+        verbose_mode=True,
+        model=DeepEvalLLM.from_name(os.getenv("CHAT_MODEL")),
+        threshold=0.65,
+    )
+    metrics: list[BaseMetric] = [correctness_metric]
+    evaluate_dataset(dataset, metrics)
diff --git a/beeai/agents/triage_agent.py b/beeai/agents/triage_agent.py
index 555d9fd9..bb68a095 100644
--- a/beeai/agents/triage_agent.py
+++ b/beeai/agents/triage_agent.py
@@ -258,14 +258,14 @@ def prompt(self) -> str:
           e.g., "The request is for a new feature ('add dark mode') which is not appropriate for a bugfix update in RHEL."]
         """
 
-    async def run_with_schema(self, input: TInputSchema) -> TOutputSchema:
+    async def run_with_schema(self, input: TInputSchema, capture_raw_response: bool = False) -> TOutputSchema:
         async with mcp_tools(
             os.getenv("MCP_GATEWAY_URL"), filter=lambda t: t == "get_jira_details"
         ) as gateway_tools:
             tools = self._tools.copy()
             try:
                 self._tools.extend(gateway_tools)
-                return await self._run_with_schema(input)
+                return await self._run_with_schema(input, capture_raw_response=capture_raw_response)
             finally:
                 self._tools = tools  # disassociate removed tools from requirements
 
diff --git a/beeai/beeai-instrumentation.patch b/beeai/beeai-instrumentation.patch
index dd7ec978..389050c8 100644
--- a/beeai/beeai-instrumentation.patch
+++ b/beeai/beeai-instrumentation.patch
@@ -42,7 +42,7 @@ index 21da0aa2..c32ae843 100644
 
  from beeai_framework.agents.react.agent import ReActAgent
 diff --git a/python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/middleware.py b/python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/middleware.py
-index f8150f04..24a0781e 100644
+index f8150f04..b94c04ff 100644
 --- a/python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/middleware.py
 +++ b/python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/middleware.py
 @@ -19,6 +19,8 @@ from importlib.metadata import PackageNotFoundError, version
@@ -68,15 +68,15 @@ index f8150f04..24a0781e 100644
 +                    for m in requirement_agent_typed_data.state.memory.messages
 +                ]
 +                if (
-+                    hasattr(requirement_agent_typed_data.state, "result")
-+                    and requirement_agent_typed_data.state.result is not None
++                    hasattr(requirement_agent_typed_data.state, "answer")
++                    and requirement_agent_typed_data.state.answer is not None
 +                ):
-+                    result_role = requirement_agent_typed_data.state.result.role
++                    result_role = requirement_agent_typed_data.state.answer.role
 +                    generated_message = {
 +                        "role": result_role.value
 +                        if hasattr(result_role, "value")
 +                        else result_role,
-+                        "text": requirement_agent_typed_data.state.result.text,
++                        "text": requirement_agent_typed_data.state.answer.text,
 +                    }
              except Exception as e:
                  logger.error("Instrumentation error: failed to extract success message", exc_info=e)