From 5b21ab8af33f74b94f119de9593e819e3d7d6b11 Mon Sep 17 00:00:00 2001 From: Gus Fraser Date: Sat, 2 Aug 2025 17:26:04 +0100 Subject: [PATCH] Improved reporting --- replicantx/cli.py | 62 +++++++++++++++-------- replicantx/models.py | 2 + replicantx/reporters/json.py | 20 ++++++++ replicantx/reporters/markdown.py | 11 ++++ replicantx/scenarios/agent.py | 86 +++++++++++++++++++++++++++++++- replicantx/scenarios/basic.py | 42 ++++++++++++++++ 6 files changed, 200 insertions(+), 23 deletions(-) diff --git a/replicantx/cli.py b/replicantx/cli.py index cb6a76b..23138ff 100644 --- a/replicantx/cli.py +++ b/replicantx/cli.py @@ -234,7 +234,7 @@ async def run_scenarios_async( suite_report.completed_at = datetime.now() # Display summary - display_summary(suite_report) + display_summary(suite_report, verbose) # Generate reports if report_path: @@ -487,7 +487,7 @@ def load_scenario_config(file_path: str) -> ScenarioConfig: raise Exception(f"Invalid scenario configuration: {e}") -def display_summary(suite_report: TestSuiteReport): +def display_summary(suite_report: TestSuiteReport, verbose: bool = False): """Display test execution summary. 
Args: @@ -517,27 +517,45 @@ def display_summary(suite_report: TestSuiteReport): # Scenario details if suite_report.scenario_reports: - console.print("\nšŸ“‹ Scenario Details") - - scenario_table = Table(show_header=True, header_style="bold blue") - scenario_table.add_column("Scenario") - scenario_table.add_column("Status") - scenario_table.add_column("Steps") - scenario_table.add_column("Duration") - - for scenario in suite_report.scenario_reports: - status = "āœ… PASS" if scenario.passed else "āŒ FAIL" - steps = f"{scenario.passed_steps}/{scenario.total_steps}" - duration = f"{scenario.duration_seconds:.2f}s" + console.print("\nšŸ“‹ Scenario Details") - scenario_table.add_row( - scenario.scenario_name, - status, - steps, - duration - ) - - console.print(scenario_table) + scenario_table = Table(show_header=True, header_style="bold blue") + scenario_table.add_column("Scenario") + scenario_table.add_column("Status") + scenario_table.add_column("Steps") + scenario_table.add_column("Duration") + scenario_table.add_column("Justification") + + for scenario in suite_report.scenario_reports: + status = "āœ… PASS" if scenario.passed else "āŒ FAIL" + steps = f"{scenario.passed_steps}/{scenario.total_steps}" + duration = f"{scenario.duration_seconds:.2f}s" + justification = scenario.justification or "No justification available" + + # Truncate justification for table display + if len(justification) > 80: + justification = justification[:77] + "..." 
+ + scenario_table.add_row( + scenario.scenario_name, + status, + steps, + duration, + justification + ) + + console.print(scenario_table) + + # Show detailed justification for failed scenarios + failed_scenarios = [s for s in suite_report.scenario_reports if not s.passed] + if failed_scenarios and verbose: + console.print("\nšŸ” Detailed Justification for Failed Scenarios") + for scenario in failed_scenarios: + console.print(f"\n**{scenario.scenario_name}**") + if scenario.justification: + console.print(f"šŸ’­ {scenario.justification}") + if scenario.error: + console.print(f"āŒ Error: {scenario.error}") def generate_reports(suite_report: TestSuiteReport, report_path: str): diff --git a/replicantx/models.py b/replicantx/models.py index 069e640..91f55fa 100644 --- a/replicantx/models.py +++ b/replicantx/models.py @@ -264,6 +264,8 @@ class ScenarioReport(BaseModel): step_results: List[StepResult] = Field(default_factory=list, description="Results for each step") error: Optional[str] = Field(None, description="Overall error message if scenario failed") conversation_history: Optional[str] = Field(None, description="Complete conversation history for agent scenarios") + justification: Optional[str] = Field(None, description="Explanation of why the scenario passed or failed") + goal_evaluation_result: Optional[GoalEvaluationResult] = Field(None, description="Goal evaluation result for agent scenarios") started_at: datetime = Field(default_factory=datetime.now, description="When scenario started") completed_at: Optional[datetime] = Field(None, description="When scenario completed") diff --git a/replicantx/reporters/json.py b/replicantx/reporters/json.py index 19574ec..83c54b5 100644 --- a/replicantx/reporters/json.py +++ b/replicantx/reporters/json.py @@ -72,6 +72,8 @@ def _serialize_scenario_report(self, report: ScenarioReport) -> Dict[str, Any]: "completed_at": report.completed_at.isoformat() if report.completed_at else None, }, "error": report.error, + 
def _serialize_goal_evaluation_result(self, result: 'GoalEvaluationResult') -> Dict[str, Any]:
    """Serialize a goal evaluation result to dictionary.

    Args:
        result: Goal evaluation result to serialize

    Returns:
        Dictionary representation of the goal evaluation result. The
        ``timestamp`` entry is an ISO-8601 string, or ``None`` when the
        result carries no timestamp.
    """
    # Guard the timestamp the same way the scenario serializer guards
    # ``completed_at``: a missing value must not abort report generation.
    timestamp = result.timestamp
    return {
        "goal_achieved": result.goal_achieved,
        "confidence": result.confidence,
        "reasoning": result.reasoning,
        "evaluation_method": result.evaluation_method,
        "fallback_used": result.fallback_used,
        "timestamp": timestamp.isoformat() if timestamp is not None else None,
    }
diff --git a/replicantx/reporters/markdown.py b/replicantx/reporters/markdown.py index 4625a26..f884b36 100644 --- a/replicantx/reporters/markdown.py +++ b/replicantx/reporters/markdown.py @@ -215,6 +215,17 @@ def _generate_test_suite_markdown(self, report: TestSuiteReport) -> str: lines.append(f"**Steps:** {scenario.passed_steps}/{scenario.total_steps}") lines.append(f"**Success Rate:** {scenario.success_rate:.1f}%") lines.append(f"**Duration:** {scenario.duration_seconds:.2f}s") + if scenario.justification: + lines.append(f"**Justification:** {scenario.justification}") + + # Goal evaluation details for agent scenarios + if scenario.goal_evaluation_result: + lines.append(f"**Goal Evaluation:**") + lines.append(f"- Method: {scenario.goal_evaluation_result.evaluation_method}") + lines.append(f"- Confidence: {scenario.goal_evaluation_result.confidence:.2f}") + lines.append(f"- Fallback Used: {'Yes' if scenario.goal_evaluation_result.fallback_used else 'No'}") + lines.append(f"- Reasoning: {scenario.goal_evaluation_result.reasoning}") + lines.append("") # Complete conversation history for agent scenarios diff --git a/replicantx/scenarios/agent.py b/replicantx/scenarios/agent.py index e67aa72..a6e7269 100644 --- a/replicantx/scenarios/agent.py +++ b/replicantx/scenarios/agent.py @@ -10,7 +10,7 @@ import asyncio from datetime import datetime -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Any from ..auth import AuthBase, SupabaseAuth, JWTAuth, NoopAuth from ..models import ( @@ -289,6 +289,13 @@ async def run(self) -> ScenarioReport: goal_achieved = conversation_summary.get('goal_achieved', False) report.passed = report.passed and goal_achieved + # Store goal evaluation result if available + if hasattr(self.replicant_agent.state, 'goal_evaluation_result') and self.replicant_agent.state.goal_evaluation_result: + report.goal_evaluation_result = self.replicant_agent.state.goal_evaluation_result + + # Generate justification for the overall 
def _generate_justification(self, report: 'ScenarioReport', conversation_summary: Dict[str, Any]) -> str:
    """Generate justification for the scenario result.

    Args:
        report: The scenario report. Reads ``passed``, ``passed_steps``,
            ``total_steps``, ``failed_steps``, ``step_results`` and ``error``.
        conversation_summary: Summary from the replicant agent. Reads the
            keys ``goal_achieved``, ``total_turns``, ``facts_used`` and the
            optional ``goal_evaluation_method`` / ``goal_evaluation_confidence``
            / ``goal_evaluation_reasoning``.

    Returns:
        Justification string explaining why the scenario passed or failed.
        Always a non-empty sentence ending in a period.
    """
    goal_achieved = conversation_summary.get('goal_achieved', False)
    total_turns = conversation_summary.get('total_turns', 0)
    facts_used = conversation_summary.get('facts_used', 0)

    def goal_evaluation_parts() -> List[str]:
        # Shared by the pass and fail branches so both describe the goal
        # evaluation identically.
        if 'goal_evaluation_method' not in conversation_summary:
            return []
        method = conversation_summary.get('goal_evaluation_method', 'unknown')
        # A key that is present but None would break the ``:.2f`` format and
        # lose the whole justification; fall back to the missing-key defaults.
        confidence = conversation_summary.get('goal_evaluation_confidence')
        if confidence is None:
            confidence = 0.0
        reasoning = conversation_summary.get('goal_evaluation_reasoning') or 'No reasoning provided'
        return [
            f"Goal evaluation: {method} method with {confidence:.2f} confidence",
            f"Reasoning: {reasoning}",
        ]

    justification_parts = []

    if report.passed:
        # Scenario passed - explain why
        if report.passed_steps == report.total_steps:
            justification_parts.append(f"All {report.total_steps} steps passed successfully")
        else:
            justification_parts.append(f"{report.passed_steps}/{report.total_steps} steps passed")

        if goal_achieved:
            justification_parts.append("Goal was achieved")
            justification_parts.extend(goal_evaluation_parts())
        else:
            justification_parts.append("Goal was not achieved")

        justification_parts.append(f"Conversation completed in {total_turns} turns")
        if facts_used > 0:
            justification_parts.append(f"Used {facts_used} available facts")

        return ". ".join(justification_parts) + "."

    # Scenario failed - explain why
    if report.failed_steps > 0:
        failed_step_details = []
        for step in report.step_results:
            if step.passed:
                continue
            failed_step_details.append(f"Step {step.step_index + 1}")
            if step.error:
                failed_step_details.append(f"Error: {step.error}")
            elif step.assertions:
                failed_count = sum(1 for assertion in step.assertions if not assertion.passed)
                if failed_count:
                    failed_step_details.append(f"Failed assertions: {failed_count}")

        if failed_step_details:
            justification_parts.append(f"Failed steps: {', '.join(failed_step_details)}")
        else:
            # The counter says steps failed but no failing step result was
            # recorded; avoid emitting a dangling "Failed steps: ".
            justification_parts.append(f"{report.failed_steps} step(s) failed without recorded details")

    if not goal_achieved:
        justification_parts.append("Goal was not achieved")
        justification_parts.extend(goal_evaluation_parts())

    if report.error:
        justification_parts.append(f"Error: {report.error}")

    if not justification_parts:
        # Never return a bare "." when nothing explains the failure.
        return "Scenario failed, but no failing step, unmet goal or error was recorded."

    return ". ".join(justification_parts) + "."
def _generate_justification(self, report: 'ScenarioReport') -> str:
    """Generate justification for the scenario result.

    Args:
        report: The scenario report. Reads ``passed``, ``passed_steps``,
            ``total_steps``, ``failed_steps``, ``step_results`` and ``error``.

    Returns:
        Justification string explaining why the scenario passed or failed.
        Always a non-empty sentence ending in a period.
    """
    if report.passed:
        if report.passed_steps == report.total_steps:
            return f"All {report.total_steps} steps passed successfully with all assertions satisfied."
        return f"{report.passed_steps}/{report.total_steps} steps passed. Some steps may have been skipped due to configuration."

    # Scenario failed - explain why
    justification_parts = []

    if report.failed_steps > 0:
        failed_step_details = []
        for step in report.step_results:
            if step.passed:
                continue
            failed_step_details.append(f"Step {step.step_index + 1}")
            if step.error:
                failed_step_details.append(f"Error: {step.error}")
            elif step.assertions:
                failed_count = sum(1 for assertion in step.assertions if not assertion.passed)
                if failed_count:
                    failed_step_details.append(f"Failed assertions: {failed_count}")

        if failed_step_details:
            justification_parts.append(f"Failed steps: {', '.join(failed_step_details)}")
        else:
            # The counter says steps failed but no failing step result was
            # recorded; avoid emitting a dangling "Failed steps: ".
            justification_parts.append(f"{report.failed_steps} step(s) failed without recorded details")

    if report.error:
        justification_parts.append(f"Error: {report.error}")

    if not justification_parts:
        # Never return a bare "." when nothing explains the failure.
        return "Scenario failed, but no failing step or error was recorded."

    return ". ".join(justification_parts) + "."