From 5b21ab8af33f74b94f119de9593e819e3d7d6b11 Mon Sep 17 00:00:00 2001
From: Gus Fraser <gus@techblurt.com>
Date: Sat, 2 Aug 2025 17:26:04 +0100
Subject: [PATCH] Improved reporting

---
 replicantx/cli.py                | 62 +++++++++++++++--------
 replicantx/models.py             |  2 +
 replicantx/reporters/json.py     | 20 ++++++++
 replicantx/reporters/markdown.py | 11 ++++
 replicantx/scenarios/agent.py    | 86 +++++++++++++++++++++++++++++++-
 replicantx/scenarios/basic.py    | 42 ++++++++++++++++
 6 files changed, 200 insertions(+), 23 deletions(-)

diff --git a/replicantx/cli.py b/replicantx/cli.py
index cb6a76b..23138ff 100644
--- a/replicantx/cli.py
+++ b/replicantx/cli.py
@@ -234,7 +234,7 @@ async def run_scenarios_async(
     suite_report.completed_at = datetime.now()
     
     # Display summary
-    display_summary(suite_report)
+    display_summary(suite_report, verbose)
     
     # Generate reports
     if report_path:
@@ -487,7 +487,7 @@ def load_scenario_config(file_path: str) -> ScenarioConfig:
         raise Exception(f"Invalid scenario configuration: {e}")
 
 
-def display_summary(suite_report: TestSuiteReport):
+def display_summary(suite_report: TestSuiteReport, verbose: bool = False):
     """Display test execution summary.
     
     Args:
@@ -517,27 +517,45 @@ def display_summary(suite_report: TestSuiteReport):
     
     # Scenario details
     if suite_report.scenario_reports:
-        console.print("\n📋 Scenario Details")
-        
-        scenario_table = Table(show_header=True, header_style="bold blue")
-        scenario_table.add_column("Scenario")
-        scenario_table.add_column("Status")
-        scenario_table.add_column("Steps")
-        scenario_table.add_column("Duration")
-        
-        for scenario in suite_report.scenario_reports:
-            status = "✅ PASS" if scenario.passed else "❌ FAIL"
-            steps = f"{scenario.passed_steps}/{scenario.total_steps}"
-            duration = f"{scenario.duration_seconds:.2f}s"
+        console.print("\n📋 Scenario Details")
             
-            scenario_table.add_row(
-                scenario.scenario_name,
-                status,
-                steps,
-                duration
-            )
-        
-        console.print(scenario_table)
+        scenario_table = Table(show_header=True, header_style="bold blue")
+        scenario_table.add_column("Scenario")
+        scenario_table.add_column("Status")
+        scenario_table.add_column("Steps")
+        scenario_table.add_column("Duration")
+        scenario_table.add_column("Justification")
+
+        for scenario in suite_report.scenario_reports:
+            status = "✅ PASS" if scenario.passed else "❌ FAIL"
+            steps = f"{scenario.passed_steps}/{scenario.total_steps}"
+            duration = f"{scenario.duration_seconds:.2f}s"
+            justification = scenario.justification or "No justification available"
+
+            # Keep table cells to 80 characters; verbose mode prints the full text below
+            if len(justification) > 80:
+                justification = justification[:77] + "..."
+
+            scenario_table.add_row(
+                scenario.scenario_name,
+                status,
+                steps,
+                duration,
+                justification
+            )
+
+        console.print(scenario_table)
+
+        # Full justification and error for failed scenarios (verbose mode only)
+        failed_scenarios = [s for s in suite_report.scenario_reports if not s.passed]
+        if failed_scenarios and verbose:
+            console.print("\n🔍 Detailed Justification for Failed Scenarios")
+            for scenario in failed_scenarios:
+                console.print(f"\n**{scenario.scenario_name}**")
+                if scenario.justification:
+                    console.print(f"💭 {scenario.justification}", markup=False)
+                if scenario.error:
+                    console.print(f"❌ Error: {scenario.error}", markup=False)
 
 
 def generate_reports(suite_report: TestSuiteReport, report_path: str):
diff --git a/replicantx/models.py b/replicantx/models.py
index 069e640..91f55fa 100644
--- a/replicantx/models.py
+++ b/replicantx/models.py
@@ -264,6 +264,8 @@ class ScenarioReport(BaseModel):
     step_results: List[StepResult] = Field(default_factory=list, description="Results for each step")
     error: Optional[str] = Field(None, description="Overall error message if scenario failed")
     conversation_history: Optional[str] = Field(None, description="Complete conversation history for agent scenarios")
+    justification: Optional[str] = Field(None, description="Explanation of why the scenario passed or failed")
+    goal_evaluation_result: Optional[GoalEvaluationResult] = Field(None, description="Goal evaluation result for agent scenarios")
     started_at: datetime = Field(default_factory=datetime.now, description="When scenario started")
     completed_at: Optional[datetime] = Field(None, description="When scenario completed")
     
diff --git a/replicantx/reporters/json.py b/replicantx/reporters/json.py
index 19574ec..83c54b5 100644
--- a/replicantx/reporters/json.py
+++ b/replicantx/reporters/json.py
@@ -72,6 +72,8 @@ def _serialize_scenario_report(self, report: ScenarioReport) -> Dict[str, Any]:
                 "completed_at": report.completed_at.isoformat() if report.completed_at else None,
             },
             "error": report.error,
+            "justification": report.justification,
+            "goal_evaluation_result": self._serialize_goal_evaluation_result(report.goal_evaluation_result) if report.goal_evaluation_result else None,
             "step_results": [
                 self._serialize_step_result(step) for step in report.step_results
             ],
@@ -149,6 +151,24 @@ def _serialize_step_result(self, step: 'StepResult') -> Dict[str, Any]:
             ]
         }
     
+    def _serialize_goal_evaluation_result(self, result: 'GoalEvaluationResult') -> Dict[str, Any]:
+        """Convert a goal evaluation result into a plain dictionary.
+
+        Args:
+            result: The goal evaluation result whose fields are copied out
+
+        Returns:
+            Dictionary of the result's fields; the timestamp is an ISO 8601 string
+        """
+        return dict(
+            goal_achieved=result.goal_achieved,
+            confidence=result.confidence,
+            reasoning=result.reasoning,
+            evaluation_method=result.evaluation_method,
+            fallback_used=result.fallback_used,
+            timestamp=result.timestamp.isoformat(),
+        )
+    
     def _serialize_assertion_result(self, assertion: 'AssertionResult') -> Dict[str, Any]:
         """Serialize an assertion result to dictionary.
         
diff --git a/replicantx/reporters/markdown.py b/replicantx/reporters/markdown.py
index 4625a26..f884b36 100644
--- a/replicantx/reporters/markdown.py
+++ b/replicantx/reporters/markdown.py
@@ -215,6 +215,17 @@ def _generate_test_suite_markdown(self, report: TestSuiteReport) -> str:
                 lines.append(f"**Steps:** {scenario.passed_steps}/{scenario.total_steps}")
                 lines.append(f"**Success Rate:** {scenario.success_rate:.1f}%")
                 lines.append(f"**Duration:** {scenario.duration_seconds:.2f}s")
+                if scenario.justification:
+                    lines.append(f"**Justification:** {scenario.justification}")
+                # Goal evaluation details (the field is only set for agent scenarios)
+                goal_eval = scenario.goal_evaluation_result
+                if goal_eval:
+                    lines.append("**Goal Evaluation:**")
+                    lines.append(f"- Method: {goal_eval.evaluation_method}")
+                    lines.append(f"- Confidence: {goal_eval.confidence:.2f}")
+                    lines.append(f"- Fallback Used: {'Yes' if goal_eval.fallback_used else 'No'}")
+                    lines.append(f"- Reasoning: {goal_eval.reasoning}")
+                
                 lines.append("")
                 
                 # Complete conversation history for agent scenarios
diff --git a/replicantx/scenarios/agent.py b/replicantx/scenarios/agent.py
index e67aa72..a6e7269 100644
--- a/replicantx/scenarios/agent.py
+++ b/replicantx/scenarios/agent.py
@@ -10,7 +10,7 @@
 
 import asyncio
 from datetime import datetime
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Any
 
 from ..auth import AuthBase, SupabaseAuth, JWTAuth, NoopAuth
 from ..models import (
@@ -289,6 +289,13 @@ async def run(self) -> ScenarioReport:
             goal_achieved = conversation_summary.get('goal_achieved', False)
             report.passed = report.passed and goal_achieved
             
+            # Store goal evaluation result if available
+            if hasattr(self.replicant_agent.state, 'goal_evaluation_result') and self.replicant_agent.state.goal_evaluation_result:
+                report.goal_evaluation_result = self.replicant_agent.state.goal_evaluation_result
+            
+            # Generate justification for the overall result
+            report.justification = self._generate_justification(report, conversation_summary)
+            
             # Add conversation history to the last step result for reporting
             if report.step_results and self.replicant_agent:
                 conversation_history = self._format_full_conversation()
@@ -650,6 +657,83 @@ def _format_conversation_summary(self, summary: Dict) -> str:
         ]
         return "\n".join(lines)
     
+    def _generate_justification(self, report: 'ScenarioReport', conversation_summary: Dict[str, Any]) -> str:
+        """Generate justification for the scenario result.
+
+        The text is assembled from the step counters on ``report`` and the goal,
+        turn and fact figures found in ``conversation_summary``.
+
+        Args:
+            report: The scenario report
+            conversation_summary: Summary from the replicant agent
+
+        Returns:
+            Justification string explaining why the scenario passed or failed.
+            Never a bare "."; a failure with no recorded cause says so explicitly.
+        """
+        goal_achieved = conversation_summary.get('goal_achieved', False)
+        total_turns = conversation_summary.get('total_turns', 0)
+        facts_used = conversation_summary.get('facts_used', 0)
+
+        def goal_evaluation_details():
+            # Shared by the pass and fail branches; empty when no evaluation was recorded
+            if 'goal_evaluation_method' not in conversation_summary:
+                return []
+            method = conversation_summary.get('goal_evaluation_method', 'unknown')
+            confidence = conversation_summary.get('goal_evaluation_confidence', 0.0)
+            reasoning = conversation_summary.get('goal_evaluation_reasoning', 'No reasoning provided')
+            return [
+                f"Goal evaluation: {method} method with {confidence:.2f} confidence",
+                f"Reasoning: {reasoning}",
+            ]
+
+        justification_parts = []
+        if report.passed:
+            # Scenario passed - explain why
+            if report.passed_steps == report.total_steps:
+                justification_parts.append(f"All {report.total_steps} steps passed successfully")
+            else:
+                justification_parts.append(f"{report.passed_steps}/{report.total_steps} steps passed")
+
+            if goal_achieved:
+                justification_parts.append("Goal was achieved")
+                justification_parts.extend(goal_evaluation_details())
+            else:
+                justification_parts.append("Goal was not achieved")
+
+            justification_parts.append(f"Conversation completed in {total_turns} turns")
+            if facts_used > 0:
+                justification_parts.append(f"Used {facts_used} available facts")
+        else:
+            # Scenario failed - explain why
+            if report.failed_steps > 0:
+                failed_step_details = []
+                for step in report.step_results:
+                    if step.passed:
+                        continue
+                    failed_step_details.append(f"Step {step.step_index + 1}")
+                    if step.error:
+                        failed_step_details.append(f"Error: {step.error}")
+                    elif step.assertions:
+                        failed_count = sum(1 for a in step.assertions if not a.passed)
+                        if failed_count:
+                            failed_step_details.append(f"Failed assertions: {failed_count}")
+                justification_parts.append(f"Failed steps: {', '.join(failed_step_details)}")
+
+            if not goal_achieved:
+                justification_parts.append("Goal was not achieved")
+                justification_parts.extend(goal_evaluation_details())
+
+            if report.error:
+                justification_parts.append(f"Error: {report.error}")
+
+            # Guard against returning a bare "." when nothing was recorded
+            if not justification_parts:
+                justification_parts.append("Scenario failed but no failed step, unmet goal or error was recorded")
+
+        # Strip one trailing period per part so the joined text never contains ".."
+        return ". ".join(p[:-1] if p.endswith(".") else p for p in justification_parts) + "."
+    
     def _format_full_conversation(self) -> str:
         """Format the complete conversation history for reporting.
         
diff --git a/replicantx/scenarios/basic.py b/replicantx/scenarios/basic.py
index 967c6ad..a44a6c3 100644
--- a/replicantx/scenarios/basic.py
+++ b/replicantx/scenarios/basic.py
@@ -157,6 +157,9 @@ async def run(self) -> ScenarioReport:
                     report.failed_steps += 1
                     report.passed = False
             
+            # Generate justification for the result
+            report.justification = self._generate_justification(report)
+            
             # Log final summary
             if self.watch:
                 self._watch_log("")
@@ -165,6 +168,8 @@ async def run(self) -> ScenarioReport:
                 self._watch_log(f"🏁 Status: {status}")
                 self._watch_log(f"🔢 Steps: {report.passed_steps}/{report.total_steps} passed")
                 self._watch_log(f"⏱️  Duration: {report.total_duration_ms/1000:.1f}s")
+                if report.justification:
+                    self._watch_log(f"💭 Justification: {report.justification}")
             
             self._debug_log("Basic scenario completed", {
                 "passed": report.passed,
@@ -230,6 +235,43 @@ async def _execute_step(self, step_index: int, step: Step) -> StepResult:
         
         return step_result
     
+    def _generate_justification(self, report: 'ScenarioReport') -> str:
+        """Generate justification for the scenario result.
+
+        Args:
+            report: The scenario report
+
+        Returns:
+            Justification string explaining why the scenario passed or failed.
+            Never a bare "."; a failure with no recorded cause says so explicitly.
+        """
+        if report.passed:
+            if report.passed_steps == report.total_steps:
+                return f"All {report.total_steps} steps passed successfully with all assertions satisfied."
+            return f"{report.passed_steps}/{report.total_steps} steps passed. Some steps may have been skipped due to configuration."
+
+        # Scenario failed - collect every recorded cause
+        justification_parts = []
+        if report.failed_steps > 0:
+            failed_step_details = []
+            for step in report.step_results:
+                if step.passed:
+                    continue
+                failed_step_details.append(f"Step {step.step_index + 1}")
+                if step.error:
+                    failed_step_details.append(f"Error: {step.error}")
+                elif step.assertions:
+                    failed_count = sum(1 for a in step.assertions if not a.passed)
+                    if failed_count:
+                        failed_step_details.append(f"Failed assertions: {failed_count}")
+            justification_parts.append(f"Failed steps: {', '.join(failed_step_details)}")
+        if report.error:
+            justification_parts.append(f"Error: {report.error}")
+        if not justification_parts:
+            justification_parts.append("Scenario failed but no failed step or error was recorded")
+        # Strip one trailing period per part so the joined text never contains ".."
+        return ". ".join(p[:-1] if p.endswith(".") else p for p in justification_parts) + "."
+    
     async def _make_api_request(self, user_message: str, timeout: Optional[int] = None) -> HTTPResponse:
         """Make API request with user message.