Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 40 additions & 22 deletions replicantx/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ async def run_scenarios_async(
suite_report.completed_at = datetime.now()

# Display summary
display_summary(suite_report)
display_summary(suite_report, verbose)

# Generate reports
if report_path:
Expand Down Expand Up @@ -487,7 +487,7 @@ def load_scenario_config(file_path: str) -> ScenarioConfig:
raise Exception(f"Invalid scenario configuration: {e}")


def display_summary(suite_report: TestSuiteReport):
def display_summary(suite_report: TestSuiteReport, verbose: bool = False):
"""Display test execution summary.

Args:
Expand Down Expand Up @@ -517,27 +517,45 @@ def display_summary(suite_report: TestSuiteReport):

# Scenario details
if suite_report.scenario_reports:
console.print("\n📋 Scenario Details")

scenario_table = Table(show_header=True, header_style="bold blue")
scenario_table.add_column("Scenario")
scenario_table.add_column("Status")
scenario_table.add_column("Steps")
scenario_table.add_column("Duration")

for scenario in suite_report.scenario_reports:
status = "✅ PASS" if scenario.passed else "❌ FAIL"
steps = f"{scenario.passed_steps}/{scenario.total_steps}"
duration = f"{scenario.duration_seconds:.2f}s"
console.print("\n📋 Scenario Details")

scenario_table.add_row(
scenario.scenario_name,
status,
steps,
duration
)

console.print(scenario_table)
scenario_table = Table(show_header=True, header_style="bold blue")
scenario_table.add_column("Scenario")
scenario_table.add_column("Status")
scenario_table.add_column("Steps")
scenario_table.add_column("Duration")
scenario_table.add_column("Justification")

for scenario in suite_report.scenario_reports:
status = "✅ PASS" if scenario.passed else "❌ FAIL"
steps = f"{scenario.passed_steps}/{scenario.total_steps}"
duration = f"{scenario.duration_seconds:.2f}s"
justification = scenario.justification or "No justification available"

# Truncate justification for table display
if len(justification) > 80:
justification = justification[:77] + "..."

scenario_table.add_row(
scenario.scenario_name,
status,
steps,
duration,
justification
)

console.print(scenario_table)

# Show detailed justification for failed scenarios
failed_scenarios = [s for s in suite_report.scenario_reports if not s.passed]
if failed_scenarios and verbose:
console.print("\n🔍 Detailed Justification for Failed Scenarios")
for scenario in failed_scenarios:
console.print(f"\n**{scenario.scenario_name}**")
if scenario.justification:
console.print(f"💭 {scenario.justification}")
if scenario.error:
console.print(f"❌ Error: {scenario.error}")


def generate_reports(suite_report: TestSuiteReport, report_path: str):
Expand Down
2 changes: 2 additions & 0 deletions replicantx/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ class ScenarioReport(BaseModel):
step_results: List[StepResult] = Field(default_factory=list, description="Results for each step")
error: Optional[str] = Field(None, description="Overall error message if scenario failed")
conversation_history: Optional[str] = Field(None, description="Complete conversation history for agent scenarios")
justification: Optional[str] = Field(None, description="Explanation of why the scenario passed or failed")
goal_evaluation_result: Optional[GoalEvaluationResult] = Field(None, description="Goal evaluation result for agent scenarios")
started_at: datetime = Field(default_factory=datetime.now, description="When scenario started")
completed_at: Optional[datetime] = Field(None, description="When scenario completed")

Expand Down
20 changes: 20 additions & 0 deletions replicantx/reporters/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ def _serialize_scenario_report(self, report: ScenarioReport) -> Dict[str, Any]:
"completed_at": report.completed_at.isoformat() if report.completed_at else None,
},
"error": report.error,
"justification": report.justification,
"goal_evaluation_result": self._serialize_goal_evaluation_result(report.goal_evaluation_result) if report.goal_evaluation_result else None,
"step_results": [
self._serialize_step_result(step) for step in report.step_results
],
Expand Down Expand Up @@ -149,6 +151,24 @@ def _serialize_step_result(self, step: 'StepResult') -> Dict[str, Any]:
]
}

def _serialize_goal_evaluation_result(self, result: 'GoalEvaluationResult') -> Dict[str, Any]:
    """Serialize a goal evaluation result to dictionary.

    Args:
        result: Goal evaluation result to serialize

    Returns:
        Dictionary representation of the goal evaluation result. The
        ``timestamp`` entry is an ISO-8601 string, or ``None`` when the
        result carries no timestamp.
    """
    # Guard the timestamp so that a missing value is emitted as JSON null
    # instead of raising AttributeError on ``None.isoformat()``.
    timestamp = result.timestamp
    return {
        "goal_achieved": result.goal_achieved,
        "confidence": result.confidence,
        "reasoning": result.reasoning,
        "evaluation_method": result.evaluation_method,
        "fallback_used": result.fallback_used,
        "timestamp": timestamp.isoformat() if timestamp is not None else None,
    }

def _serialize_assertion_result(self, assertion: 'AssertionResult') -> Dict[str, Any]:
"""Serialize an assertion result to dictionary.

Expand Down
11 changes: 11 additions & 0 deletions replicantx/reporters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,17 @@ def _generate_test_suite_markdown(self, report: TestSuiteReport) -> str:
lines.append(f"**Steps:** {scenario.passed_steps}/{scenario.total_steps}")
lines.append(f"**Success Rate:** {scenario.success_rate:.1f}%")
lines.append(f"**Duration:** {scenario.duration_seconds:.2f}s")
if scenario.justification:
lines.append(f"**Justification:** {scenario.justification}")

# Goal evaluation details for agent scenarios
if scenario.goal_evaluation_result:
lines.append(f"**Goal Evaluation:**")
lines.append(f"- Method: {scenario.goal_evaluation_result.evaluation_method}")
lines.append(f"- Confidence: {scenario.goal_evaluation_result.confidence:.2f}")
lines.append(f"- Fallback Used: {'Yes' if scenario.goal_evaluation_result.fallback_used else 'No'}")
lines.append(f"- Reasoning: {scenario.goal_evaluation_result.reasoning}")

lines.append("")

# Complete conversation history for agent scenarios
Expand Down
86 changes: 85 additions & 1 deletion replicantx/scenarios/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import asyncio
from datetime import datetime
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Any

from ..auth import AuthBase, SupabaseAuth, JWTAuth, NoopAuth
from ..models import (
Expand Down Expand Up @@ -289,6 +289,13 @@ async def run(self) -> ScenarioReport:
goal_achieved = conversation_summary.get('goal_achieved', False)
report.passed = report.passed and goal_achieved

# Store goal evaluation result if available
if hasattr(self.replicant_agent.state, 'goal_evaluation_result') and self.replicant_agent.state.goal_evaluation_result:
report.goal_evaluation_result = self.replicant_agent.state.goal_evaluation_result

# Generate justification for the overall result
report.justification = self._generate_justification(report, conversation_summary)

# Add conversation history to the last step result for reporting
if report.step_results and self.replicant_agent:
conversation_history = self._format_full_conversation()
Expand Down Expand Up @@ -650,6 +657,83 @@ def _format_conversation_summary(self, summary: Dict) -> str:
]
return "\n".join(lines)

def _generate_justification(self, report: 'ScenarioReport', conversation_summary: Dict[str, Any]) -> str:
    """Generate justification for the scenario result.

    The justification is a sequence of short sentences joined with ". "
    and terminated with a single ".". For a passing report it covers the
    step counts, the goal status, optional goal-evaluation details, the
    number of turns and the number of facts used. For a failing report it
    covers the failed steps, the goal status with optional goal-evaluation
    details, and the report-level error.

    Args:
        report: The scenario report
        conversation_summary: Summary from the replicant agent. Keys read
            here: ``goal_achieved``, ``total_turns``, ``facts_used`` and
            the optional ``goal_evaluation_method``,
            ``goal_evaluation_confidence``, ``goal_evaluation_reasoning``.

    Returns:
        Justification string explaining why the scenario passed or failed.
        Never a bare "."; a failure with no recorded cause yields an
        explicit placeholder sentence.
    """
    goal_achieved = conversation_summary.get('goal_achieved', False)

    def goal_evaluation_parts() -> List[str]:
        # Evaluation details are only present when the summary recorded
        # which evaluation method was used.
        if 'goal_evaluation_method' not in conversation_summary:
            return []
        method = conversation_summary.get('goal_evaluation_method', 'unknown')
        confidence = conversation_summary.get('goal_evaluation_confidence', 0.0)
        reasoning = conversation_summary.get('goal_evaluation_reasoning', 'No reasoning provided')
        return [
            f"Goal evaluation: {method} method with {confidence:.2f} confidence",
            f"Reasoning: {reasoning}",
        ]

    justification_parts = []

    if report.passed:
        # Scenario passed - explain why
        total_turns = conversation_summary.get('total_turns', 0)
        facts_used = conversation_summary.get('facts_used', 0)

        if report.passed_steps == report.total_steps:
            justification_parts.append(f"All {report.total_steps} steps passed successfully")
        else:
            justification_parts.append(f"{report.passed_steps}/{report.total_steps} steps passed")

        if goal_achieved:
            justification_parts.append("Goal was achieved")
            justification_parts.extend(goal_evaluation_parts())
        else:
            justification_parts.append("Goal was not achieved")

        justification_parts.append(f"Conversation completed in {total_turns} turns")
        if facts_used > 0:
            justification_parts.append(f"Used {facts_used} available facts")
    else:
        # Scenario failed - explain why
        if report.failed_steps > 0:
            failed_step_details = []
            for step in report.step_results:
                if step.passed:
                    continue
                failed_step_details.append(f"Step {step.step_index + 1}")
                if step.error:
                    failed_step_details.append(f"Error: {step.error}")
                elif step.assertions:
                    failed_assertions = [a for a in step.assertions if not a.passed]
                    if failed_assertions:
                        failed_step_details.append(f"Failed assertions: {len(failed_assertions)}")

            if failed_step_details:
                justification_parts.append(f"Failed steps: {', '.join(failed_step_details)}")
            else:
                # The counter says steps failed but no step result is
                # marked as failed; report the count rather than an
                # empty "Failed steps: " clause.
                justification_parts.append(f"{report.failed_steps} step(s) failed")

        if not goal_achieved:
            justification_parts.append("Goal was not achieved")
            justification_parts.extend(goal_evaluation_parts())

        if report.error:
            justification_parts.append(f"Error: {report.error}")

        if not justification_parts:
            # Without this the result would be the bare string ".", which
            # is truthy and therefore hides the caller's own
            # "no justification" fallback.
            justification_parts.append("Scenario failed but no specific cause was recorded")

    return ". ".join(justification_parts) + "."

def _format_full_conversation(self) -> str:
"""Format the complete conversation history for reporting.

Expand Down
42 changes: 42 additions & 0 deletions replicantx/scenarios/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ async def run(self) -> ScenarioReport:
report.failed_steps += 1
report.passed = False

# Generate justification for the result
report.justification = self._generate_justification(report)

# Log final summary
if self.watch:
self._watch_log("")
Expand All @@ -165,6 +168,8 @@ async def run(self) -> ScenarioReport:
self._watch_log(f"🏁 Status: {status}")
self._watch_log(f"🔢 Steps: {report.passed_steps}/{report.total_steps} passed")
self._watch_log(f"⏱️ Duration: {report.total_duration_ms/1000:.1f}s")
if report.justification:
self._watch_log(f"💭 Justification: {report.justification}")

self._debug_log("Basic scenario completed", {
"passed": report.passed,
Expand Down Expand Up @@ -230,6 +235,43 @@ async def _execute_step(self, step_index: int, step: Step) -> StepResult:

return step_result

def _generate_justification(self, report: 'ScenarioReport') -> str:
    """Generate justification for the scenario result.

    A passing report yields a single fixed sentence based on the step
    counts. A failing report yields short sentences joined with ". " and
    terminated with ".", covering the failed steps (with each step's error
    or its number of failed assertions) and the report-level error.

    Args:
        report: The scenario report

    Returns:
        Justification string explaining why the scenario passed or failed.
        Never a bare "."; a failure with no recorded cause yields an
        explicit placeholder sentence.
    """
    if report.passed:
        if report.passed_steps == report.total_steps:
            return f"All {report.total_steps} steps passed successfully with all assertions satisfied."
        return f"{report.passed_steps}/{report.total_steps} steps passed. Some steps may have been skipped due to configuration."

    # Scenario failed - explain why
    justification_parts = []

    if report.failed_steps > 0:
        failed_step_details = []
        for step in report.step_results:
            if step.passed:
                continue
            failed_step_details.append(f"Step {step.step_index + 1}")
            if step.error:
                failed_step_details.append(f"Error: {step.error}")
            elif step.assertions:
                failed_assertions = [a for a in step.assertions if not a.passed]
                if failed_assertions:
                    failed_step_details.append(f"Failed assertions: {len(failed_assertions)}")

        if failed_step_details:
            justification_parts.append(f"Failed steps: {', '.join(failed_step_details)}")
        else:
            # The counter says steps failed but no step result is marked
            # as failed; report the count rather than an empty
            # "Failed steps: " clause.
            justification_parts.append(f"{report.failed_steps} step(s) failed")

    if report.error:
        justification_parts.append(f"Error: {report.error}")

    if not justification_parts:
        # Without this the result would be the bare string ".", which is
        # truthy and therefore hides any caller-side "no justification"
        # fallback.
        justification_parts.append("Scenario failed but no specific cause was recorded")

    return ". ".join(justification_parts) + "."

async def _make_api_request(self, user_message: str, timeout: Optional[int] = None) -> HTTPResponse:
"""Make API request with user message.

Expand Down
Loading