From 3145140dd22bfb72c1a17f872b62ac063f9b94f2 Mon Sep 17 00:00:00 2001 From: Gus Fraser Date: Fri, 25 Jul 2025 13:36:12 +0100 Subject: [PATCH 1/2] Add date/time to replicant system prompt --- pyproject.toml | 2 +- replicantx/scenarios/agent.py | 12 ++++++++++++ replicantx/scenarios/replicant.py | 11 +++++++++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bfcb9dc..bf807dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "replicantx" -version = "0.1.4" +version = "0.1.5" description = "End-to-end testing harness for AI agents via web service API" readme = "README.md" requires-python = ">=3.11" diff --git a/replicantx/scenarios/agent.py b/replicantx/scenarios/agent.py index 00e5f7a..93aadb4 100644 --- a/replicantx/scenarios/agent.py +++ b/replicantx/scenarios/agent.py @@ -149,7 +149,13 @@ async def run(self) -> ScenarioReport: # Initialize Replicant agent self.replicant_agent = ReplicantAgent.create(self.config.replicant) + current_datetime = datetime.now() + date_str = current_datetime.strftime("%A, %B %d, %Y") + time_str = current_datetime.strftime("%I:%M %p %Z") + self._debug_log("Replicant Agent initialized", { + "current_date": date_str, + "current_time": time_str, "goal": self.config.replicant.goal, "facts_count": len(self.config.replicant.facts), "facts": str(self.config.replicant.facts), @@ -174,7 +180,13 @@ async def run(self) -> ScenarioReport: # Initialize watch mode if self.watch: + current_datetime = datetime.now() + date_str = current_datetime.strftime("%A, %B %d, %Y") + time_str = current_datetime.strftime("%I:%M %p %Z") + self._watch_log("๐Ÿ‘ฅ [bold green]LIVE CONVERSATION[/bold green] - Starting agent scenario") + self._watch_log(f"๐Ÿ“… Date: {date_str}") + self._watch_log(f"๐Ÿ• Time: {time_str}") self._watch_log(f"๐ŸŽฏ Goal: {self.config.replicant.goal}") self._watch_log(f"๐Ÿ“ Facts: {len(self.config.replicant.facts)} items available") self._watch_log("") diff --git a/replicantx/scenarios/replicant.py b/replicantx/scenarios/replicant.py index 0feeac9..8795415 100644 --- a/replicantx/scenarios/replicant.py +++ b/replicantx/scenarios/replicant.py @@ -62,8 +62,14 @@ async def generate_response(self, api_message: str, conversation_state: Conversa Generated response """ try: + # Get current date and time + current_datetime = datetime.now() + date_str = current_datetime.strftime("%A, %B %d, %Y") + time_str = current_datetime.strftime("%I:%M %p %Z") + # Prepare context with facts AND conversation history - context = f"Available facts: {json.dumps(self.facts, indent=2)}\n\n" + context = f"Current date and time: {date_str} at {time_str}\n\n" + context += f"Available facts: {json.dumps(self.facts, indent=2)}\n\n" # Add conversation history for context if conversation_state.conversation_history: @@ -74,7 +80,8 @@ async def generate_response(self, api_message: str, conversation_state: Conversa context += f"Current API message: {api_message}\n\n" context += "Please generate a natural response as a user working toward your goal. " - context += "Use the available facts when appropriate, and respond naturally to the API's question or statement." + context += "Use the available facts when appropriate, and respond naturally to the API's question or statement. " + context += "You know the current date and time, so you can reference it when relevant to the conversation." 
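(One caveat worth noting for the snippet above, outside the patch itself: `datetime.now()` returns a timezone-naive value, so the `%Z` directive formats to an empty string. A minimal sketch, assuming a local timezone name is actually wanted in the prompt, of how an aware timestamp could be produced instead:)

```python
from datetime import datetime, timezone

# Illustration only, not part of the patch: datetime.now() is naive, so
# strftime("%Z") yields "". Converting to an aware local datetime first
# gives a usable timezone abbreviation.
aware_now = datetime.now(timezone.utc).astimezone()   # aware, local timezone
date_str = aware_now.strftime("%A, %B %d, %Y")
time_str = aware_now.strftime("%I:%M %p %Z")          # e.g. "01:36 PM BST"
print(date_str, time_str)
```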
# Create and use PydanticAI agent agent = self._create_agent() From 4ebe6051dae69f67b520195f58bef4ccd1d66632 Mon Sep 17 00:00:00 2001 From: Gus Fraser Date: Fri, 25 Jul 2025 17:17:00 +0100 Subject: [PATCH 2/2] Intelligent goal evaluation 0.1.5 --- README.md | 194 +++++++++++++ replicantx/models.py | 24 ++ replicantx/scenarios/agent.py | 26 +- replicantx/scenarios/replicant.py | 327 +++++++++++++++++++++- tests/intelligent_evaluation_example.yaml | 73 +++++ 5 files changed, 620 insertions(+), 24 deletions(-) create mode 100644 tests/intelligent_evaluation_example.yaml diff --git a/README.md b/README.md index 2d56eaa..e2bc491 100644 --- a/README.md +++ b/README.md @@ -973,6 +973,200 @@ system_prompt: | information when asked but expect efficient service. ``` +## ๐ŸŽฏ Goal Evaluation Modes + +ReplicantX provides intelligent goal evaluation to accurately determine when conversation objectives have been achieved, solving the common problem of false positives with simple keyword matching. + +### The Problem with Keywords + +Traditional keyword-based completion detection can produce false positives: + +```yaml +# Problematic scenario +completion_keywords: ["confirmed", "booked"] + +# False positive examples: +# โŒ "I'll let you know when your booking has been confirmed" (contains "confirmed") +# โŒ "Have you booked with us before?" (contains "booked") +# โŒ "Your booking confirmation is pending" (contains "booking") +``` + +### Three Evaluation Modes + +#### 1. **Keywords Mode** (Default - Backwards Compatible) +Simple substring matching - the original behavior: + +```yaml +replicant: + goal: "Book a flight to Paris" + goal_evaluation_mode: "keywords" # Default + completion_keywords: ["confirmed", "booked", "reservation number"] +``` + +**Use when:** +- โœ… Maintaining existing test compatibility +- โœ… Simple scenarios with clear completion signals +- โœ… Performance is critical (no LLM calls) + +#### 2. **Intelligent Mode** (Recommended) +LLM-powered analysis that understands context and intent: + +```yaml +replicant: + goal: "Book a business class flight to Paris" + goal_evaluation_mode: "intelligent" + goal_evaluation_model: "openai:gpt-4o-mini" # Optional: separate model for evaluation + completion_keywords: ["confirmed", "booked"] # Still required for compatibility +``` + +**Benefits:** +- โœ… **Context-aware**: Distinguishes promises from accomplishments +- โœ… **False positive reduction**: "I'll confirm later" โ‰  "Your booking is confirmed" +- โœ… **Intent understanding**: Recognizes goal completion without exact keywords +- โœ… **Reasoning provided**: Detailed explanation of evaluation decisions + +#### 3. **Hybrid Mode** (Best of Both Worlds) +Attempts LLM evaluation first, falls back to keywords if uncertain: + +```yaml +replicant: + goal: "Get help with billing issue" + goal_evaluation_mode: "hybrid" + goal_evaluation_model: "openai:gpt-4o-mini" + completion_keywords: ["resolved", "ticket created", "issue closed"] +``` + +**Benefits:** +- โœ… **Smart evaluation** when LLM is confident +- โœ… **Reliable fallback** when LLM is uncertain +- โœ… **Cost-effective** for mixed scenarios +- โœ… **Production-ready** with built-in safety net + +### Custom Evaluation Prompts + +For domain-specific scenarios, customize the evaluation logic: + +```yaml +replicant: + goal: "Complete a customer support ticket" + goal_evaluation_mode: "intelligent" + goal_evaluation_prompt: | + Evaluate if the customer support goal is achieved. Look for: + 1. Issue resolution confirmation from the agent + 2. 
Ticket number or reference provided + 3. Customer satisfaction or acknowledgment + 4. Clear closure statements + + Goal: {goal} + User Facts: {facts} + Recent Conversation: {conversation} + + Respond exactly: + RESULT: [ACHIEVED or NOT_ACHIEVED] + CONFIDENCE: [0.0 to 1.0] + REASONING: [Brief explanation] + completion_keywords: ["resolved", "ticket created"] +``` + +### Example: Flight Booking with Intelligent Evaluation + +```yaml +name: "Smart Flight Booking Test" +base_url: "https://api.example.com/chat" +auth: + provider: noop +level: agent +replicant: + goal: "Book a round-trip business class flight to Paris" + facts: + name: "Sarah Johnson" + email: "sarah@example.com" + travel_class: "business" + destination: "Paris" + departure_city: "New York" + travel_date: "next Friday" + return_date: "following Monday" + budget: "$3000" + system_prompt: | + You are a customer booking a flight. Provide information when asked + but don't volunteer everything upfront. Be conversational and natural. + initial_message: "Hi, I'd like to book a flight to Paris." + max_turns: 15 + + # Intelligent goal evaluation + goal_evaluation_mode: "intelligent" + goal_evaluation_model: "openai:gpt-4o-mini" # Fast, cost-effective model + + # Still needed for fallback/compatibility + completion_keywords: ["booked", "confirmed", "reservation number"] + + llm: + model: "openai:gpt-4o" + temperature: 0.7 + max_tokens: 150 +``` + +### Evaluation Results in Reports + +The watch mode now shows detailed evaluation information: + +```bash +๐Ÿ“Š CONVERSATION COMPLETE +๐Ÿ Status: โœ… SUCCESS +๐ŸŽฏ Goal achieved: Yes +๐Ÿง  Evaluation method: intelligent +๐Ÿ“Š Confidence: 0.89 +๐Ÿ’ญ Reasoning: The flight has been successfully booked with confirmation number ABC123 provided +``` + +### Migration Strategy + +**Phase 1: Test Intelligent Mode** +```yaml +# Update specific tests to use intelligent evaluation +goal_evaluation_mode: "intelligent" +``` + +**Phase 2: Adopt Hybrid Mode** +```yaml +# Use hybrid for safety while gaining intelligence +goal_evaluation_mode: "hybrid" +``` + +**Phase 3: Gradual Rollout** +```yaml +# Eventually make intelligent/hybrid the default for new tests +goal_evaluation_mode: "intelligent" +``` + +### When to Use Each Mode + +| Mode | Use Case | Pros | Cons | +|------|----------|------|------| +| **keywords** | Legacy tests, simple APIs | Fast, deterministic | False positives | +| **intelligent** | Modern apps, complex goals | Accurate, context-aware | Requires LLM | +| **hybrid** | Production, mixed scenarios | Smart + safe fallback | Slightly more complex | + +**Recommendation**: Start with `hybrid` mode for new tests to get the benefits of intelligent evaluation with keyword fallback safety. 
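As a quick standalone illustration (plain Python, not the ReplicantX API) of why bare substring matching over-triggers on the false-positive examples shown earlier:

```python
# Standalone sketch of "keywords" mode behaviour: plain substring matching.
completion_keywords = ["confirmed", "booked"]

replies = [
    "I'll let you know when your booking has been confirmed",  # promise, not completion
    "Have you booked with us before?",                         # unrelated question
    "Your booking is confirmed, reference ABC123",             # actual completion
]

for reply in replies:
    matched = any(k in reply.lower() for k in completion_keywords)
    print(f"{matched!s:<5} {reply}")

# All three replies match, even though only the last one means the goal was
# achieved -- exactly the false positives intelligent/hybrid modes are meant to catch.
```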
+ +### ๐Ÿงช Try the Example + +See a complete example that demonstrates false positive prevention: + +```bash +# Download the example test +curl -O https://raw.githubusercontent.com/helixtechnologies/replicantx/main/tests/intelligent_evaluation_example.yaml + +# Run with intelligent evaluation +replicantx run intelligent_evaluation_example.yaml --watch + +# Compare with keyword-only mode by changing goal_evaluation_mode to "keywords" +``` + +This example shows how intelligent evaluation distinguishes between: +- โŒ "I'll create a ticket for your issue" (promise) +- โœ… "Your refund has been processed, reference #REF123" (completion) + ## ๐Ÿง  LLM Integration ReplicantX uses **PydanticAI** for powerful LLM integration with multiple providers: diff --git a/replicantx/models.py b/replicantx/models.py index ecba850..59c5697 100644 --- a/replicantx/models.py +++ b/replicantx/models.py @@ -65,6 +65,13 @@ class SessionPlacement(str, Enum): URL = "url" # In URL path (RESTful) +class GoalEvaluationMode(str, Enum): + """Goal evaluation modes.""" + KEYWORDS = "keywords" # Simple keyword matching (legacy behavior) + INTELLIGENT = "intelligent" # LLM-based goal evaluation + HYBRID = "hybrid" # LLM with keyword fallback + + class LLMConfig(BaseModel): """Configuration for LLM using PydanticAI models.""" model_config = ConfigDict(extra="forbid") @@ -90,6 +97,18 @@ class Message(BaseModel): metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata") +class GoalEvaluationResult(BaseModel): + """Result of goal evaluation.""" + model_config = ConfigDict(extra="forbid") + + goal_achieved: bool = Field(..., description="Whether the goal has been achieved") + confidence: float = Field(..., description="Confidence score from 0.0 to 1.0") + reasoning: str = Field(..., description="Explanation of why the goal is/isn't achieved") + evaluation_method: str = Field(..., description="Method used: 'keywords', 'intelligent', or 'hybrid'") + fallback_used: bool = Field(False, description="Whether hybrid mode fell back to keywords") + timestamp: datetime = Field(default_factory=datetime.now, description="When evaluation was performed") + + class AssertionResult(BaseModel): """Result of an assertion check.""" model_config = ConfigDict(extra="forbid") @@ -195,6 +214,11 @@ class ReplicantConfig(BaseModel): session_placement: SessionPlacement = Field(SessionPlacement.BODY, description="Session ID placement: 'header', 'body', or 'url' (default: body)") session_variable_name: str = Field("session_id", description="Name of the session variable in header/body (default: session_id)") llm: LLMConfig = Field(default_factory=LLMConfig, description="LLM configuration for response generation") + + # Goal evaluation configuration + goal_evaluation_mode: GoalEvaluationMode = Field(GoalEvaluationMode.KEYWORDS, description="Goal evaluation mode: 'keywords' (default), 'intelligent', or 'hybrid'") + goal_evaluation_model: Optional[str] = Field(None, description="PydanticAI model for goal evaluation (defaults to main LLM model if not specified)") + goal_evaluation_prompt: Optional[str] = Field(None, description="Custom prompt for goal evaluation (uses default if not specified)") class ScenarioConfig(BaseModel): diff --git a/replicantx/scenarios/agent.py b/replicantx/scenarios/agent.py index 93aadb4..14ee434 100644 --- a/replicantx/scenarios/agent.py +++ b/replicantx/scenarios/agent.py @@ -203,17 +203,8 @@ async def run(self) -> ScenarioReport: self._watch_log(f"๐Ÿ‘ค [bold cyan]User:[/bold cyan] 
{current_message}") - # Record initial message in conversation history - from ..models import Message - initial_message = Message( - role="user", - content=current_message, - timestamp=datetime.now() - ) - self.replicant_agent.state.conversation_history.append(initial_message) - # Continue conversation until completion or limits reached - while self.replicant_agent.should_continue_conversation(): + while await self.replicant_agent.should_continue_conversation(): self._debug_log(f"Executing conversation step {step_index + 1}", { "user_message": current_message, "turn_count": self.replicant_agent.state.turn_count, @@ -266,7 +257,9 @@ async def run(self) -> ScenarioReport: "parsed_response": parsed_response }) - current_message = await self.replicant_agent.process_api_response(parsed_response) + # For the first response, pass the triggering message to add to conversation history + triggering_message = current_message if step_index == 0 else None + current_message = await self.replicant_agent.process_api_response(parsed_response, triggering_message) self._debug_log("Generated next user message", { "next_message": current_message, @@ -309,6 +302,17 @@ async def run(self) -> ScenarioReport: self._watch_log(f"๐ŸŽฏ Goal achieved: {'Yes' if conversation_summary.get('goal_achieved', False) else 'No'}") self._watch_log(f"๐Ÿ“ Facts used: {conversation_summary.get('facts_used', 0)}") self._watch_log(f"๐Ÿ’ฌ Total turns: {conversation_summary.get('total_turns', 0)}") + + # Add goal evaluation details if available + if 'goal_evaluation_method' in conversation_summary: + method = conversation_summary.get('goal_evaluation_method', 'unknown') + confidence = conversation_summary.get('goal_evaluation_confidence', 0.0) + fallback = conversation_summary.get('goal_evaluation_fallback_used', False) + reasoning = conversation_summary.get('goal_evaluation_reasoning', 'No reasoning provided') + + self._watch_log(f"๐Ÿง  Evaluation method: {method}" + (" (fallback used)" if fallback else "")) + self._watch_log(f"๐Ÿ“Š Confidence: {confidence:.2f}") + self._watch_log(f"๐Ÿ’ญ Reasoning: {reasoning}") self._debug_log("Scenario completed successfully", { "passed": report.passed, diff --git a/replicantx/scenarios/replicant.py b/replicantx/scenarios/replicant.py index 8795415..46eb39e 100644 --- a/replicantx/scenarios/replicant.py +++ b/replicantx/scenarios/replicant.py @@ -16,7 +16,7 @@ from pydantic_ai import Agent as PydanticAgent from pydantic_ai.models import infer_model -from ..models import ReplicantConfig, Message +from ..models import ReplicantConfig, Message, GoalEvaluationResult, GoalEvaluationMode from ..tools.http_client import HTTPResponse @@ -28,6 +28,277 @@ class ConversationState(BaseModel): goal_achieved: bool = Field(False, description="Whether the goal has been achieved") conversation_history: List[Message] = Field(default_factory=list, description="Full conversation history") extracted_info: Dict[str, Any] = Field(default_factory=dict, description="Information extracted from the conversation") + goal_evaluation_result: Optional[GoalEvaluationResult] = Field(None, description="Latest goal evaluation result") + + +class GoalEvaluator(BaseModel): + """Evaluates whether conversation goals have been achieved using different strategies.""" + model_config = {"extra": "forbid"} + + mode: GoalEvaluationMode = Field(..., description="Evaluation mode") + model_name: Optional[str] = Field(None, description="Model for intelligent evaluation") + custom_prompt: Optional[str] = Field(None, description="Custom evaluation 
prompt") + completion_keywords: List[str] = Field(..., description="Keywords for keyword-based evaluation") + + @classmethod + def create(cls, config: ReplicantConfig) -> "GoalEvaluator": + """Create a GoalEvaluator from ReplicantConfig. + + Args: + config: Replicant configuration + + Returns: + Configured GoalEvaluator + """ + model_name = config.goal_evaluation_model or config.llm.model + + return cls( + mode=config.goal_evaluation_mode, + model_name=model_name, + custom_prompt=config.goal_evaluation_prompt, + completion_keywords=config.completion_keywords + ) + + async def evaluate_goal_completion( + self, + goal: str, + conversation_history: List[Message], + facts: Dict[str, Any] + ) -> GoalEvaluationResult: + """Evaluate whether the conversation goal has been achieved. + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + facts: Available facts for context + + Returns: + Goal evaluation result + """ + if self.mode == GoalEvaluationMode.KEYWORDS: + return self._evaluate_with_keywords(goal, conversation_history) + elif self.mode == GoalEvaluationMode.INTELLIGENT: + return await self._evaluate_with_llm(goal, conversation_history, facts) + elif self.mode == GoalEvaluationMode.HYBRID: + return await self._evaluate_hybrid(goal, conversation_history, facts) + else: + raise ValueError(f"Unknown goal evaluation mode: {self.mode}") + + def _evaluate_with_keywords( + self, + goal: str, + conversation_history: List[Message] + ) -> GoalEvaluationResult: + """Evaluate goal completion using keyword matching (legacy behavior). + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + + Returns: + Goal evaluation result + """ + # Check for completion keywords in recent API responses + goal_achieved = False + matched_keywords = [] + + if conversation_history: + recent_messages = conversation_history[-2:] # Last 2 messages + for message in recent_messages: + if message.role == "assistant": + message_lower = message.content.lower() + for keyword in self.completion_keywords: + if keyword.lower() in message_lower: + goal_achieved = True + matched_keywords.append(keyword) + + reasoning = f"Keyword evaluation: {'Found' if goal_achieved else 'No'} completion keywords" + if matched_keywords: + reasoning += f" (matched: {', '.join(matched_keywords)})" + + return GoalEvaluationResult( + goal_achieved=goal_achieved, + confidence=1.0 if goal_achieved else 0.0, # Keywords give binary confidence + reasoning=reasoning, + evaluation_method="keywords", + fallback_used=False + ) + + async def _evaluate_with_llm( + self, + goal: str, + conversation_history: List[Message], + facts: Dict[str, Any] + ) -> GoalEvaluationResult: + """Evaluate goal completion using LLM analysis. + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + facts: Available facts for context + + Returns: + Goal evaluation result + """ + try: + # Build evaluation prompt + prompt = self._build_evaluation_prompt(goal, conversation_history, facts) + + # Create LLM agent for evaluation + model = infer_model(self.model_name) + agent = PydanticAgent( + model=model, + instructions="You are an expert at evaluating whether conversation goals have been achieved. 
Be precise and analytical.", + model_settings={"temperature": 0.1, "max_tokens": 200} # Low temperature for consistency + ) + + # Get evaluation + result = await agent.run(prompt) + response = result.output.strip() + + # Parse LLM response + goal_achieved, confidence, reasoning = self._parse_llm_response(response) + + return GoalEvaluationResult( + goal_achieved=goal_achieved, + confidence=confidence, + reasoning=reasoning, + evaluation_method="intelligent", + fallback_used=False + ) + + except Exception as e: + # Return failure result if LLM evaluation fails + return GoalEvaluationResult( + goal_achieved=False, + confidence=0.0, + reasoning=f"LLM evaluation failed: {str(e)}", + evaluation_method="intelligent", + fallback_used=False + ) + + async def _evaluate_hybrid( + self, + goal: str, + conversation_history: List[Message], + facts: Dict[str, Any] + ) -> GoalEvaluationResult: + """Evaluate goal completion using LLM with keyword fallback. + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + facts: Available facts for context + + Returns: + Goal evaluation result + """ + # Try LLM evaluation first + try: + llm_result = await self._evaluate_with_llm(goal, conversation_history, facts) + if llm_result.confidence > 0.5: # Use LLM result if confident + return llm_result + except Exception: + pass + + # Fall back to keyword evaluation + keyword_result = self._evaluate_with_keywords(goal, conversation_history) + keyword_result.evaluation_method = "hybrid" + keyword_result.fallback_used = True + keyword_result.reasoning = f"LLM evaluation uncertain, using keyword fallback: {keyword_result.reasoning}" + + return keyword_result + + def _build_evaluation_prompt( + self, + goal: str, + conversation_history: List[Message], + facts: Dict[str, Any] + ) -> str: + """Build the evaluation prompt for LLM analysis. + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + facts: Available facts for context + + Returns: + Formatted evaluation prompt + """ + if self.custom_prompt: + # Use custom prompt with variable substitution + return self.custom_prompt.format( + goal=goal, + facts=json.dumps(facts, indent=2), + conversation=self._format_conversation_for_prompt(conversation_history) + ) + + # Default evaluation prompt + prompt = f"""Given this conversation goal: "{goal}" + +User facts: {json.dumps(facts, indent=2)} + +Recent conversation history: +{self._format_conversation_for_prompt(conversation_history[-6:])} + +Has the goal been definitively achieved? Consider: +1. Has the user received confirmation that the action was completed? +2. Are there concrete indicators of success (confirmation numbers, bookings, etc.)? +3. Distinguish between promises ("I will do this") vs accomplishments ("This is done") +4. Look for specific completion indicators, not just polite acknowledgments + +Respond in this exact format: +RESULT: [ACHIEVED or NOT_ACHIEVED] +CONFIDENCE: [0.0 to 1.0] +REASONING: [Brief explanation of your decision]""" + + return prompt + + def _format_conversation_for_prompt(self, messages: List[Message]) -> str: + """Format conversation history for the evaluation prompt. 
+ + Args: + messages: List of messages to format + + Returns: + Formatted conversation string + """ + formatted = [] + for msg in messages: + role = "User" if msg.role == "user" else "Assistant" + formatted.append(f"{role}: {msg.content}") + return "\n".join(formatted) + + def _parse_llm_response(self, response: str) -> Tuple[bool, float, str]: + """Parse LLM evaluation response. + + Args: + response: Raw LLM response + + Returns: + Tuple of (goal_achieved, confidence, reasoning) + """ + lines = response.strip().split('\n') + + goal_achieved = False + confidence = 0.5 + reasoning = "Could not parse LLM response" + + for line in lines: + line = line.strip() + if line.startswith('RESULT:'): + result_text = line.replace('RESULT:', '').strip().upper() + goal_achieved = result_text == 'ACHIEVED' + elif line.startswith('CONFIDENCE:'): + try: + confidence = float(line.replace('CONFIDENCE:', '').strip()) + confidence = max(0.0, min(1.0, confidence)) # Clamp to [0, 1] + except ValueError: + confidence = 0.5 + elif line.startswith('REASONING:'): + reasoning = line.replace('REASONING:', '').strip() + + return goal_achieved, confidence, reasoning class ResponseGenerator(BaseModel): @@ -126,6 +397,7 @@ class ReplicantAgent(BaseModel): config: ReplicantConfig = Field(..., description="Replicant configuration") state: ConversationState = Field(default_factory=ConversationState, description="Current conversation state") response_generator: ResponseGenerator = Field(..., description="Response generation utility") + goal_evaluator: GoalEvaluator = Field(..., description="Goal evaluation utility") @classmethod def create(cls, config: ReplicantConfig) -> "ReplicantAgent": @@ -151,12 +423,15 @@ def create(cls, config: ReplicantConfig) -> "ReplicantAgent": facts=config.facts ) + goal_evaluator = GoalEvaluator.create(config) + return cls( config=config, - response_generator=response_generator + response_generator=response_generator, + goal_evaluator=goal_evaluator ) - def should_continue_conversation(self) -> bool: + async def should_continue_conversation(self) -> bool: """Determine if the conversation should continue. Returns: @@ -170,15 +445,20 @@ def should_continue_conversation(self) -> bool: if self.state.goal_achieved: return False - # Check for completion keywords in recent API responses + # Evaluate goal completion using the configured method if self.state.conversation_history: - recent_messages = self.state.conversation_history[-2:] # Last 2 messages - for message in recent_messages: - if message.role == "assistant": - message_lower = message.content.lower() - if any(keyword in message_lower for keyword in self.config.completion_keywords): - self.state.goal_achieved = True - return False + evaluation_result = await self.goal_evaluator.evaluate_goal_completion( + goal=self.config.goal, + conversation_history=self.state.conversation_history, + facts=self.config.facts + ) + + # Store evaluation result for reporting + self.state.goal_evaluation_result = evaluation_result + + if evaluation_result.goal_achieved: + self.state.goal_achieved = True + return False return True @@ -190,15 +470,25 @@ def get_initial_message(self) -> str: """ return self.config.initial_message - async def process_api_response(self, api_response: str) -> str: + async def process_api_response(self, api_response: str, triggering_message: Optional[str] = None) -> str: """Process an API response and generate the next user message. 
Args: api_response: Response from the API + triggering_message: The user message that triggered this API response (for initial message) Returns: Next user message """ + # Add the triggering user message if this is the first response + if triggering_message: + user_trigger_message = Message( + role="user", + content=triggering_message, + timestamp=datetime.now() + ) + self.state.conversation_history.append(user_trigger_message) + # Add API response to conversation history api_message = Message( role="assistant", @@ -229,13 +519,24 @@ def get_conversation_summary(self) -> Dict[str, Any]: Returns: Conversation summary """ - return { + summary = { "total_turns": self.state.turn_count, "goal_achieved": self.state.goal_achieved, "conversation_length": len(self.state.conversation_history), "facts_used": self._count_facts_used(), "goal": self.config.goal, } + + # Add goal evaluation details if available + if self.state.goal_evaluation_result: + summary.update({ + "goal_evaluation_method": self.state.goal_evaluation_result.evaluation_method, + "goal_evaluation_confidence": self.state.goal_evaluation_result.confidence, + "goal_evaluation_reasoning": self.state.goal_evaluation_result.reasoning, + "goal_evaluation_fallback_used": self.state.goal_evaluation_result.fallback_used, + }) + + return summary def _count_facts_used(self) -> int: """Count how many facts were used in the conversation. diff --git a/tests/intelligent_evaluation_example.yaml b/tests/intelligent_evaluation_example.yaml new file mode 100644 index 0000000..b898066 --- /dev/null +++ b/tests/intelligent_evaluation_example.yaml @@ -0,0 +1,73 @@ +# Example demonstrating intelligent goal evaluation +# This file shows how to avoid false positives with LLM-powered evaluation + +name: "Intelligent Goal Evaluation - Customer Support Example" +base_url: "https://api.example.com/support" +auth: + provider: noop +level: agent +replicant: + goal: "Get help resolving a billing dispute" + facts: + name: "Alex Chen" + account_number: "ACC-789456" + email: "alex.chen@email.com" + phone: "+1-555-9876" + issue_type: "billing_dispute" + disputed_charge: "$149.99 charged on March 15th" + expected_charge: "$99.99 monthly subscription" + account_type: "Premium" + system_prompt: | + You are a customer named Alex Chen who needs help with a billing dispute. + You notice an incorrect charge on your account and want it resolved. + You're polite but want a clear resolution. Provide information from your + facts when asked, but don't volunteer everything at once. + initial_message: "Hello, I have a billing issue I need help with." + max_turns: 12 + + # Traditional keywords that could cause false positives + completion_keywords: + - "resolved" + - "refund" + - "credited" + - "ticket created" + - "issue closed" + + # Intelligent evaluation that understands context + goal_evaluation_mode: "intelligent" + goal_evaluation_model: "openai:gpt-4o-mini" # Cost-effective evaluation model + goal_evaluation_prompt: | + Evaluate if the billing dispute has been truly resolved. 
Look for: + + ACHIEVED indicators: + โœ… Confirmed refund processed or account credited + โœ… Ticket resolution with reference number + โœ… Billing correction acknowledged and completed + โœ… Customer satisfaction confirmed + + NOT ACHIEVED indicators: + โŒ "I'll look into the refund" (promise, not completion) + โŒ "A ticket has been created" (process started, not resolved) + โŒ "The issue should be resolved soon" (future promise) + โŒ "Let me transfer you for the refund" (escalation, not resolution) + + Examples of false positives to avoid: + โŒ "I'll make sure this is resolved" โ‰  actual resolution + โŒ "Your refund will be processed" โ‰  refund completed + โŒ "I've created a ticket for this issue" โ‰  issue resolved + + Goal: {goal} + User Facts: {facts} + Recent Conversation: {conversation} + + RESULT: [ACHIEVED or NOT_ACHIEVED] + CONFIDENCE: [0.0 to 1.0] + REASONING: [Explain why the goal is/isn't achieved] + + fullconversation: true # Send full conversation history for context + payload_format: openai # Use standard format + + llm: + model: "openai:gpt-4o" # Main conversation model + temperature: 0.8 + max_tokens: 120 \ No newline at end of file
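For reference, both the default and the custom evaluation prompts above ask the model to answer in the same three-line RESULT / CONFIDENCE / REASONING format. A simplified sketch of how such a reply can be parsed (mirroring `_parse_llm_response` in `replicantx/scenarios/replicant.py`, not the exact implementation):

```python
# Illustrative parser for the RESULT / CONFIDENCE / REASONING reply format.
def parse_evaluation_reply(reply: str) -> tuple[bool, float, str]:
    achieved, confidence, reasoning = False, 0.5, "Could not parse reply"
    for line in reply.strip().splitlines():
        line = line.strip()
        if line.startswith("RESULT:"):
            achieved = line.removeprefix("RESULT:").strip().upper() == "ACHIEVED"
        elif line.startswith("CONFIDENCE:"):
            try:
                # Clamp to [0.0, 1.0], as the patch does.
                confidence = max(0.0, min(1.0, float(line.removeprefix("CONFIDENCE:").strip())))
            except ValueError:
                confidence = 0.5
        elif line.startswith("REASONING:"):
            reasoning = line.removeprefix("REASONING:").strip()
    return achieved, confidence, reasoning


print(parse_evaluation_reply(
    "RESULT: ACHIEVED\nCONFIDENCE: 0.92\nREASONING: Refund processed, reference #REF123."
))
# (True, 0.92, 'Refund processed, reference #REF123.')
```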