From 3145140dd22bfb72c1a17f872b62ac063f9b94f2 Mon Sep 17 00:00:00 2001 From: Gus Fraser Date: Fri, 25 Jul 2025 13:36:12 +0100 Subject: [PATCH 1/2] Add date/time to replicant system prompt --- pyproject.toml | 2 +- replicantx/scenarios/agent.py | 12 ++++++++++++ replicantx/scenarios/replicant.py | 11 +++++++++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bfcb9dc..bf807dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "replicantx" -version = "0.1.4" +version = "0.1.5" description = "End-to-end testing harness for AI agents via web service API" readme = "README.md" requires-python = ">=3.11" diff --git a/replicantx/scenarios/agent.py b/replicantx/scenarios/agent.py index 00e5f7a..93aadb4 100644 --- a/replicantx/scenarios/agent.py +++ b/replicantx/scenarios/agent.py @@ -149,7 +149,13 @@ async def run(self) -> ScenarioReport: # Initialize Replicant agent self.replicant_agent = ReplicantAgent.create(self.config.replicant) + current_datetime = datetime.now() + date_str = current_datetime.strftime("%A, %B %d, %Y") + time_str = current_datetime.strftime("%I:%M %p %Z") + self._debug_log("Replicant Agent initialized", { + "current_date": date_str, + "current_time": time_str, "goal": self.config.replicant.goal, "facts_count": len(self.config.replicant.facts), "facts": str(self.config.replicant.facts), @@ -174,7 +180,13 @@ async def run(self) -> ScenarioReport: # Initialize watch mode if self.watch: + current_datetime = datetime.now() + date_str = current_datetime.strftime("%A, %B %d, %Y") + time_str = current_datetime.strftime("%I:%M %p %Z") + self._watch_log("๐Ÿ‘ฅ [bold green]LIVE CONVERSATION[/bold green] - Starting agent scenario") + self._watch_log(f"๐Ÿ“… Date: {date_str}") + self._watch_log(f"๐Ÿ• Time: {time_str}") self._watch_log(f"๐ŸŽฏ Goal: {self.config.replicant.goal}") self._watch_log(f"๐Ÿ“ Facts: {len(self.config.replicant.facts)} items available") self._watch_log("") diff --git a/replicantx/scenarios/replicant.py b/replicantx/scenarios/replicant.py index 0feeac9..8795415 100644 --- a/replicantx/scenarios/replicant.py +++ b/replicantx/scenarios/replicant.py @@ -62,8 +62,14 @@ async def generate_response(self, api_message: str, conversation_state: Conversa Generated response """ try: + # Get current date and time + current_datetime = datetime.now() + date_str = current_datetime.strftime("%A, %B %d, %Y") + time_str = current_datetime.strftime("%I:%M %p %Z") + # Prepare context with facts AND conversation history - context = f"Available facts: {json.dumps(self.facts, indent=2)}\n\n" + context = f"Current date and time: {date_str} at {time_str}\n\n" + context += f"Available facts: {json.dumps(self.facts, indent=2)}\n\n" # Add conversation history for context if conversation_state.conversation_history: @@ -74,7 +80,8 @@ async def generate_response(self, api_message: str, conversation_state: Conversa context += f"Current API message: {api_message}\n\n" context += "Please generate a natural response as a user working toward your goal. " - context += "Use the available facts when appropriate, and respond naturally to the API's question or statement." + context += "Use the available facts when appropriate, and respond naturally to the API's question or statement. " + context += "You know the current date and time, so you can reference it when relevant to the conversation." 
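(One caveat worth noting for the snippet above, outside the patch itself: `datetime.now()` returns a timezone-naive value, so the `%Z` directive formats to an empty string. A minimal sketch, assuming a local timezone name is actually wanted in the prompt, of how an aware timestamp could be produced instead:)

```python
from datetime import datetime, timezone

# Illustration only, not part of the patch: datetime.now() is naive, so
# strftime("%Z") yields "". Converting to an aware local datetime first
# gives a usable timezone abbreviation.
aware_now = datetime.now(timezone.utc).astimezone()   # aware, local timezone
date_str = aware_now.strftime("%A, %B %d, %Y")
time_str = aware_now.strftime("%I:%M %p %Z")          # e.g. "01:36 PM BST"
print(date_str, time_str)
```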
# Create and use PydanticAI agent agent = self._create_agent() From 4ebe6051dae69f67b520195f58bef4ccd1d66632 Mon Sep 17 00:00:00 2001 From: Gus Fraser Date: Fri, 25 Jul 2025 17:17:00 +0100 Subject: [PATCH 2/2] Intelligent goal evaluation 0.1.5 --- README.md | 194 +++++++++++++ replicantx/models.py | 24 ++ replicantx/scenarios/agent.py | 26 +- replicantx/scenarios/replicant.py | 327 +++++++++++++++++++++- tests/intelligent_evaluation_example.yaml | 73 +++++ 5 files changed, 620 insertions(+), 24 deletions(-) create mode 100644 tests/intelligent_evaluation_example.yaml diff --git a/README.md b/README.md index 2d56eaa..e2bc491 100644 --- a/README.md +++ b/README.md @@ -973,6 +973,200 @@ system_prompt: | information when asked but expect efficient service. ``` +## ๐ŸŽฏ Goal Evaluation Modes + +ReplicantX provides intelligent goal evaluation to accurately determine when conversation objectives have been achieved, solving the common problem of false positives with simple keyword matching. + +### The Problem with Keywords + +Traditional keyword-based completion detection can produce false positives: + +```yaml +# Problematic scenario +completion_keywords: ["confirmed", "booked"] + +# False positive examples: +# โŒ "I'll let you know when your booking has been confirmed" (contains "confirmed") +# โŒ "Have you booked with us before?" (contains "booked") +# โŒ "Your booking confirmation is pending" (contains "booking") +``` + +### Three Evaluation Modes + +#### 1. **Keywords Mode** (Default - Backwards Compatible) +Simple substring matching - the original behavior: + +```yaml +replicant: + goal: "Book a flight to Paris" + goal_evaluation_mode: "keywords" # Default + completion_keywords: ["confirmed", "booked", "reservation number"] +``` + +**Use when:** +- โœ… Maintaining existing test compatibility +- โœ… Simple scenarios with clear completion signals +- โœ… Performance is critical (no LLM calls) + +#### 2. **Intelligent Mode** (Recommended) +LLM-powered analysis that understands context and intent: + +```yaml +replicant: + goal: "Book a business class flight to Paris" + goal_evaluation_mode: "intelligent" + goal_evaluation_model: "openai:gpt-4o-mini" # Optional: separate model for evaluation + completion_keywords: ["confirmed", "booked"] # Still required for compatibility +``` + +**Benefits:** +- โœ… **Context-aware**: Distinguishes promises from accomplishments +- โœ… **False positive reduction**: "I'll confirm later" โ‰  "Your booking is confirmed" +- โœ… **Intent understanding**: Recognizes goal completion without exact keywords +- โœ… **Reasoning provided**: Detailed explanation of evaluation decisions + +#### 3. **Hybrid Mode** (Best of Both Worlds) +Attempts LLM evaluation first, falls back to keywords if uncertain: + +```yaml +replicant: + goal: "Get help with billing issue" + goal_evaluation_mode: "hybrid" + goal_evaluation_model: "openai:gpt-4o-mini" + completion_keywords: ["resolved", "ticket created", "issue closed"] +``` + +**Benefits:** +- โœ… **Smart evaluation** when LLM is confident +- โœ… **Reliable fallback** when LLM is uncertain +- โœ… **Cost-effective** for mixed scenarios +- โœ… **Production-ready** with built-in safety net + +### Custom Evaluation Prompts + +For domain-specific scenarios, customize the evaluation logic: + +```yaml +replicant: + goal: "Complete a customer support ticket" + goal_evaluation_mode: "intelligent" + goal_evaluation_prompt: | + Evaluate if the customer support goal is achieved. Look for: + 1. Issue resolution confirmation from the agent + 2. 
Ticket number or reference provided + 3. Customer satisfaction or acknowledgment + 4. Clear closure statements + + Goal: {goal} + User Facts: {facts} + Recent Conversation: {conversation} + + Respond exactly: + RESULT: [ACHIEVED or NOT_ACHIEVED] + CONFIDENCE: [0.0 to 1.0] + REASONING: [Brief explanation] + completion_keywords: ["resolved", "ticket created"] +``` + +### Example: Flight Booking with Intelligent Evaluation + +```yaml +name: "Smart Flight Booking Test" +base_url: "https://api.example.com/chat" +auth: + provider: noop +level: agent +replicant: + goal: "Book a round-trip business class flight to Paris" + facts: + name: "Sarah Johnson" + email: "sarah@example.com" + travel_class: "business" + destination: "Paris" + departure_city: "New York" + travel_date: "next Friday" + return_date: "following Monday" + budget: "$3000" + system_prompt: | + You are a customer booking a flight. Provide information when asked + but don't volunteer everything upfront. Be conversational and natural. + initial_message: "Hi, I'd like to book a flight to Paris." + max_turns: 15 + + # Intelligent goal evaluation + goal_evaluation_mode: "intelligent" + goal_evaluation_model: "openai:gpt-4o-mini" # Fast, cost-effective model + + # Still needed for fallback/compatibility + completion_keywords: ["booked", "confirmed", "reservation number"] + + llm: + model: "openai:gpt-4o" + temperature: 0.7 + max_tokens: 150 +``` + +### Evaluation Results in Reports + +The watch mode now shows detailed evaluation information: + +```bash +๐Ÿ“Š CONVERSATION COMPLETE +๐Ÿ Status: โœ… SUCCESS +๐ŸŽฏ Goal achieved: Yes +๐Ÿง  Evaluation method: intelligent +๐Ÿ“Š Confidence: 0.89 +๐Ÿ’ญ Reasoning: The flight has been successfully booked with confirmation number ABC123 provided +``` + +### Migration Strategy + +**Phase 1: Test Intelligent Mode** +```yaml +# Update specific tests to use intelligent evaluation +goal_evaluation_mode: "intelligent" +``` + +**Phase 2: Adopt Hybrid Mode** +```yaml +# Use hybrid for safety while gaining intelligence +goal_evaluation_mode: "hybrid" +``` + +**Phase 3: Gradual Rollout** +```yaml +# Eventually make intelligent/hybrid the default for new tests +goal_evaluation_mode: "intelligent" +``` + +### When to Use Each Mode + +| Mode | Use Case | Pros | Cons | +|------|----------|------|------| +| **keywords** | Legacy tests, simple APIs | Fast, deterministic | False positives | +| **intelligent** | Modern apps, complex goals | Accurate, context-aware | Requires LLM | +| **hybrid** | Production, mixed scenarios | Smart + safe fallback | Slightly more complex | + +**Recommendation**: Start with `hybrid` mode for new tests to get the benefits of intelligent evaluation with keyword fallback safety. 
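As a quick standalone illustration (plain Python, not the ReplicantX API) of why bare substring matching over-triggers on the false-positive examples shown earlier:

```python
# Standalone sketch of "keywords" mode behaviour: plain substring matching.
completion_keywords = ["confirmed", "booked"]

replies = [
    "I'll let you know when your booking has been confirmed",  # promise, not completion
    "Have you booked with us before?",                         # unrelated question
    "Your booking is confirmed, reference ABC123",             # actual completion
]

for reply in replies:
    matched = any(k in reply.lower() for k in completion_keywords)
    print(f"{matched!s:<5} {reply}")

# All three replies match, even though only the last one means the goal was
# achieved -- exactly the false positives intelligent/hybrid modes are meant to catch.
```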
+ +### ๐Ÿงช Try the Example + +See a complete example that demonstrates false positive prevention: + +```bash +# Download the example test +curl -O https://raw.githubusercontent.com/helixtechnologies/replicantx/main/tests/intelligent_evaluation_example.yaml + +# Run with intelligent evaluation +replicantx run intelligent_evaluation_example.yaml --watch + +# Compare with keyword-only mode by changing goal_evaluation_mode to "keywords" +``` + +This example shows how intelligent evaluation distinguishes between: +- โŒ "I'll create a ticket for your issue" (promise) +- โœ… "Your refund has been processed, reference #REF123" (completion) + ## ๐Ÿง  LLM Integration ReplicantX uses **PydanticAI** for powerful LLM integration with multiple providers: diff --git a/replicantx/models.py b/replicantx/models.py index ecba850..59c5697 100644 --- a/replicantx/models.py +++ b/replicantx/models.py @@ -65,6 +65,13 @@ class SessionPlacement(str, Enum): URL = "url" # In URL path (RESTful) +class GoalEvaluationMode(str, Enum): + """Goal evaluation modes.""" + KEYWORDS = "keywords" # Simple keyword matching (legacy behavior) + INTELLIGENT = "intelligent" # LLM-based goal evaluation + HYBRID = "hybrid" # LLM with keyword fallback + + class LLMConfig(BaseModel): """Configuration for LLM using PydanticAI models.""" model_config = ConfigDict(extra="forbid") @@ -90,6 +97,18 @@ class Message(BaseModel): metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata") +class GoalEvaluationResult(BaseModel): + """Result of goal evaluation.""" + model_config = ConfigDict(extra="forbid") + + goal_achieved: bool = Field(..., description="Whether the goal has been achieved") + confidence: float = Field(..., description="Confidence score from 0.0 to 1.0") + reasoning: str = Field(..., description="Explanation of why the goal is/isn't achieved") + evaluation_method: str = Field(..., description="Method used: 'keywords', 'intelligent', or 'hybrid'") + fallback_used: bool = Field(False, description="Whether hybrid mode fell back to keywords") + timestamp: datetime = Field(default_factory=datetime.now, description="When evaluation was performed") + + class AssertionResult(BaseModel): """Result of an assertion check.""" model_config = ConfigDict(extra="forbid") @@ -195,6 +214,11 @@ class ReplicantConfig(BaseModel): session_placement: SessionPlacement = Field(SessionPlacement.BODY, description="Session ID placement: 'header', 'body', or 'url' (default: body)") session_variable_name: str = Field("session_id", description="Name of the session variable in header/body (default: session_id)") llm: LLMConfig = Field(default_factory=LLMConfig, description="LLM configuration for response generation") + + # Goal evaluation configuration + goal_evaluation_mode: GoalEvaluationMode = Field(GoalEvaluationMode.KEYWORDS, description="Goal evaluation mode: 'keywords' (default), 'intelligent', or 'hybrid'") + goal_evaluation_model: Optional[str] = Field(None, description="PydanticAI model for goal evaluation (defaults to main LLM model if not specified)") + goal_evaluation_prompt: Optional[str] = Field(None, description="Custom prompt for goal evaluation (uses default if not specified)") class ScenarioConfig(BaseModel): diff --git a/replicantx/scenarios/agent.py b/replicantx/scenarios/agent.py index 93aadb4..14ee434 100644 --- a/replicantx/scenarios/agent.py +++ b/replicantx/scenarios/agent.py @@ -203,17 +203,8 @@ async def run(self) -> ScenarioReport: self._watch_log(f"๐Ÿ‘ค [bold cyan]User:[/bold cyan] 
{current_message}") - # Record initial message in conversation history - from ..models import Message - initial_message = Message( - role="user", - content=current_message, - timestamp=datetime.now() - ) - self.replicant_agent.state.conversation_history.append(initial_message) - # Continue conversation until completion or limits reached - while self.replicant_agent.should_continue_conversation(): + while await self.replicant_agent.should_continue_conversation(): self._debug_log(f"Executing conversation step {step_index + 1}", { "user_message": current_message, "turn_count": self.replicant_agent.state.turn_count, @@ -266,7 +257,9 @@ async def run(self) -> ScenarioReport: "parsed_response": parsed_response }) - current_message = await self.replicant_agent.process_api_response(parsed_response) + # For the first response, pass the triggering message to add to conversation history + triggering_message = current_message if step_index == 0 else None + current_message = await self.replicant_agent.process_api_response(parsed_response, triggering_message) self._debug_log("Generated next user message", { "next_message": current_message, @@ -309,6 +302,17 @@ async def run(self) -> ScenarioReport: self._watch_log(f"๐ŸŽฏ Goal achieved: {'Yes' if conversation_summary.get('goal_achieved', False) else 'No'}") self._watch_log(f"๐Ÿ“ Facts used: {conversation_summary.get('facts_used', 0)}") self._watch_log(f"๐Ÿ’ฌ Total turns: {conversation_summary.get('total_turns', 0)}") + + # Add goal evaluation details if available + if 'goal_evaluation_method' in conversation_summary: + method = conversation_summary.get('goal_evaluation_method', 'unknown') + confidence = conversation_summary.get('goal_evaluation_confidence', 0.0) + fallback = conversation_summary.get('goal_evaluation_fallback_used', False) + reasoning = conversation_summary.get('goal_evaluation_reasoning', 'No reasoning provided') + + self._watch_log(f"๐Ÿง  Evaluation method: {method}" + (" (fallback used)" if fallback else "")) + self._watch_log(f"๐Ÿ“Š Confidence: {confidence:.2f}") + self._watch_log(f"๐Ÿ’ญ Reasoning: {reasoning}") self._debug_log("Scenario completed successfully", { "passed": report.passed, diff --git a/replicantx/scenarios/replicant.py b/replicantx/scenarios/replicant.py index 8795415..46eb39e 100644 --- a/replicantx/scenarios/replicant.py +++ b/replicantx/scenarios/replicant.py @@ -16,7 +16,7 @@ from pydantic_ai import Agent as PydanticAgent from pydantic_ai.models import infer_model -from ..models import ReplicantConfig, Message +from ..models import ReplicantConfig, Message, GoalEvaluationResult, GoalEvaluationMode from ..tools.http_client import HTTPResponse @@ -28,6 +28,277 @@ class ConversationState(BaseModel): goal_achieved: bool = Field(False, description="Whether the goal has been achieved") conversation_history: List[Message] = Field(default_factory=list, description="Full conversation history") extracted_info: Dict[str, Any] = Field(default_factory=dict, description="Information extracted from the conversation") + goal_evaluation_result: Optional[GoalEvaluationResult] = Field(None, description="Latest goal evaluation result") + + +class GoalEvaluator(BaseModel): + """Evaluates whether conversation goals have been achieved using different strategies.""" + model_config = {"extra": "forbid"} + + mode: GoalEvaluationMode = Field(..., description="Evaluation mode") + model_name: Optional[str] = Field(None, description="Model for intelligent evaluation") + custom_prompt: Optional[str] = Field(None, description="Custom evaluation 
prompt") + completion_keywords: List[str] = Field(..., description="Keywords for keyword-based evaluation") + + @classmethod + def create(cls, config: ReplicantConfig) -> "GoalEvaluator": + """Create a GoalEvaluator from ReplicantConfig. + + Args: + config: Replicant configuration + + Returns: + Configured GoalEvaluator + """ + model_name = config.goal_evaluation_model or config.llm.model + + return cls( + mode=config.goal_evaluation_mode, + model_name=model_name, + custom_prompt=config.goal_evaluation_prompt, + completion_keywords=config.completion_keywords + ) + + async def evaluate_goal_completion( + self, + goal: str, + conversation_history: List[Message], + facts: Dict[str, Any] + ) -> GoalEvaluationResult: + """Evaluate whether the conversation goal has been achieved. + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + facts: Available facts for context + + Returns: + Goal evaluation result + """ + if self.mode == GoalEvaluationMode.KEYWORDS: + return self._evaluate_with_keywords(goal, conversation_history) + elif self.mode == GoalEvaluationMode.INTELLIGENT: + return await self._evaluate_with_llm(goal, conversation_history, facts) + elif self.mode == GoalEvaluationMode.HYBRID: + return await self._evaluate_hybrid(goal, conversation_history, facts) + else: + raise ValueError(f"Unknown goal evaluation mode: {self.mode}") + + def _evaluate_with_keywords( + self, + goal: str, + conversation_history: List[Message] + ) -> GoalEvaluationResult: + """Evaluate goal completion using keyword matching (legacy behavior). + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + + Returns: + Goal evaluation result + """ + # Check for completion keywords in recent API responses + goal_achieved = False + matched_keywords = [] + + if conversation_history: + recent_messages = conversation_history[-2:] # Last 2 messages + for message in recent_messages: + if message.role == "assistant": + message_lower = message.content.lower() + for keyword in self.completion_keywords: + if keyword.lower() in message_lower: + goal_achieved = True + matched_keywords.append(keyword) + + reasoning = f"Keyword evaluation: {'Found' if goal_achieved else 'No'} completion keywords" + if matched_keywords: + reasoning += f" (matched: {', '.join(matched_keywords)})" + + return GoalEvaluationResult( + goal_achieved=goal_achieved, + confidence=1.0 if goal_achieved else 0.0, # Keywords give binary confidence + reasoning=reasoning, + evaluation_method="keywords", + fallback_used=False + ) + + async def _evaluate_with_llm( + self, + goal: str, + conversation_history: List[Message], + facts: Dict[str, Any] + ) -> GoalEvaluationResult: + """Evaluate goal completion using LLM analysis. + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + facts: Available facts for context + + Returns: + Goal evaluation result + """ + try: + # Build evaluation prompt + prompt = self._build_evaluation_prompt(goal, conversation_history, facts) + + # Create LLM agent for evaluation + model = infer_model(self.model_name) + agent = PydanticAgent( + model=model, + instructions="You are an expert at evaluating whether conversation goals have been achieved. 
Be precise and analytical.", + model_settings={"temperature": 0.1, "max_tokens": 200} # Low temperature for consistency + ) + + # Get evaluation + result = await agent.run(prompt) + response = result.output.strip() + + # Parse LLM response + goal_achieved, confidence, reasoning = self._parse_llm_response(response) + + return GoalEvaluationResult( + goal_achieved=goal_achieved, + confidence=confidence, + reasoning=reasoning, + evaluation_method="intelligent", + fallback_used=False + ) + + except Exception as e: + # Return failure result if LLM evaluation fails + return GoalEvaluationResult( + goal_achieved=False, + confidence=0.0, + reasoning=f"LLM evaluation failed: {str(e)}", + evaluation_method="intelligent", + fallback_used=False + ) + + async def _evaluate_hybrid( + self, + goal: str, + conversation_history: List[Message], + facts: Dict[str, Any] + ) -> GoalEvaluationResult: + """Evaluate goal completion using LLM with keyword fallback. + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + facts: Available facts for context + + Returns: + Goal evaluation result + """ + # Try LLM evaluation first + try: + llm_result = await self._evaluate_with_llm(goal, conversation_history, facts) + if llm_result.confidence > 0.5: # Use LLM result if confident + return llm_result + except Exception: + pass + + # Fall back to keyword evaluation + keyword_result = self._evaluate_with_keywords(goal, conversation_history) + keyword_result.evaluation_method = "hybrid" + keyword_result.fallback_used = True + keyword_result.reasoning = f"LLM evaluation uncertain, using keyword fallback: {keyword_result.reasoning}" + + return keyword_result + + def _build_evaluation_prompt( + self, + goal: str, + conversation_history: List[Message], + facts: Dict[str, Any] + ) -> str: + """Build the evaluation prompt for LLM analysis. + + Args: + goal: The goal to evaluate + conversation_history: Full conversation history + facts: Available facts for context + + Returns: + Formatted evaluation prompt + """ + if self.custom_prompt: + # Use custom prompt with variable substitution + return self.custom_prompt.format( + goal=goal, + facts=json.dumps(facts, indent=2), + conversation=self._format_conversation_for_prompt(conversation_history) + ) + + # Default evaluation prompt + prompt = f"""Given this conversation goal: "{goal}" + +User facts: {json.dumps(facts, indent=2)} + +Recent conversation history: +{self._format_conversation_for_prompt(conversation_history[-6:])} + +Has the goal been definitively achieved? Consider: +1. Has the user received confirmation that the action was completed? +2. Are there concrete indicators of success (confirmation numbers, bookings, etc.)? +3. Distinguish between promises ("I will do this") vs accomplishments ("This is done") +4. Look for specific completion indicators, not just polite acknowledgments + +Respond in this exact format: +RESULT: [ACHIEVED or NOT_ACHIEVED] +CONFIDENCE: [0.0 to 1.0] +REASONING: [Brief explanation of your decision]""" + + return prompt + + def _format_conversation_for_prompt(self, messages: List[Message]) -> str: + """Format conversation history for the evaluation prompt. 
+ + Args: + messages: List of messages to format + + Returns: + Formatted conversation string + """ + formatted = [] + for msg in messages: + role = "User" if msg.role == "user" else "Assistant" + formatted.append(f"{role}: {msg.content}") + return "\n".join(formatted) + + def _parse_llm_response(self, response: str) -> Tuple[bool, float, str]: + """Parse LLM evaluation response. + + Args: + response: Raw LLM response + + Returns: + Tuple of (goal_achieved, confidence, reasoning) + """ + lines = response.strip().split('\n') + + goal_achieved = False + confidence = 0.5 + reasoning = "Could not parse LLM response" + + for line in lines: + line = line.strip() + if line.startswith('RESULT:'): + result_text = line.replace('RESULT:', '').strip().upper() + goal_achieved = result_text == 'ACHIEVED' + elif line.startswith('CONFIDENCE:'): + try: + confidence = float(line.replace('CONFIDENCE:', '').strip()) + confidence = max(0.0, min(1.0, confidence)) # Clamp to [0, 1] + except ValueError: + confidence = 0.5 + elif line.startswith('REASONING:'): + reasoning = line.replace('REASONING:', '').strip() + + return goal_achieved, confidence, reasoning class ResponseGenerator(BaseModel): @@ -126,6 +397,7 @@ class ReplicantAgent(BaseModel): config: ReplicantConfig = Field(..., description="Replicant configuration") state: ConversationState = Field(default_factory=ConversationState, description="Current conversation state") response_generator: ResponseGenerator = Field(..., description="Response generation utility") + goal_evaluator: GoalEvaluator = Field(..., description="Goal evaluation utility") @classmethod def create(cls, config: ReplicantConfig) -> "ReplicantAgent": @@ -151,12 +423,15 @@ def create(cls, config: ReplicantConfig) -> "ReplicantAgent": facts=config.facts ) + goal_evaluator = GoalEvaluator.create(config) + return cls( config=config, - response_generator=response_generator + response_generator=response_generator, + goal_evaluator=goal_evaluator ) - def should_continue_conversation(self) -> bool: + async def should_continue_conversation(self) -> bool: """Determine if the conversation should continue. Returns: @@ -170,15 +445,20 @@ def should_continue_conversation(self) -> bool: if self.state.goal_achieved: return False - # Check for completion keywords in recent API responses + # Evaluate goal completion using the configured method if self.state.conversation_history: - recent_messages = self.state.conversation_history[-2:] # Last 2 messages - for message in recent_messages: - if message.role == "assistant": - message_lower = message.content.lower() - if any(keyword in message_lower for keyword in self.config.completion_keywords): - self.state.goal_achieved = True - return False + evaluation_result = await self.goal_evaluator.evaluate_goal_completion( + goal=self.config.goal, + conversation_history=self.state.conversation_history, + facts=self.config.facts + ) + + # Store evaluation result for reporting + self.state.goal_evaluation_result = evaluation_result + + if evaluation_result.goal_achieved: + self.state.goal_achieved = True + return False return True @@ -190,15 +470,25 @@ def get_initial_message(self) -> str: """ return self.config.initial_message - async def process_api_response(self, api_response: str) -> str: + async def process_api_response(self, api_response: str, triggering_message: Optional[str] = None) -> str: """Process an API response and generate the next user message. 
Args: api_response: Response from the API + triggering_message: The user message that triggered this API response (for initial message) Returns: Next user message """ + # Add the triggering user message if this is the first response + if triggering_message: + user_trigger_message = Message( + role="user", + content=triggering_message, + timestamp=datetime.now() + ) + self.state.conversation_history.append(user_trigger_message) + # Add API response to conversation history api_message = Message( role="assistant", @@ -229,13 +519,24 @@ def get_conversation_summary(self) -> Dict[str, Any]: Returns: Conversation summary """ - return { + summary = { "total_turns": self.state.turn_count, "goal_achieved": self.state.goal_achieved, "conversation_length": len(self.state.conversation_history), "facts_used": self._count_facts_used(), "goal": self.config.goal, } + + # Add goal evaluation details if available + if self.state.goal_evaluation_result: + summary.update({ + "goal_evaluation_method": self.state.goal_evaluation_result.evaluation_method, + "goal_evaluation_confidence": self.state.goal_evaluation_result.confidence, + "goal_evaluation_reasoning": self.state.goal_evaluation_result.reasoning, + "goal_evaluation_fallback_used": self.state.goal_evaluation_result.fallback_used, + }) + + return summary def _count_facts_used(self) -> int: """Count how many facts were used in the conversation. diff --git a/tests/intelligent_evaluation_example.yaml b/tests/intelligent_evaluation_example.yaml new file mode 100644 index 0000000..b898066 --- /dev/null +++ b/tests/intelligent_evaluation_example.yaml @@ -0,0 +1,73 @@ +# Example demonstrating intelligent goal evaluation +# This file shows how to avoid false positives with LLM-powered evaluation + +name: "Intelligent Goal Evaluation - Customer Support Example" +base_url: "https://api.example.com/support" +auth: + provider: noop +level: agent +replicant: + goal: "Get help resolving a billing dispute" + facts: + name: "Alex Chen" + account_number: "ACC-789456" + email: "alex.chen@email.com" + phone: "+1-555-9876" + issue_type: "billing_dispute" + disputed_charge: "$149.99 charged on March 15th" + expected_charge: "$99.99 monthly subscription" + account_type: "Premium" + system_prompt: | + You are a customer named Alex Chen who needs help with a billing dispute. + You notice an incorrect charge on your account and want it resolved. + You're polite but want a clear resolution. Provide information from your + facts when asked, but don't volunteer everything at once. + initial_message: "Hello, I have a billing issue I need help with." + max_turns: 12 + + # Traditional keywords that could cause false positives + completion_keywords: + - "resolved" + - "refund" + - "credited" + - "ticket created" + - "issue closed" + + # Intelligent evaluation that understands context + goal_evaluation_mode: "intelligent" + goal_evaluation_model: "openai:gpt-4o-mini" # Cost-effective evaluation model + goal_evaluation_prompt: | + Evaluate if the billing dispute has been truly resolved. 
Look for: + + ACHIEVED indicators: + โœ… Confirmed refund processed or account credited + โœ… Ticket resolution with reference number + โœ… Billing correction acknowledged and completed + โœ… Customer satisfaction confirmed + + NOT ACHIEVED indicators: + โŒ "I'll look into the refund" (promise, not completion) + โŒ "A ticket has been created" (process started, not resolved) + โŒ "The issue should be resolved soon" (future promise) + โŒ "Let me transfer you for the refund" (escalation, not resolution) + + Examples of false positives to avoid: + โŒ "I'll make sure this is resolved" โ‰  actual resolution + โŒ "Your refund will be processed" โ‰  refund completed + โŒ "I've created a ticket for this issue" โ‰  issue resolved + + Goal: {goal} + User Facts: {facts} + Recent Conversation: {conversation} + + RESULT: [ACHIEVED or NOT_ACHIEVED] + CONFIDENCE: [0.0 to 1.0] + REASONING: [Explain why the goal is/isn't achieved] + + fullconversation: true # Send full conversation history for context + payload_format: openai # Use standard format + + llm: + model: "openai:gpt-4o" # Main conversation model + temperature: 0.8 + max_tokens: 120 \ No newline at end of file
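For reference, both the default and the custom evaluation prompts above ask the model to answer in the same three-line RESULT / CONFIDENCE / REASONING format. A simplified sketch of how such a reply can be parsed (mirroring `_parse_llm_response` in `replicantx/scenarios/replicant.py`, not the exact implementation):

```python
# Illustrative parser for the RESULT / CONFIDENCE / REASONING reply format.
def parse_evaluation_reply(reply: str) -> tuple[bool, float, str]:
    achieved, confidence, reasoning = False, 0.5, "Could not parse reply"
    for line in reply.strip().splitlines():
        line = line.strip()
        if line.startswith("RESULT:"):
            achieved = line.removeprefix("RESULT:").strip().upper() == "ACHIEVED"
        elif line.startswith("CONFIDENCE:"):
            try:
                # Clamp to [0.0, 1.0], as the patch does.
                confidence = max(0.0, min(1.0, float(line.removeprefix("CONFIDENCE:").strip())))
            except ValueError:
                confidence = 0.5
        elif line.startswith("REASONING:"):
            reasoning = line.removeprefix("REASONING:").strip()
    return achieved, confidence, reasoning


print(parse_evaluation_reply(
    "RESULT: ACHIEVED\nCONFIDENCE: 0.92\nREASONING: Refund processed, reference #REF123."
))
# (True, 0.92, 'Refund processed, reference #REF123.')
```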