From 8bb946c6cc6ed6c6d9e5db9e07febeb21f20983d Mon Sep 17 00:00:00 2001
From: Gus Fraser
Date: Fri, 8 Aug 2025 14:46:28 +0100
Subject: [PATCH] GPT-5 workaround

Don't send temperature anymore
Added --verbose flag
---
 pyproject.toml                    | 35 +++++++++++++------------
 replicantx/cli.py                 |  6 ++---
 replicantx/scenarios/agent.py     |  8 +++---
 replicantx/scenarios/replicant.py | 43 ++++++++++++++++++++++++++-----
 4 files changed, 62 insertions(+), 30 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index ced61be..5705e32 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "replicantx"
-version = "0.1.9"
+version = "0.1.10"
 description = "End-to-end testing harness for AI agents via web service API"
 readme = "README.md"
 requires-python = ">=3.11"
@@ -14,7 +14,7 @@ authors = [
 ]
 keywords = ["ai", "agent", "testing", "e2e", "api"]
 classifiers = [
-    "Development Status :: 3 - Alpha",
+    "Development Status :: 4 - Beta",
     "Intended Audience :: Developers",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3.11",
@@ -23,20 +23,21 @@ classifiers = [
 ]
 
 dependencies = [
-    "pydantic>=2.7",
-    "typer>=0.12.0",
-    "httpx>=0.27.0",
-    "PyYAML>=6.0",
-    "supabase>=2.0.0",
-    "jinja2>=3.1.0",
-    "rich>=13.0.0",
-    "pydantic-ai>=0.3.0",
-    "python-dotenv>=1.0.0",
+    "pydantic>=2.11.7",
+    "typer>=0.16.0",
+    "httpx>=0.28.1",
+    "PyYAML>=6.0.2",
+    "supabase>=2.18.0",
+    "jinja2>=3.1.6",
+    "rich>=14.1.0",
+    "pydantic-ai>=0.6.2",
+    "python-dotenv>=1.1.1",
+    "typing-extensions>=4.14.1",
 ]
 
 [project.optional-dependencies]
 cli = [
-    "typer[all]>=0.12.0",
+    "typer[all]>=0.16.0",
 ]
 dev = [
     "pytest>=8.0.0",
@@ -48,15 +49,15 @@ dev = [
 # Note: PydanticAI is included in core dependencies and handles all LLM providers
 # Optional providers can be installed separately based on PydanticAI documentation
 openai = [
-    "openai>=1.0.0",
+    "openai>=1.99.3",
 ]
 anthropic = [
-    "anthropic>=0.34.0",
+    "anthropic>=0.61.0",
 ]
 all = [
-    "typer[all]>=0.12.0",
-    "openai>=1.0.0",
-    "anthropic>=0.34.0",
+    "typer[all]>=0.16.0",
+    "openai>=1.99.3",
+    "anthropic>=0.61.0",
 ]
 
 [project.scripts]
diff --git a/replicantx/cli.py b/replicantx/cli.py
index 23138ff..0bba369 100644
--- a/replicantx/cli.py
+++ b/replicantx/cli.py
@@ -73,7 +73,7 @@ def run(
         False, "--ci", help="CI mode: exit with non-zero code if any tests fail"
     ),
     verbose: bool = typer.Option(
-        False, "--verbose", "-v", help="Enable verbose output"
+        False, "--verbose", help="Enable verbose output"
     ),
     debug: bool = typer.Option(
         False, "--debug", help="Enable debug mode: Shows detailed technical information including HTTP client setup, request payloads, response validation, AI processing, and assertion results. Perfect for troubleshooting failed tests and performance analysis."
@@ -266,7 +266,7 @@ async def run_scenarios_sequential(
     if config.level == TestLevel.BASIC:
         runner = BasicScenarioRunner(config, debug=debug, watch=watch)
     elif config.level == TestLevel.AGENT:
-        runner = AgentScenarioRunner(config, debug=debug, watch=watch)
+        runner = AgentScenarioRunner(config, debug=debug, watch=watch, verbose=verbose)
     else:
         raise ValueError(f"Unsupported test level: {config.level}")
 
@@ -400,7 +400,7 @@ async def _execute_scenario(
     if config.level == TestLevel.BASIC:
         runner = BasicScenarioRunner(config, debug=debug, watch=watch)
     elif config.level == TestLevel.AGENT:
-        runner = AgentScenarioRunner(config, debug=debug, watch=watch)
+        runner = AgentScenarioRunner(config, debug=debug, watch=watch, verbose=verbose)
     else:
         raise ValueError(f"Unsupported test level: {config.level}")
 
diff --git a/replicantx/scenarios/agent.py b/replicantx/scenarios/agent.py
index d41dcec..bd77be3 100644
--- a/replicantx/scenarios/agent.py
+++ b/replicantx/scenarios/agent.py
@@ -27,18 +27,20 @@ class AgentScenarioRunner:
     """Runner for Replicant agent-driven (Level 2) test scenarios."""
 
-    def __init__(self, config: ScenarioConfig, debug: bool = False, watch: bool = False):
+    def __init__(self, config: ScenarioConfig, debug: bool = False, watch: bool = False, verbose: bool = False):
         """Initialize the agent scenario runner.
 
         Args:
             config: Scenario configuration with Replicant agent setup
             debug: Enable debug mode with technical details
             watch: Enable watch mode for real-time monitoring
+            verbose: Enable verbose output for system prompts
         """
 
         self.config = config
         self.debug = debug
         self.watch = watch
-        self.console = Console() if (debug or watch) else None
+        self.verbose = verbose
+        self.console = Console() if (debug or watch or verbose) else None
         self.auth_provider = self._create_auth_provider()
         self.http_client: Optional[HTTPClient] = None
         self.replicant_agent: Optional[ReplicantAgent] = None
@@ -147,7 +149,7 @@ async def run(self) -> ScenarioReport:
             })
 
         # Initialize Replicant agent
-        self.replicant_agent = ReplicantAgent.create(self.config.replicant)
+        self.replicant_agent = ReplicantAgent.create(self.config.replicant, verbose=self.verbose)
 
         current_datetime = datetime.now()
         date_str = current_datetime.strftime("%A, %B %d, %Y")
diff --git a/replicantx/scenarios/replicant.py b/replicantx/scenarios/replicant.py
index eeb68d1..7517dd1 100644
--- a/replicantx/scenarios/replicant.py
+++ b/replicantx/scenarios/replicant.py
@@ -39,9 +39,10 @@ class GoalEvaluator(BaseModel):
     model_name: Optional[str] = Field(None, description="Model for intelligent evaluation")
     custom_prompt: Optional[str] = Field(None, description="Custom evaluation prompt")
     completion_keywords: List[str] = Field(..., description="Keywords for keyword-based evaluation")
+    verbose: bool = Field(False, description="Enable verbose output for system prompts")
 
     @classmethod
-    def create(cls, config: ReplicantConfig) -> "GoalEvaluator":
+    def create(cls, config: ReplicantConfig, verbose: bool = False) -> "GoalEvaluator":
         """Create a GoalEvaluator from ReplicantConfig.
 
         Args:
@@ -56,7 +57,8 @@ def create(cls, config: ReplicantConfig) -> "GoalEvaluator":
             mode=config.goal_evaluation_mode,
             model_name=model_name,
             custom_prompt=config.goal_evaluation_prompt,
-            completion_keywords=config.completion_keywords
+            completion_keywords=config.completion_keywords,
+            verbose=verbose
         )
 
     async def evaluate_goal_completion(
@@ -146,12 +148,24 @@ async def _evaluate_with_llm(
         # Create LLM agent for evaluation
        model = infer_model(self.model_name)
 
+        # Only include max_tokens for evaluation - don't set temperature to avoid compatibility issues
         agent = PydanticAgent(
             model=model,
             instructions="You are an expert at evaluating whether conversation goals have been achieved. Be precise and analytical.",
-            model_settings={"temperature": 0.1, "max_tokens": 200}  # Low temperature for consistency
+            model_settings={"max_tokens": 1000}  # Only include max_tokens, skip temperature for compatibility
         )
 
+        # Verbose logging of the goal evaluation prompt
+        if self.verbose:
+            print("\n" + "="*80)
+            print("🔍 VERBOSE: GOAL EVALUATION PROMPT SENT TO PYDANTICAI")
+            print("="*80)
+            print(f"Model: {self.model_name}")
+            print(f"Model Settings: {{'max_tokens': 1000}}")
+            print(f"Instructions: You are an expert at evaluating whether conversation goals have been achieved. Be precise and analytical.")
+            print(f"Prompt: {prompt}")
+            print("="*80 + "\n")
+
         # Get evaluation
         result = await agent.run(prompt)
         response = result.output.strip()
@@ -327,6 +341,7 @@ class ResponseGenerator(BaseModel):
     system_prompt: str = Field(..., description="System prompt for response generation")
     model_settings: Dict[str, Any] = Field(default_factory=dict, description="Model settings")
     facts: Dict[str, Any] = Field(..., description="Available facts")
+    verbose: bool = Field(False, description="Enable verbose output for system prompts")
 
     def _create_agent(self) -> PydanticAgent:
         """Create a PydanticAI agent instance."""
@@ -374,6 +389,18 @@ async def generate_response(self, api_message: str, conversation_state: Conversa
 
         # Create and use PydanticAI agent
         agent = self._create_agent()
+
+        # Verbose logging of the complete system prompt
+        if self.verbose:
+            print("\n" + "="*80)
+            print("🔍 VERBOSE: COMPLETE SYSTEM PROMPT SENT TO PYDANTICAI")
+            print("="*80)
+            print(f"Model: {self.model_name}")
+            print(f"Model Settings: {self.model_settings}")
+            print(f"System Prompt: {self.system_prompt}")
+            print(f"Context: {context}")
+            print("="*80 + "\n")
+
         result = await agent.run(context)
         return result.output
 
@@ -418,16 +445,17 @@ class ReplicantAgent(BaseModel):
     goal_evaluator: GoalEvaluator = Field(..., description="Goal evaluation utility")
 
     @classmethod
-    def create(cls, config: ReplicantConfig) -> "ReplicantAgent":
+    def create(cls, config: ReplicantConfig, verbose: bool = False) -> "ReplicantAgent":
         """Create a new Replicant agent.
 
         Args:
             config: Replicant configuration
+            verbose: Enable verbose output for system prompts
 
         Returns:
             Configured Replicant agent
         """
-        # Build model settings
+        # Build model settings - only include parameters that are explicitly provided
         model_settings = {}
         if config.llm.temperature is not None:
             model_settings["temperature"] = config.llm.temperature
@@ -438,10 +466,11 @@ def create(cls, config: ReplicantConfig) -> "ReplicantAgent":
             model_name=config.llm.model,
             system_prompt=config.system_prompt,
             model_settings=model_settings,
-            facts=config.facts
+            facts=config.facts,
+            verbose=verbose
        )
 
-        goal_evaluator = GoalEvaluator.create(config)
+        goal_evaluator = GoalEvaluator.create(config, verbose=verbose)
 
         return cls(
             config=config,
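
The workaround itself can be sketched in a few lines. This is a minimal illustration, not part of the patch: it reuses only the PydanticAI calls already visible in the diff (infer_model, PydanticAgent, agent.run, result.output), while the import paths, the "openai:gpt-5" model name, and the max_tokens value are assumptions for the example. The idea is to build model_settings conditionally and add temperature only when a value is explicitly configured, so models such as GPT-5 that reject a non-default temperature never receive the parameter.

# Sketch of the temperature-omission pattern applied by the patch (illustrative only).
from typing import Optional

from pydantic_ai import Agent as PydanticAgent        # assumed import path
from pydantic_ai.models import infer_model             # assumed import path


async def run_without_temperature(prompt: str, temperature: Optional[float] = None) -> str:
    # Only include parameters that are explicitly provided; never force a
    # temperature on models that do not accept one.
    model_settings = {"max_tokens": 1000}
    if temperature is not None:
        model_settings["temperature"] = temperature

    agent = PydanticAgent(
        model=infer_model("openai:gpt-5"),  # placeholder model name
        instructions="You are an expert at evaluating whether conversation goals have been achieved.",
        model_settings=model_settings,
    )
    result = await agent.run(prompt)
    return result.output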