35 changes: 18 additions & 17 deletions pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "replicantx"
version = "0.1.9"
version = "0.1.10"
description = "End-to-end testing harness for AI agents via web service API"
readme = "README.md"
requires-python = ">=3.11"
@@ -14,7 +14,7 @@ authors = [
]
keywords = ["ai", "agent", "testing", "e2e", "api"]
classifiers = [
"Development Status :: 3 - Alpha",
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
@@ -23,20 +23,21 @@ classifiers = [
]

dependencies = [
"pydantic>=2.7",
"typer>=0.12.0",
"httpx>=0.27.0",
"PyYAML>=6.0",
"supabase>=2.0.0",
"jinja2>=3.1.0",
"rich>=13.0.0",
"pydantic-ai>=0.3.0",
"python-dotenv>=1.0.0",
"pydantic>=2.11.7",
"typer>=0.16.0",
"httpx>=0.28.1",
"PyYAML>=6.0.2",
"supabase>=2.18.0",
"jinja2>=3.1.6",
"rich>=14.1.0",
"pydantic-ai>=0.6.2",
"python-dotenv>=1.1.1",
"typing-extensions>=4.14.1",
]

[project.optional-dependencies]
cli = [
"typer[all]>=0.12.0",
"typer[all]>=0.16.0",
]
dev = [
"pytest>=8.0.0",
@@ -48,15 +49,15 @@ dev = [
# Note: PydanticAI is included in core dependencies and handles all LLM providers
# Optional providers can be installed separately based on PydanticAI documentation
openai = [
"openai>=1.0.0",
"openai>=1.99.3",
]
anthropic = [
"anthropic>=0.34.0",
"anthropic>=0.61.0",
]
all = [
"typer[all]>=0.12.0",
"openai>=1.0.0",
"anthropic>=0.34.0",
"typer[all]>=0.16.0",
"openai>=1.99.3",
"anthropic>=0.61.0",
]

[project.scripts]
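The dependency floors above are raised across the board (for example pydantic 2.7 → 2.11.7 and pydantic-ai 0.3.0 → 0.6.2). A quick way to see whether an existing environment already satisfies the new minimums is to compare installed versions against those floors; the snippet below is an illustrative sketch using only the standard library, not part of this PR, and its version parsing is deliberately naive.

# Illustrative check (not part of this PR): does the active environment meet the
# raised dependency floors from pyproject.toml?
from importlib.metadata import PackageNotFoundError, version

FLOORS = {
    "pydantic": "2.11.7",
    "typer": "0.16.0",
    "httpx": "0.28.1",
    "pydantic-ai": "0.6.2",
    "rich": "14.1.0",
}

def as_tuple(v: str) -> tuple:
    # Naive numeric parse; adequate for the plain X.Y.Z floors listed above.
    return tuple(int(part) for part in v.split(".")[:3] if part.isdigit())

for package, floor in FLOORS.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        print(f"{package}: not installed (needs >= {floor})")
        continue
    status = "ok" if as_tuple(installed) >= as_tuple(floor) else f"needs >= {floor}"
    print(f"{package}: {installed} ({status})")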
6 changes: 3 additions & 3 deletions replicantx/cli.py
@@ -73,7 +73,7 @@ def run(
False, "--ci", help="CI mode: exit with non-zero code if any tests fail"
),
verbose: bool = typer.Option(
False, "--verbose", "-v", help="Enable verbose output"
False, "--verbose", help="Enable verbose output"
),
debug: bool = typer.Option(
False, "--debug", help="Enable debug mode: Shows detailed technical information including HTTP client setup, request payloads, response validation, AI processing, and assertion results. Perfect for troubleshooting failed tests and performance analysis."
@@ -266,7 +266,7 @@ async def run_scenarios_sequential(
if config.level == TestLevel.BASIC:
runner = BasicScenarioRunner(config, debug=debug, watch=watch)
elif config.level == TestLevel.AGENT:
runner = AgentScenarioRunner(config, debug=debug, watch=watch)
runner = AgentScenarioRunner(config, debug=debug, watch=watch, verbose=verbose)
else:
raise ValueError(f"Unsupported test level: {config.level}")

@@ -400,7 +400,7 @@ async def _execute_scenario(
if config.level == TestLevel.BASIC:
runner = BasicScenarioRunner(config, debug=debug, watch=watch)
elif config.level == TestLevel.AGENT:
runner = AgentScenarioRunner(config, debug=debug, watch=watch)
runner = AgentScenarioRunner(config, debug=debug, watch=watch, verbose=verbose)
else:
raise ValueError(f"Unsupported test level: {config.level}")

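The net effect of the cli.py changes is that the existing --verbose flag (which loses its -v shorthand here) is now forwarded to the Level 2 runner only. Below is a minimal sketch of that wiring, simplified from the two hunks above; the import paths for ScenarioConfig, TestLevel, and BasicScenarioRunner are assumptions, and only the runner construction mirrors the diff.

# Simplified sketch of runner selection after this change; import paths other
# than replicantx.scenarios.agent are assumed, not taken from the diff.
from replicantx.models import ScenarioConfig, TestLevel            # assumed path
from replicantx.scenarios.agent import AgentScenarioRunner
from replicantx.scenarios.basic import BasicScenarioRunner         # assumed path

def build_runner(config: ScenarioConfig, debug: bool, watch: bool, verbose: bool):
    if config.level == TestLevel.BASIC:
        # The Level 1 runner is untouched by this PR and does not take verbose.
        return BasicScenarioRunner(config, debug=debug, watch=watch)
    if config.level == TestLevel.AGENT:
        # Only the agent runner receives the new flag.
        return AgentScenarioRunner(config, debug=debug, watch=watch, verbose=verbose)
    raise ValueError(f"Unsupported test level: {config.level}")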
8 changes: 5 additions & 3 deletions replicantx/scenarios/agent.py
@@ -27,18 +27,20 @@
class AgentScenarioRunner:
"""Runner for Replicant agent-driven (Level 2) test scenarios."""

def __init__(self, config: ScenarioConfig, debug: bool = False, watch: bool = False):
def __init__(self, config: ScenarioConfig, debug: bool = False, watch: bool = False, verbose: bool = False):
"""Initialize the agent scenario runner.

Args:
config: Scenario configuration with Replicant agent setup
debug: Enable debug mode with technical details
watch: Enable watch mode for real-time monitoring
verbose: Enable verbose output for system prompts
"""
self.config = config
self.debug = debug
self.watch = watch
self.console = Console() if (debug or watch) else None
self.verbose = verbose
self.console = Console() if (debug or watch or verbose) else None
self.auth_provider = self._create_auth_provider()
self.http_client: Optional[HTTPClient] = None
self.replicant_agent: Optional[ReplicantAgent] = None
@@ -147,7 +149,7 @@ async def run(self) -> ScenarioReport:
})

# Initialize Replicant agent
self.replicant_agent = ReplicantAgent.create(self.config.replicant)
self.replicant_agent = ReplicantAgent.create(self.config.replicant, verbose=self.verbose)

current_datetime = datetime.now()
date_str = current_datetime.strftime("%A, %B %d, %Y")
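With the constructor change above, the Rich Console is now created whenever any of debug, watch, or verbose is set, and the verbose flag is handed straight to ReplicantAgent.create. A hedged usage sketch follows; how the ScenarioConfig gets loaded is left open, and only the runner call itself reflects the diff.

# Usage sketch: running a Level 2 scenario with verbose prompt echoing.
import asyncio
from replicantx.scenarios.agent import AgentScenarioRunner

async def run_with_verbose(config) -> None:
    # config: a ScenarioConfig whose level is AGENT, loaded elsewhere.
    runner = AgentScenarioRunner(config, debug=False, watch=False, verbose=True)
    # verbose=True means the runner builds a Console and calls
    # ReplicantAgent.create(config.replicant, verbose=True), so every system
    # prompt and goal-evaluation prompt is printed before the LLM call.
    report = await runner.run()
    print(report)

# asyncio.run(run_with_verbose(config))  # once a ScenarioConfig has been loaded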
43 changes: 36 additions & 7 deletions replicantx/scenarios/replicant.py
@@ -39,9 +39,10 @@ class GoalEvaluator(BaseModel):
model_name: Optional[str] = Field(None, description="Model for intelligent evaluation")
custom_prompt: Optional[str] = Field(None, description="Custom evaluation prompt")
completion_keywords: List[str] = Field(..., description="Keywords for keyword-based evaluation")
verbose: bool = Field(False, description="Enable verbose output for system prompts")

@classmethod
def create(cls, config: ReplicantConfig) -> "GoalEvaluator":
def create(cls, config: ReplicantConfig, verbose: bool = False) -> "GoalEvaluator":
"""Create a GoalEvaluator from ReplicantConfig.

Args:
@@ -56,7 +57,8 @@ def create(cls, config: ReplicantConfig) -> "GoalEvaluator":
mode=config.goal_evaluation_mode,
model_name=model_name,
custom_prompt=config.goal_evaluation_prompt,
completion_keywords=config.completion_keywords
completion_keywords=config.completion_keywords,
verbose=verbose
)

async def evaluate_goal_completion(
@@ -146,12 +148,24 @@ async def _evaluate_with_llm(

# Create LLM agent for evaluation
model = infer_model(self.model_name)
# Only include max_tokens for evaluation - don't set temperature to avoid compatibility issues
agent = PydanticAgent(
model=model,
instructions="You are an expert at evaluating whether conversation goals have been achieved. Be precise and analytical.",
model_settings={"temperature": 0.1, "max_tokens": 200} # Low temperature for consistency
model_settings={"max_tokens": 1000} # Only include max_tokens, skip temperature for compatibility
)

# Verbose logging of the goal evaluation prompt
if self.verbose:
print("\n" + "="*80)
print("🔍 VERBOSE: GOAL EVALUATION PROMPT SENT TO PYDANTICAI")
print("="*80)
print(f"Model: {self.model_name}")
print(f"Model Settings: {{'max_tokens': 200}}")
print(f"Instructions: You are an expert at evaluating whether conversation goals have been achieved. Be precise and analytical.")
print(f"Prompt: {prompt}")
print("="*80 + "\n")

# Get evaluation
result = await agent.run(prompt)
response = result.output.strip()
@@ -327,6 +341,7 @@ class ResponseGenerator(BaseModel):
system_prompt: str = Field(..., description="System prompt for response generation")
model_settings: Dict[str, Any] = Field(default_factory=dict, description="Model settings")
facts: Dict[str, Any] = Field(..., description="Available facts")
verbose: bool = Field(False, description="Enable verbose output for system prompts")

def _create_agent(self) -> PydanticAgent:
"""Create a PydanticAI agent instance."""
@@ -374,6 +389,18 @@ async def generate_response(self, api_message: str, conversation_state: Conversa

# Create and use PydanticAI agent
agent = self._create_agent()

# Verbose logging of the complete system prompt
if self.verbose:
print("\n" + "="*80)
print("🔍 VERBOSE: COMPLETE SYSTEM PROMPT SENT TO PYDANTICAI")
print("="*80)
print(f"Model: {self.model_name}")
print(f"Model Settings: {self.model_settings}")
print(f"System Prompt: {self.system_prompt}")
print(f"Context: {context}")
print("="*80 + "\n")

result = await agent.run(context)

return result.output
@@ -418,16 +445,17 @@ class ReplicantAgent(BaseModel):
goal_evaluator: GoalEvaluator = Field(..., description="Goal evaluation utility")

@classmethod
def create(cls, config: ReplicantConfig) -> "ReplicantAgent":
def create(cls, config: ReplicantConfig, verbose: bool = False) -> "ReplicantAgent":
"""Create a new Replicant agent.

Args:
config: Replicant configuration
verbose: Enable verbose output for system prompts

Returns:
Configured Replicant agent
"""
# Build model settings
# Build model settings - only include parameters that are explicitly provided
model_settings = {}
if config.llm.temperature is not None:
model_settings["temperature"] = config.llm.temperature
@@ -438,10 +466,11 @@ def create(cls, config: ReplicantConfig) -> "ReplicantAgent":
model_name=config.llm.model,
system_prompt=config.system_prompt,
model_settings=model_settings,
facts=config.facts
facts=config.facts,
verbose=verbose
)

goal_evaluator = GoalEvaluator.create(config)
goal_evaluator = GoalEvaluator.create(config, verbose=verbose)

return cls(
config=config,
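Taken together, the replicant.py changes add a verbose field to both GoalEvaluator and ResponseGenerator, thread it through the two factory methods, and drop the fixed temperature from the evaluation agent (only max_tokens is passed now, to sidestep provider compatibility issues). Below is a condensed sketch of the propagation path, trimmed to the fields this PR touches; the response_generator keyword on the final constructor call is assumed from context rather than visible in the diff.

# Condensed sketch of ReplicantAgent.create after this PR, trimmed to what the
# diff shows; only explicitly configured model settings are forwarded.
def create_replicant_agent(config, verbose=False):
    model_settings = {}
    if config.llm.temperature is not None:
        model_settings["temperature"] = config.llm.temperature   # omitted when unset

    response_generator = ResponseGenerator(
        model_name=config.llm.model,
        system_prompt=config.system_prompt,
        model_settings=model_settings,
        facts=config.facts,
        verbose=verbose,      # echoes the full system prompt and context before agent.run()
    )
    goal_evaluator = GoalEvaluator.create(config, verbose=verbose)  # echoes evaluation prompts
    return ReplicantAgent(
        config=config,
        response_generator=response_generator,  # keyword name assumed, not shown in the diff
        goal_evaluator=goal_evaluator,
    )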