diff --git a/examples/deepeval/README.md b/examples/deepeval/README.md new file mode 100644 index 00000000..25cb6fc5 --- /dev/null +++ b/examples/deepeval/README.md @@ -0,0 +1,560 @@ +# DeepEval GEval Evaluator Example + +This example demonstrates how to extend the agent-control `Evaluator` base class to create custom evaluators using external libraries like [DeepEval](https://deepeval.com). + +## Overview + +DeepEval's GEval is an LLM-as-a-judge metric that uses chain-of-thoughts (CoT) to evaluate LLM outputs based on custom criteria. This example shows how to: + +1. **Extend the base Evaluator class** - Create a custom evaluator by implementing the required interface +2. **Configure evaluation criteria** - Define custom quality metrics (coherence, relevance, correctness, etc.) +3. **Register via entry points** - Make the evaluator discoverable by the agent-control server +4. **Integrate with agent-control** - Use the evaluator in controls to enforce quality standards + +## Architecture + +``` +examples/deepeval/ +├── __init__.py # Package initialization +├── config.py # DeepEvalEvaluatorConfig - Configuration model +├── evaluator.py # DeepEvalEvaluator - Main evaluator implementation +├── qa_agent.py # Q&A agent with DeepEval controls +├── setup_controls.py # Setup script to create controls on server +├── start_server_with_evaluator.sh # Helper script to start server with evaluator +├── pyproject.toml # Project config with entry point and dependencies +└── README.md # This file +``` + +**Package Structure Notes:** +- Uses a **flat layout** with Python files at the root (configured via `packages = ["."]` in pyproject.toml) +- Modules use **absolute imports** (e.g., `from config import X`) rather than relative imports +- Entry point `evaluator:DeepEvalEvaluator` references the module directly +- Install with `uv pip install -e .` to register the entry point for server discovery + +### Key Components + +1. 
**DeepEvalEvaluatorConfig** ([config.py](config.py)) + - Pydantic model defining configuration options + - Based on DeepEval's GEval API parameters + - Validates that either `criteria` or `evaluation_steps` is provided + +2. **DeepEvalEvaluator** ([evaluator.py](evaluator.py)) + - Extends `Evaluator[DeepEvalEvaluatorConfig]` + - Implements the `evaluate()` method + - Registered with `@register_evaluator` decorator + - Handles LLMTestCase creation and metric execution + +3. **Q&A Agent Demo** ([qa_agent.py](qa_agent.py)) + - Complete working agent with DeepEval quality controls + - Uses `@control()` decorator for automatic evaluation + - Demonstrates handling `ControlViolationError` + +4. **Setup Script** ([setup_controls.py](setup_controls.py)) + - Creates agent and registers with server + - Configures DeepEval-based controls + - Creates 3 quality controls (coherence, relevance, correctness) + +5. **Entry Point Registration** ([pyproject.toml](pyproject.toml)) + - Registers evaluator with server via `project.entry-points` + - Depends on `agent-control-models>=3.0.0` and `agent-control-sdk>=3.0.0` + - In monorepo: uses workspace dependencies (editable installs) + - For third-party: can use published PyPI packages + - Enables automatic discovery when server starts + +## How It Works + +### 1. 
Extending the Evaluator Base Class + +The evaluator follows the standard pattern for all agent-control evaluators: + +```python +from agent_control_models import Evaluator, EvaluatorMetadata, register_evaluator + +@register_evaluator +class DeepEvalEvaluator(Evaluator[DeepEvalEvaluatorConfig]): + # Define metadata + metadata = EvaluatorMetadata( + name="deepeval-geval", + version="1.0.0", + description="DeepEval GEval custom LLM-based evaluator", + requires_api_key=True, + timeout_ms=30000, + ) + + # Define config model + config_model = DeepEvalEvaluatorConfig + + # Implement evaluate method + async def evaluate(self, data: Any) -> EvaluatorResult: + # matched=True triggers the deny action when quality fails + # matched=False allows the request when quality passes + return EvaluatorResult( + matched=not is_successful, # Trigger when quality fails + confidence=score, + message=reason, + ) +``` + +### 2. Entry Point Registration + +The evaluator is registered via `pyproject.toml`: + +```toml +[project] +dependencies = [ + "agent-control-models>=3.0.0", + "agent-control-sdk>=3.0.0", + "deepeval>=1.0.0", + # ... other dependencies +] + +[project.entry-points."agent_control.evaluators"] +deepeval-geval = "evaluator:DeepEvalEvaluator" +``` + +This makes the evaluator automatically discoverable by the server when it starts. The pattern works with both workspace dependencies (for monorepo development) and published PyPI packages (for third-party evaluators). + +### 3. 
Configuration + +DeepEval's GEval supports two modes: + +**With Criteria** (auto-generates evaluation steps): +```python +config = DeepEvalEvaluatorConfig( + name="Coherence", + criteria="Evaluate whether the response is coherent and logically consistent.", + evaluation_params=["input", "actual_output"], + threshold=0.6, +) +``` + +**With Explicit Steps**: +```python +config = DeepEvalEvaluatorConfig( + name="Correctness", + evaluation_steps=[ + "Check whether facts in actual output contradict expected output", + "Heavily penalize omission of critical details", + "Minor wording differences are acceptable" + ], + evaluation_params=["input", "actual_output", "expected_output"], + threshold=0.7, +) +``` + +### 4. Using in Control Definitions + +Once registered, the evaluator can be used in control definitions: + +```python +control_definition = { + "name": "check-coherence", + "description": "Ensures responses are coherent and logically consistent", + "definition": { + "description": "Ensures responses are coherent", + "enabled": True, + "execution": "server", + "scope": {"stages": ["post"]}, # Apply to all steps at post stage + "selector": {}, # Pass full data (input + output) + "evaluator": { + "name": "deepeval-geval", # From metadata.name + "config": { + "name": "Coherence", + "criteria": "Evaluate whether the response is coherent", + "evaluation_params": ["input", "actual_output"], + "threshold": 0.6, + "model": "gpt-4o", + }, + }, + "action": { + "decision": "deny", + "message": "Response failed coherence check", + }, + }, +} +``` + +**Key points:** +- `execution: "server"` - Required field +- `scope: {"stages": ["post"]}` - Apply to all function calls at post stage +- `selector: {}` - Pass full data so evaluator gets both input and output +- `evaluation_params: ["input", "actual_output"]` - Both fields required for relevance checks + +## Getting Started from Fresh Clone + +This example demonstrates **custom evaluator development** within the agent-control 
monorepo. It uses workspace dependencies (editable installs) to work with the latest development versions of: +- `agent-control-models` - Base evaluator classes and types +- `agent-control-sdk` - Agent Control SDK for integration +- `deepeval` - DeepEval evaluation framework + +**Note:** This is a **development/monorepo example** showing the evaluator architecture. + +### 1. Clone Repository + +```bash +# Clone the repository +git clone https://github.com/agentcontrol/agent-control.git +cd agent-control +``` + +### 2. Start Database and Server + +```bash +# Start PostgreSQL database and run migrations +cd server && docker-compose up -d && make alembic-upgrade && cd .. + +# Start the agent-control server (from repository root) +make server-run +``` + +The server will be running at `http://localhost:8000`. + +### 3. Install DeepEval Example + +```bash +# Navigate to the DeepEval example directory +cd examples/deepeval + +# Install dependencies +uv sync + +# Install the evaluator package itself in editable mode +uv pip install -e . +``` + +This installs: +- **Dependencies**: `deepeval>=1.0.0`, `openai>=1.0.0`, `pydantic>=2.0.0`, etc. +- **Workspace packages** (as editable installs): `agent-control-models`, `agent-control-sdk` +- **This evaluator package** in editable mode, which registers the entry point for server discovery + +The entry point `deepeval-geval = "evaluator:DeepEvalEvaluator"` makes the evaluator discoverable by the server. + +### 4. Set Environment Variables + +```bash +# Required for DeepEval GEval (uses OpenAI models) +export OPENAI_API_KEY="your-openai-api-key" + +# Optional: Disable DeepEval telemetry +export DEEPEVAL_TELEMETRY_OPT_OUT="true" +``` + +### 5. 
Restart Server + +After installing the DeepEval example, restart the server so it can discover the new evaluator: + +```bash +# Stop the server (Ctrl+C) and restart +cd ../../ # Back to repository root +make server-run +``` + +Verify the evaluator is registered: +```bash +curl http://localhost:8000/api/v1/evaluators | grep deepeval-geval +``` + +### 6. Setup Agent and Controls + +```bash +cd examples/deepeval +uv run setup_controls.py +``` + +This creates the agent registration and three quality controls (coherence, relevance, correctness). + +### 7. Run the Q&A Agent + +```bash +uv run qa_agent.py +``` + +Try asking questions like "What is Python?" or test the controls with "Tell me about something trigger_irrelevant". + +--- + +## Testing the Agent + +### Interactive Commands + +Once the agent is running, try these commands: + +``` +You: What is Python? +You: What is the capital of France? +You: Test trigger_incoherent response please +You: Tell me about something trigger_irrelevant +You: /test-good # Test with quality questions +You: /test-bad # Test quality control triggers +You: /help # Show all commands +You: /quit # Exit +``` + +The agent will: +- Accept questions with coherent, relevant responses +- Block questions that produce incoherent or irrelevant responses +- Show which control triggered when quality checks fail + +### What to Expect + +**Good Quality Responses** (Pass controls): +``` +You: What is Python? +Agent: Python is a high-level, interpreted programming language known for its + simplicity and readability. It was created by Guido van Rossum and first + released in 1991. Python supports multiple programming paradigms... +``` + +**Poor Quality Responses** (Blocked by controls): +``` +You: Test trigger_incoherent response please +⚠️ Quality control triggered: check-coherence + Reason: Response failed coherence check + +Agent: I apologize, but my response didn't meet quality standards. + Could you rephrase your question or ask something else? 
+``` + +The DeepEval controls evaluate responses in real-time and block those that don't meet quality thresholds. + +## Evaluation Parameters + +DeepEval supports multiple test case parameters: + +- `input` - The user query or prompt +- `actual_output` - The LLM's generated response +- `expected_output` - Reference/ground truth answer +- `context` - Additional context for evaluation +- `retrieval_context` - Retrieved documents (for RAG) +- `tools_called` - Tools invoked by the agent +- `expected_tools` - Expected tool usage +- Plus MCP-related parameters + +Configure which parameters to use via the `evaluation_params` config field. + +**Important:** For relevance checks, always include both `input` and `actual_output` so the evaluator can compare the question with the answer. + +## For Third-Party Developers + +This example shows the **evaluator architecture** for extending agent-control. While this specific example is set up for monorepo development, the same pattern works for third-party evaluators using published packages. + +To create your own evaluator: + +1. **Extend the Evaluator base class** from `agent-control-models` (published on PyPI) +2. **Define a configuration model** using Pydantic +3. **Register via entry points** in your `pyproject.toml` +4. **Install your package** so the server can discover the entry point +5. **Restart the server** to load the new evaluator + +For standalone packages outside the monorepo, use published versions: +```toml +[project] +dependencies = [ + "agent-control-models>=3.0.0", # From PyPI + "agent-control-sdk>=3.0.0", # From PyPI + "your-evaluation-library>=1.0.0" +] +``` + +See the [Extending This Example](#extending-this-example) section below for the complete pattern. + +### Production Deployment + +For production deployments, build your evaluator as a Python wheel and install it on your agent-control server: + +**Development (this example):** +```bash +uv pip install -e . 
# Editable install for development +``` + +**Production:** +```bash +python -m build # Creates dist/*.whl +# Install wheel on production server where agent-control runs +``` + +**Deployment Options:** + +1. **Self-Hosted Server (Full Control)** + - Deploy your own agent-control server instance + - Install custom evaluator packages (wheel, source, or private PyPI) + - Your agents connect to this server via the SDK + - Complete control over evaluators and policies + +2. **Managed Service (If Available)** + - Use a hosted agent-control service + - May require coordination to install custom evaluators + - Or use only built-in/approved evaluators + +In both cases, evaluators run **server-side** (`execution: "server"`), so your agent applications only need the lightweight SDK installed. The evaluator package must be installed where the agent-control server runs, not in your agent application. + +## Extending This Example + +### Creating Your Own Custom Evaluator + +Follow this pattern to create evaluators for other libraries: + +1. **Define a Config Model** + ```python + from pydantic import BaseModel + + class MyEvaluatorConfig(BaseModel): + threshold: float = 0.5 + # Your config fields + ``` + +2. **Implement the Evaluator** + ```python + from agent_control_models import Evaluator, EvaluatorMetadata, register_evaluator + + @register_evaluator + class MyEvaluator(Evaluator[MyEvaluatorConfig]): + metadata = EvaluatorMetadata(name="my-evaluator", ...) + config_model = MyEvaluatorConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + score = # Your evaluation logic + return EvaluatorResult( + matched=score < self.config.threshold, # Trigger when fails + confidence=score, + ) + ``` + +3. **Register via Entry Point** + ```toml + [project.entry-points."agent_control.evaluators"] + my-evaluator = "evaluator:MyEvaluator" + ``` + +4. 
**Install and Use** + ```bash + uv sync # Server will discover it automatically + ``` + +### Adding More GEval Metrics + +You can create specialized evaluators for specific use cases: + +- **Bias Detection**: Evaluate responses for bias or fairness +- **Safety**: Check for harmful or unsafe content +- **Style Compliance**: Ensure responses match brand guidelines +- **Technical Accuracy**: Validate technical correctness +- **Tone Assessment**: Evaluate emotional tone and sentiment + +## Resources + +- **DeepEval Documentation**: https://deepeval.com/docs/metrics-llm-evals +- **G-Eval Guide**: https://www.confident-ai.com/blog/g-eval-the-definitive-guide +- **Agent Control Evaluators**: [Base evaluator class](../../models/src/agent_control_models/evaluator.py) +- **CrewAI Example**: [Using agent-control as a consumer](../crewai/) + +## Key Takeaways + +1. **Entry Points are Critical**: The server discovers evaluators via `project.entry-points`, not PYTHONPATH +2. **Extensibility**: The `Evaluator` base class makes it easy to integrate any evaluation library +3. **Configuration**: Pydantic models provide type-safe, validated configuration +4. **Registration**: The `@register_evaluator` decorator handles registration automatically +5. **Integration**: Evaluators work seamlessly with agent-control's policy system +6. **Control Logic**: `matched=True` triggers the action (deny/allow), so invert when quality passes + +## Troubleshooting + +### Controls not triggering + +- Check that `execution: "server"` is in control definition +- Use `scope: {"stages": ["post"]}` instead of `step_types` +- Use empty selector `{}` to pass full data (input + output) +- Restart server after evaluator code changes + +### Evaluator not found + +The server couldn't discover the evaluator. Check: + +1. **Entry point registration** in `pyproject.toml`: + ```toml + [project.entry-points."agent_control.evaluators"] + deepeval-geval = "evaluator:DeepEvalEvaluator" + ``` + +2. 
**Package is installed**: + ```bash + cd examples/deepeval + uv sync # Install dependencies + uv pip install -e . # Install this package + ``` + +3. **Server was restarted** after package installation: + ```bash + # Stop server (Ctrl+C), then restart + make server-run + ``` + +4. **Verify registration**: + ```bash + curl http://localhost:8000/api/v1/evaluators | grep deepeval-geval + ``` + +5. **Check server logs** for evaluator discovery messages during startup + +### Wrong evaluation results + +- For relevance: include both `input` and `actual_output` in `evaluation_params` +- Check that `matched` logic is inverted (trigger when quality fails) +- Raise the threshold to be more strict (0.7 instead of 0.5) + +### Import errors: "cannot import name 'X'" + +If you see import errors like `ImportError: cannot import name 'AgentRef'`: + +1. **Stale editable install**: Reinstall the package + ```bash + uv pip install -e /path/to/package --force-reinstall --no-deps + ``` + +2. **For agent-control-models specifically**: + ```bash + uv pip install -e ../../models --force-reinstall --no-deps + ``` + +3. **Clear Python cache** if issues persist: + ```bash + find . -name "*.pyc" -delete + find . -name "__pycache__" -type d -exec rm -rf {} + + ``` + +4. **Verify installation**: + ```bash + python -c "from agent_control_models.server import AgentRef; print('Success')" + ``` + +### Package not discoverable: "attempted relative import" + +If you see `attempted relative import with no known parent package`: + +1. **Ensure the package is installed**: + ```bash + cd examples/deepeval + uv pip install -e . + ``` + +2. **Verify entry point registration**: + ```bash + uv pip show agent-control-deepeval-example + ``` + +3. 
**Check pyproject.toml has**: + ```toml + [tool.hatch.build.targets.wheel] + packages = ["."] + ``` + +### DeepEval telemetry files + +- DeepEval creates a `.deepeval/` directory with telemetry files in the working directory +- When the evaluator runs on the server, files appear in `server/.deepeval/` +- These files don't need to be committed (add `.deepeval/` to `.gitignore`) +- To disable telemetry: set environment variable `DEEPEVAL_TELEMETRY_OPT_OUT="true"` + +## License + +This example is part of the agent-control project. diff --git a/examples/deepeval/__init__.py b/examples/deepeval/__init__.py new file mode 100644 index 00000000..77d5fbc6 --- /dev/null +++ b/examples/deepeval/__init__.py @@ -0,0 +1,14 @@ +"""DeepEval GEval evaluator example. + +This module demonstrates how to extend the base Evaluator class to create +custom evaluators using external libraries like DeepEval. +""" + +from config import DeepEvalEvaluatorConfig, DeepEvalTestCaseParam +from evaluator import DeepEvalEvaluator + +__all__ = [ + "DeepEvalEvaluator", + "DeepEvalEvaluatorConfig", + "DeepEvalTestCaseParam", +] diff --git a/examples/deepeval/config.py b/examples/deepeval/config.py new file mode 100644 index 00000000..5c592667 --- /dev/null +++ b/examples/deepeval/config.py @@ -0,0 +1,138 @@ +"""Configuration models for DeepEval GEval evaluator. + +Based on DeepEval's GEval metric: https://deepeval.com/docs/metrics-llm-evals +""" + +from typing import Any, Literal + +from pydantic import BaseModel, Field, model_validator + + +# DeepEval's LLMTestCaseParams enum values +DeepEvalTestCaseParam = Literal[ + "input", + "actual_output", + "expected_output", + "context", + "retrieval_context", + "tools_called", + "expected_tools", + "mcp_servers", + "mcp_tools_called", + "mcp_resources_called", + "mcp_prompts_called", +] + + +class DeepEvalEvaluatorConfig(BaseModel): + """Configuration for DeepEval GEval evaluator. 
+ + DeepEval's GEval uses LLM-as-a-judge with chain-of-thoughts (CoT) to evaluate + LLM outputs based on custom criteria. It's capable of evaluating almost any + use case with human-like accuracy. + + Example (with criteria): + ```python + config = DeepEvalEvaluatorConfig( + name="Correctness", + criteria="Determine if the actual output is correct based on the expected output.", + evaluation_params=["actual_output", "expected_output"], + threshold=0.5, + ) + ``` + + Example (with evaluation_steps): + ```python + config = DeepEvalEvaluatorConfig( + name="Correctness", + evaluation_steps=[ + "Check whether facts in actual output contradict expected output", + "Heavily penalize omission of detail", + "Vague language or contradicting opinions are acceptable" + ], + evaluation_params=["actual_output", "expected_output"], + threshold=0.5, + ) + ``` + """ + + name: str = Field( + description="Name identifier for the custom metric (e.g., 'Correctness', 'Relevance')" + ) + + criteria: str | None = Field( + default=None, + description="Description outlining the specific evaluation aspects. Either provide criteria OR evaluation_steps, not both.", + ) + + evaluation_steps: list[str] | None = Field( + default=None, + description="Specific steps the LLM should follow during evaluation. If omitted with criteria, will be auto-generated. Either provide criteria OR evaluation_steps, not both.", + ) + + evaluation_params: list[DeepEvalTestCaseParam] = Field( + description="List of test case parameters to include in evaluation (e.g., ['input', 'actual_output'])" + ) + + threshold: float = Field( + default=0.5, + ge=0.0, + le=1.0, + description="Passing threshold (0-1). Metric is successful if score >= threshold.", + ) + + model: str = Field( + default="gpt-4o", + description="GPT model to use for evaluation (e.g., 'gpt-4o', 'gpt-4-turbo', 'gpt-3.5-turbo')", + ) + + strict_mode: bool = Field( + default=False, + description="If True, enforces binary scoring (0 or 1). 
If False, returns scores in 0-1 range.", + ) + + async_mode: bool = Field( + default=True, + description="Enable concurrent execution for better performance.", + ) + + verbose_mode: bool = Field( + default=False, + description="Print intermediate calculation steps for debugging.", + ) + + timeout_ms: int = Field( + default=30000, + ge=1000, + le=120000, + description="Request timeout in milliseconds (1-120 seconds)", + ) + + on_error: Literal["allow", "deny"] = Field( + default="allow", + description="Action on error: 'allow' (fail open) or 'deny' (fail closed)", + ) + + metadata: dict[str, Any] | None = Field( + default=None, + description="Additional metadata for logging/tracking", + ) + + @model_validator(mode="after") + def validate_criteria_or_steps(self) -> "DeepEvalEvaluatorConfig": + """Validate that either criteria or evaluation_steps is provided, but not both.""" + has_criteria = self.criteria is not None + has_steps = self.evaluation_steps is not None and len(self.evaluation_steps) > 0 + + if not has_criteria and not has_steps: + raise ValueError( + "Either 'criteria' or 'evaluation_steps' must be provided" + ) + + if has_criteria and has_steps: + raise ValueError( + "Provide either 'criteria' OR 'evaluation_steps', not both. " + "If you provide criteria, evaluation_steps will be auto-generated." + ) + + return self diff --git a/examples/deepeval/evaluator.py b/examples/deepeval/evaluator.py new file mode 100644 index 00000000..53e46938 --- /dev/null +++ b/examples/deepeval/evaluator.py @@ -0,0 +1,294 @@ +"""DeepEval GEval evaluator implementation. + +This evaluator demonstrates how to extend the base Evaluator class to integrate +DeepEval's GEval metric for custom LLM-based evaluations. 
+ +Based on DeepEval documentation: https://deepeval.com/docs/metrics-llm-evals +""" + +import logging +from typing import Any + +from agent_control_models import ( + Evaluator, + EvaluatorMetadata, + EvaluatorResult, + register_evaluator, +) + +from config import DeepEvalEvaluatorConfig + +logger = logging.getLogger(__name__) + +# Check if deepeval is available +try: + from deepeval.metrics import GEval + from deepeval.test_case import LLMTestCase, LLMTestCaseParams + + DEEPEVAL_AVAILABLE = True +except ImportError: + DEEPEVAL_AVAILABLE = False + GEval = None # type: ignore + LLMTestCase = None # type: ignore + LLMTestCaseParams = None # type: ignore + + +@register_evaluator +class DeepEvalEvaluator(Evaluator[DeepEvalEvaluatorConfig]): + """DeepEval GEval evaluator for custom LLM-based evaluations. + + This evaluator uses DeepEval's GEval metric, which leverages LLM-as-a-judge + with chain-of-thoughts (CoT) to evaluate LLM outputs based on custom criteria. + + Features: + - Custom evaluation criteria or step-by-step evaluation logic + - Multiple test case parameters (input, output, context, etc.) + - Configurable LLM model for judging + - Binary or continuous scoring modes + - Automatic chain-of-thought generation + + Example: + ```python + from examples.deepeval import DeepEvalEvaluator, DeepEvalEvaluatorConfig + + # Create config + config = DeepEvalEvaluatorConfig( + name="Coherence", + criteria="Determine if the response is coherent and logically consistent.", + evaluation_params=["actual_output"], + threshold=0.7, + model="gpt-4o", + ) + + # Create evaluator + evaluator = DeepEvalEvaluator(config) + + # Evaluate + result = await evaluator.evaluate({ + "actual_output": "The sky is blue because of Rayleigh scattering." + }) + ``` + + Environment Variables: + OPENAI_API_KEY: Required for GPT model usage. 
+ """ + + metadata = EvaluatorMetadata( + name="deepeval-geval", + version="1.0.0", + description="DeepEval GEval custom LLM-based evaluator", + requires_api_key=True, + timeout_ms=30000, + ) + config_model = DeepEvalEvaluatorConfig + + @classmethod + def is_available(cls) -> bool: + """Check if deepeval dependency is installed.""" + return DEEPEVAL_AVAILABLE + + def __init__(self, config: DeepEvalEvaluatorConfig) -> None: + """Initialize DeepEval evaluator with configuration. + + Args: + config: Validated DeepEvalEvaluatorConfig instance. + + Raises: + ValueError: If required configuration is invalid. + """ + super().__init__(config) + + # Create the GEval metric instance (immutable, safe for instance caching) + self._metric = self._create_geval_metric() + + def _create_geval_metric(self) -> Any: + """Create and configure the GEval metric. + + Returns: + Configured GEval metric instance. + """ + # Convert string evaluation params to LLMTestCaseParams enum + evaluation_params = [ + getattr(LLMTestCaseParams, param.upper()) + for param in self.config.evaluation_params + ] + + # Build GEval kwargs + geval_kwargs = { + "name": self.config.name, + "evaluation_params": evaluation_params, + "threshold": self.config.threshold, + "model": self.config.model, + "strict_mode": self.config.strict_mode, + "async_mode": self.config.async_mode, + "verbose_mode": self.config.verbose_mode, + } + + # Add either criteria or evaluation_steps + if self.config.criteria: + geval_kwargs["criteria"] = self.config.criteria + elif self.config.evaluation_steps: + geval_kwargs["evaluation_steps"] = self.config.evaluation_steps + + logger.debug(f"[DeepEval] Creating GEval metric with config: {geval_kwargs}") + return GEval(**geval_kwargs) + + async def evaluate(self, data: Any) -> EvaluatorResult: + """Evaluate data using DeepEval GEval. + + Args: + data: The data to evaluate. Should be a dict with keys matching + the evaluation_params (e.g., {"actual_output": "text"}). 
+ + Returns: + EvaluatorResult with matched status and metadata. + """ + try: + logger.debug(f"[DeepEval] Evaluating data: {data}") + + # Prepare test case from data + test_case = self._prepare_test_case(data) + + # Run the GEval metric + if self.config.async_mode: + await self._metric.a_measure(test_case) + else: + self._metric.measure(test_case) + + # Parse the results + result = self._parse_metric_result() + + logger.debug( + f"[DeepEval] Evaluation complete: matched={result.matched}, " + f"score={result.confidence}, reason={result.message}" + ) + + return result + + except Exception as e: + logger.error(f"DeepEval evaluation error: {e}", exc_info=True) + return self._handle_error(e) + + def _prepare_test_case(self, data: Any) -> Any: + """Prepare LLMTestCase from input data. + + Args: + data: Input data dict with test case parameters. + May contain: input, output, question, actual_output, etc. + + Returns: + LLMTestCase instance. + """ + # Handle both dict and string inputs + if isinstance(data, str): + # If data is a string, treat it as actual_output by default + data = {"actual_output": data} + elif not isinstance(data, dict): + data = {"actual_output": str(data)} + + # Map agent-control data structure to DeepEval LLMTestCase parameters + # Agent-control may provide: {"input": {...}, "output": "..."} + # DeepEval expects: {"input": "...", "actual_output": "..."} + mapped_data = {} + + # Handle output mapping + if "actual_output" in data: + mapped_data["actual_output"] = data["actual_output"] + elif "output" in data: + mapped_data["actual_output"] = data["output"] + + # Handle input mapping + if "input" in data: + input_val = data["input"] + # If input is a dict (e.g., function arguments), extract the question + if isinstance(input_val, dict): + # Try common field names + mapped_data["input"] = ( + input_val.get("question") or + input_val.get("query") or + input_val.get("prompt") or + str(input_val) + ) + else: + mapped_data["input"] = str(input_val) + elif 
"question" in data: + mapped_data["input"] = data["question"] + + # Handle other DeepEval parameters + for key in ["expected_output", "context", "retrieval_context", "tools"]: + if key in data: + mapped_data[key] = data[key] + + # Build test case kwargs + # Note: LLMTestCase requires 'input' and 'actual_output' as mandatory fields + # So we always provide them, even if not in evaluation_params + test_case_kwargs = {} + + # Always include mandatory fields for LLMTestCase + test_case_kwargs["input"] = mapped_data.get("input", "") + test_case_kwargs["actual_output"] = mapped_data.get("actual_output", "") + + # Add any additional params from evaluation_params + for param in self.config.evaluation_params: + if param not in test_case_kwargs: # Skip if already added above + if param in mapped_data: + test_case_kwargs[param] = mapped_data[param] + else: + logger.warning(f"[DeepEval] Missing parameter '{param}', using empty string") + test_case_kwargs[param] = "" + + logger.debug(f"[DeepEval] Original data keys: {list(data.keys())}") + logger.debug(f"[DeepEval] Mapped data keys: {list(mapped_data.keys())}") + logger.debug(f"[DeepEval] Test case kwargs: {test_case_kwargs}") + return LLMTestCase(**test_case_kwargs) + + def _parse_metric_result(self) -> EvaluatorResult: + """Parse GEval metric results into EvaluatorResult. + + Returns: + EvaluatorResult with evaluation results. 
+ """ + # Get score and reason from the metric + score = self._metric.score + reason = self._metric.reason + is_successful = self._metric.is_successful() + + # NOTE: matched=True means the control should trigger (block the request) + # In DeepEval, is_successful=True means quality is GOOD (score >= threshold) + # So we want to trigger (matched=True) when quality is BAD (not is_successful) + return EvaluatorResult( + matched=not is_successful, # Invert: trigger when quality fails + confidence=score if score is not None else 0.0, + message=reason if reason else f"GEval {self.config.name}: score={score}", + metadata={ + "metric_name": self.config.name, + "score": score, + "threshold": self.config.threshold, + "model": self.config.model, + "strict_mode": self.config.strict_mode, + **(self.config.metadata or {}), + }, + ) + + def _handle_error(self, error: Exception) -> EvaluatorResult: + """Handle errors from DeepEval evaluation. + + Args: + error: The exception that occurred. + + Returns: + EvaluatorResult indicating error state. 
+ """ + error_action = self.config.on_error + + return EvaluatorResult( + matched=(error_action == "deny"), # Fail closed if configured + confidence=0.0, + message=f"DeepEval evaluation error: {str(error)}", + metadata={ + "error": str(error), + "error_type": type(error).__name__, + "metric_name": self.config.name, + "fallback_action": error_action, + }, + ) diff --git a/examples/deepeval/pyproject.toml b/examples/deepeval/pyproject.toml new file mode 100644 index 00000000..cafca019 --- /dev/null +++ b/examples/deepeval/pyproject.toml @@ -0,0 +1,35 @@ +[project] +name = "agent-control-deepeval-example" +version = "0.1.0" +description = "Agent Control DeepEval GEval Custom Evaluator Example" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "deepeval>=1.0.0", + "openai>=1.0.0", + "pydantic>=2.0.0", + "httpx>=0.24.0", + "google-re2>=1.1", + "agent-control-models>=3.0.0", + "agent-control-sdk>=3.0.0", +] + +[project.optional-dependencies] +dev = [] + +[project.entry-points."agent_control.evaluators"] +deepeval-geval = "evaluator:DeepEvalEvaluator" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["."] + +[tool.ruff] +line-length = 100 +target-version = "py312" + +[tool.ruff.lint] +select = ["E", "F", "I"] diff --git a/examples/deepeval/qa_agent.py b/examples/deepeval/qa_agent.py new file mode 100755 index 00000000..63a205f7 --- /dev/null +++ b/examples/deepeval/qa_agent.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 +""" +Question Answering Agent with DeepEval Quality Controls + +This example demonstrates: +1. Using agent-control SDK with @control() decorator +2. DeepEval GEval evaluators for quality enforcement +3. 
Handling ControlViolationError gracefully + +The agent is protected by DeepEval-based controls that check: +- Response coherence (logical consistency) +- Answer relevance (stays on topic) +- Factual correctness (when expected outputs available) + +Usage: + # Setup first (creates controls on server) + python setup_controls.py + + # Then run the agent + python qa_agent.py + +Requirements: + - agent-control server running + - OPENAI_API_KEY set (for DeepEval) + - Controls configured via setup_controls.py +""" + +import asyncio +import os +import sys + +import agent_control +from agent_control import ControlViolationError, control + +# ============================================================================= +# SDK INITIALIZATION +# ============================================================================= + +agent_control.init( + agent_name="Q&A Agent with DeepEval", + agent_id="qa-agent-deepeval", + agent_description="Question answering agent with DeepEval quality controls", + agent_version="1.0.0", +) + + +# ============================================================================= +# MOCK LLM (Simulates various quality scenarios) +# ============================================================================= + + +class MockQASystem: + """ + Simulates a Q&A system with various response quality scenarios. + + This mock helps demonstrate how DeepEval controls catch quality issues: + - Coherent responses pass + - Incoherent responses are blocked + - Irrelevant responses are blocked + """ + + GOOD_RESPONSES = { + "python": ( + "Python is a high-level, interpreted programming language known for its " + "simplicity and readability. It was created by Guido van Rossum and first " + "released in 1991. Python supports multiple programming paradigms including " + "procedural, object-oriented, and functional programming." + ), + "capital": ( + "Paris is the capital and largest city of France. 
It is located in the " + "north-central part of the country along the River Seine. Paris has been " + "a major center of culture, art, and politics for centuries." + ), + "photosynthesis": ( + "Photosynthesis is the process by which plants convert light energy into " + "chemical energy. Using chlorophyll, plants absorb sunlight and combine " + "carbon dioxide from the air with water from the soil to produce glucose " + "and oxygen. This process is essential for life on Earth." + ), + "gravity": ( + "Gravity is a fundamental force of nature that attracts objects with mass " + "toward each other. On Earth, gravity gives objects weight and causes them " + "to fall toward the ground. The force of gravity was described by Newton's " + "laws and later refined by Einstein's theory of general relativity." + ), + } + + # Incoherent responses (logical inconsistencies, contradictions) + INCOHERENT_RESPONSES = { + "trigger_incoherent": ( + "Python is a snake. Also Python is not a snake. " + "It's both simultaneously. Yesterday is tomorrow. " + "The sky is made of cheese but also not cheese. " + "Numbers are letters and letters are numbers." + ), + } + + # Irrelevant responses (don't answer the question) + IRRELEVANT_RESPONSES = { + "trigger_irrelevant": ( + "Bananas are yellow fruits that grow on trees. " + "The weather today is sunny. I like pizza. " + "Dogs are mammals. The year has 12 months." 
+ ), + } + + @classmethod + def answer_question(cls, question: str) -> str: + """Generate an answer to the question.""" + question_lower = question.lower() + + # Check for test triggers + if "trigger_incoherent" in question_lower or "incoherent" in question_lower: + return cls.INCOHERENT_RESPONSES["trigger_incoherent"] + + if "trigger_irrelevant" in question_lower or "irrelevant" in question_lower: + return cls.IRRELEVANT_RESPONSES["trigger_irrelevant"] + + # Match question to good responses + if "python" in question_lower: + return cls.GOOD_RESPONSES["python"] + elif "capital" in question_lower and "france" in question_lower: + return cls.GOOD_RESPONSES["capital"] + elif "photosynthesis" in question_lower: + return cls.GOOD_RESPONSES["photosynthesis"] + elif "gravity" in question_lower: + return cls.GOOD_RESPONSES["gravity"] + else: + # Default educational response + return ( + f"That's an interesting question about '{question}'. " + "Based on general knowledge, I can provide information on this topic. " + "Would you like me to explain in more detail?" + ) + + +# ============================================================================= +# PROTECTED AGENT FUNCTION +# ============================================================================= + + +@control() +async def answer_question(question: str) -> str: + """ + Answer a question with quality controls. + + The @control() decorator: + - Checks 'pre' controls before generating (validates input) + - Checks 'post' controls after generating (validates output quality) + + DeepEval controls check: + - Coherence: Is the response logically consistent? + - Relevance: Does it address the question? + - Correctness: Is it factually accurate? (if enabled) + + If a control fails, ControlViolationError is raised. 
+ """ + response = MockQASystem.answer_question(question) + return response + + +# ============================================================================= +# Q&A AGENT CLASS +# ============================================================================= + + +class QAAgent: + """ + Question answering agent with DeepEval quality controls. + + Demonstrates graceful error handling when quality controls fail. + """ + + def __init__(self): + self.conversation_history: list[dict[str, str]] = [] + + async def ask(self, question: str) -> str: + """ + Ask a question and get an answer. + + Handles ControlViolationError gracefully by returning + a helpful message instead of exposing internal errors. + """ + self.conversation_history.append({"role": "user", "content": question}) + + try: + # Get answer - protected by DeepEval controls + answer = await answer_question(question) + + self.conversation_history.append({"role": "assistant", "content": answer}) + return answer + + except ControlViolationError as e: + # Control triggered - return helpful feedback + fallback = ( + f"I apologize, but my response didn't meet quality standards. " + f"({e.control_name})\n\n" + f"Could you rephrase your question or ask something else?" 
+ ) + self.conversation_history.append({"role": "assistant", "content": fallback}) + print(f"\n⚠️ Quality control triggered: {e.control_name}") + print(f" Reason: {e.message}") + return fallback + + +# ============================================================================= +# INTERACTIVE MODE +# ============================================================================= + + +def print_header(): + """Print the demo header.""" + print() + print("=" * 70) + print(" Q&A Agent with DeepEval Quality Controls") + print("=" * 70) + print() + print("This agent uses DeepEval GEval to enforce response quality:") + print(" ✓ Coherence - Responses must be logically consistent") + print(" ✓ Relevance - Answers must address the question") + print(" ○ Correctness - Factual accuracy (disabled by default)") + print() + print("Commands:") + print(" /test-good Test with high-quality questions") + print(" /test-bad Test quality control triggers") + print(" /help Show this help") + print(" /quit Exit") + print() + print("Or just type a question!") + print("-" * 70) + print() + + +def print_help(): + """Print help information.""" + print() + print("Available Commands:") + print(" /test-good Test with questions that produce quality answers") + print(" /test-bad Test questions that trigger quality controls") + print(" /help Show this help message") + print(" /quit or /exit Exit the program") + print() + print("Or ask any question and see how DeepEval evaluates quality!") + print() + + +async def run_good_tests(agent: QAAgent): + """Run tests with good quality responses.""" + print("\n" + "=" * 70) + print("Testing Good Quality Responses") + print("=" * 70) + print("\nThese should pass all quality controls.\n") + + test_questions = [ + "What is Python?", + "What is the capital of France?", + "How does photosynthesis work?", + "What is gravity?", + ] + + for question in test_questions: + print(f"Q: {question}") + answer = await agent.ask(question) + print(f"A: {answer[:150]}...") + 
print() + + +async def run_bad_tests(agent: QAAgent): + """Run tests that should trigger quality controls.""" + print("\n" + "=" * 70) + print("Testing Quality Control Triggers") + print("=" * 70) + print("\nThese should trigger DeepEval controls.\n") + + test_questions = [ + "Test trigger_incoherent response please", # Should fail coherence + "Tell me about something trigger_irrelevant", # Should fail relevance + ] + + for question in test_questions: + print(f"Q: {question}") + answer = await agent.ask(question) + print(f"A: {answer}") + print() + + +async def run_interactive(agent: QAAgent): + """Run interactive mode.""" + print_header() + + while True: + try: + user_input = input("You: ").strip() + except (KeyboardInterrupt, EOFError): + print("\nGoodbye!") + break + + if not user_input: + continue + + # Handle commands + if user_input.startswith("/"): + command = user_input.lower().split()[0] + + if command in ("/quit", "/exit"): + print("Goodbye!") + break + + elif command == "/help": + print_help() + + elif command == "/test-good": + await run_good_tests(agent) + + elif command == "/test-bad": + await run_bad_tests(agent) + + else: + print(f"Unknown command: {command}") + print("Type /help for available commands") + + else: + # Regular question + answer = await agent.ask(user_input) + print(f"\nAgent: {answer}\n") + + +# ============================================================================= +# MAIN +# ============================================================================= + + +async def main(): + """Run the Q&A agent.""" + # Check for OPENAI_API_KEY + if not os.getenv("OPENAI_API_KEY"): + print("\n⚠️ Warning: OPENAI_API_KEY not set!") + print(" DeepEval requires OpenAI API access for GEval.") + print(" Set it with: export OPENAI_API_KEY='your-key'") + print() + response = input("Continue anyway? (y/N): ").strip().lower() + if response != "y": + print("Exiting. 
Set OPENAI_API_KEY and try again.") + sys.exit(1) + + # Check server connection + server_url = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000") + print(f"\nConnecting to agent-control server at {server_url}...") + + import httpx + + try: + async with httpx.AsyncClient() as client: + resp = await client.get(f"{server_url}/health", timeout=5.0) + resp.raise_for_status() + print("✓ Connected to server") + except Exception as e: + print(f"\n❌ Cannot connect to server: {e}") + print(" Make sure the agent-control server is running.") + print(" Run setup_controls.py first to configure the agent.") + sys.exit(1) + + # Create and run agent + agent = QAAgent() + await run_interactive(agent) + + +if __name__ == "__main__": + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\n\nInterrupted. Goodbye!") diff --git a/examples/deepeval/setup_controls.py b/examples/deepeval/setup_controls.py new file mode 100755 index 00000000..ec1f37da --- /dev/null +++ b/examples/deepeval/setup_controls.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 +""" +Setup script that creates DeepEval-based controls for the Q&A Agent. + +This script: +1. Registers the agent with the server +2. Creates DeepEval GEval evaluator controls for quality checks +3. Creates a policy and attaches controls +4. Assigns the policy to the agent + +The controls demonstrate using DeepEval's LLM-as-a-judge to enforce: +- Response coherence +- Answer relevance +- Factual correctness + +Run this after starting the server to have a working demo. 
+""" + +import asyncio +import os +import sys +import uuid + +import httpx + +# Add the current directory to the path so we can import the evaluator +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +# Import and register the DeepEval evaluator +# This must be done before creating controls that use it +try: + from evaluator import DeepEvalEvaluator + + print(f"✓ DeepEval evaluator loaded: {DeepEvalEvaluator.metadata.name}") + + # Note: We don't check is_available() here because the evaluator + # may not be used immediately - it just needs to be registered + # so the server knows about it when creating control definitions + +except ImportError as e: + print(f"❌ Error: Cannot import DeepEval evaluator: {e}") + print("\nMake sure you're running from the examples/deepeval directory") + print("and that agent-control-models is installed") + sys.exit(1) + +# Agent configuration +AGENT_ID = "qa-agent-deepeval" +AGENT_NAME = "Q&A Agent with DeepEval" +AGENT_DESCRIPTION = "Question answering agent with DeepEval quality controls" + +SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000") + +# DeepEval controls to create +DEEPEVAL_CONTROLS = [ + { + "name": "check-coherence", + "description": "Ensures LLM responses are coherent and logically consistent", + "definition": { + "description": "Ensures LLM responses are coherent and logically consistent", + "enabled": True, + "execution": "server", + "scope": {"stages": ["post"]}, + "selector": {}, + "evaluator": { + "name": "deepeval-geval", + "config": { + "name": "Coherence", + "criteria": ( + "Evaluate whether the response is coherent, logically consistent, " + "and well-structured. Check for contradictions and flow of ideas. " + "The response should make logical sense and not contain contradictory statements." 
+ ), + "evaluation_params": ["input", "actual_output"], + "threshold": 0.6, + "model": "gpt-4o", + "strict_mode": False, + "verbose_mode": False, + }, + }, + "action": { + "decision": "deny", + "message": "Response failed coherence check - please reformulate", + }, + }, + }, + { + "name": "check-relevance", + "description": "Ensures responses are relevant to the user's question", + "definition": { + "description": "Ensures responses are relevant to the user's question", + "enabled": True, + "execution": "server", + "scope": {"stages": ["post"]}, + "selector": {}, + "evaluator": { + "name": "deepeval-geval", + "config": { + "name": "Relevance", + "criteria": ( + "Determine whether the actual output is relevant and directly addresses " + "the input query. Check if it stays on topic and provides useful information " + "that answers the question asked." + ), + "evaluation_params": ["input", "actual_output"], + "threshold": 0.5, + "model": "gpt-4o", + "strict_mode": False, + }, + }, + "action": { + "decision": "deny", + "message": "Response is not relevant to the question - please provide a relevant answer", + }, + }, + }, + { + "name": "check-correctness", + "description": "Validates factual correctness against expected answers (when available)", + "definition": { + "description": "Validates factual correctness against expected answers (when available)", + "enabled": False, # Disabled by default - enable when you have expected outputs + "execution": "server", + "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, + "selector": {"path": "*"}, + "evaluator": { + "name": "deepeval-geval", + "config": { + "name": "Correctness", + "evaluation_steps": [ + "Check whether facts in actual output contradict expected output", + "Heavily penalize omission of critical details", + "Minor wording differences are acceptable", + "Focus on factual accuracy, not style", + ], + "evaluation_params": ["actual_output", "expected_output"], + "threshold": 0.8, + "model": "gpt-4o", + 
}, + }, + "action": { + "decision": "warn", + "message": "Response may contain factual errors - review carefully", + }, + }, + }, +] + + +async def setup_demo(quiet: bool = False): + """Set up the demo agent with DeepEval controls.""" + # Generate the same UUID5 that the SDK generates + agent_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, AGENT_ID)) + + print(f"Setting up agent: {AGENT_NAME}") + print(f"Agent ID: {AGENT_ID}") + print(f"Agent UUID: {agent_uuid}") + print(f"Server URL: {SERVER_URL}") + print() + + async with httpx.AsyncClient(base_url=SERVER_URL, timeout=30.0) as client: + # Check server health + try: + resp = await client.get("/health") + resp.raise_for_status() + print("✓ Server is healthy") + except httpx.HTTPError as e: + print(f"❌ Error: Cannot connect to server at {SERVER_URL}") + print(f" {e}") + print("\nMake sure the server is running") + return False + + # Register the agent + try: + resp = await client.post( + "/api/v1/agents/initAgent", + json={ + "agent": { + "agent_id": agent_uuid, + "agent_name": AGENT_NAME, + "agent_description": AGENT_DESCRIPTION, + }, + "tools": [], + }, + ) + resp.raise_for_status() + result = resp.json() + status = "Created" if result.get("created") else "Updated" + print(f"✓ {status} agent: {AGENT_NAME}") + except httpx.HTTPError as e: + print(f"❌ Error registering agent: {e}") + return False + + # Get or create a policy for the agent + policy_name = f"policy-{AGENT_ID}" + policy_id = None + + # Check if agent already has a policy + try: + resp = await client.get(f"/api/v1/agents/{agent_uuid}/policy") + if resp.status_code == 200: + policy_id = resp.json().get("policy_id") + print(f"✓ Found existing policy: {policy_id}") + except httpx.HTTPError: + pass # No policy yet + + # Create policy if needed + if not policy_id: + try: + resp = await client.put( + "/api/v1/policies", + json={"name": policy_name}, + ) + if resp.status_code == 409: + # Policy name exists but not assigned - create with unique name + import time + + 
policy_name = f"policy-{AGENT_ID}-{int(time.time())}" + resp = await client.put( + "/api/v1/policies", + json={"name": policy_name}, + ) + resp.raise_for_status() + policy_id = resp.json()["policy_id"] + print(f"✓ Created policy: {policy_name}") + + # Assign policy to agent + resp = await client.post(f"/api/v1/agents/{agent_uuid}/policy/{policy_id}") + resp.raise_for_status() + print(f"✓ Assigned policy to agent") + except httpx.HTTPError as e: + print(f"❌ Error setting up policy: {e}") + return False + + # Create controls and add to policy + print() + print("Creating DeepEval controls...") + controls_created = 0 + controls_updated = 0 + + for control_spec in DEEPEVAL_CONTROLS: + control_name = control_spec["name"] + definition = control_spec["definition"] + description = control_spec["description"] + + try: + # Create control + resp = await client.put( + "/api/v1/controls", + json={"name": control_name}, + ) + if resp.status_code == 409: + # Control exists, get its ID + resp = await client.get("/api/v1/controls", params={"name": control_name}) + resp.raise_for_status() + controls = resp.json().get("controls", []) + if controls: + control_id = controls[0]["id"] + controls_updated += 1 + else: + continue + else: + resp.raise_for_status() + control_id = resp.json()["control_id"] + controls_created += 1 + + # Set control definition + resp = await client.put( + f"/api/v1/controls/{control_id}/data", + json={"data": definition}, + ) + resp.raise_for_status() + + # Add control to policy + resp = await client.post(f"/api/v1/policies/{policy_id}/controls/{control_id}") + resp.raise_for_status() + + status = "✓" if definition.get("enabled") else "○" + enabled_text = "enabled" if definition.get("enabled") else "disabled" + print(f" {status} {control_name} ({enabled_text})") + print(f" {description}") + + except httpx.HTTPError as e: + print(f" ❌ Error with control '{control_name}': {e}") + continue + + print() + if controls_created > 0: + print(f"✓ Created {controls_created} 
new control(s)") + if controls_updated > 0: + print(f"✓ Updated {controls_updated} existing control(s)") + print(f"✓ Agent has {len(DEEPEVAL_CONTROLS)} DeepEval control(s) configured") + print() + print("=" * 70) + print("Setup Complete!") + print("=" * 70) + print() + print("Next steps:") + print(" 1. Ensure OPENAI_API_KEY is set (required for DeepEval)") + print(" 2. Run the Q&A agent: python qa_agent.py") + print(" 3. Ask questions and observe quality controls in action") + print() + print("Note: The 'check-correctness' control is disabled by default.") + print(" Enable it when you have test cases with expected outputs.") + print() + + return True + + +if __name__ == "__main__": + success = asyncio.run(setup_demo()) + sys.exit(0 if success else 1) diff --git a/examples/deepeval/start_server_with_evaluator.sh b/examples/deepeval/start_server_with_evaluator.sh new file mode 100644 index 00000000..1c2a1390 --- /dev/null +++ b/examples/deepeval/start_server_with_evaluator.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Start the agent-control server with DeepEval evaluator registered + +# Get the directory containing this script +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# Add this directory to PYTHONPATH so the server can import the evaluator +export PYTHONPATH="$DIR:$PYTHONPATH" + +# Import the evaluator before starting the server +python3 -c "import sys; sys.path.insert(0, '$DIR'); from evaluator import DeepEvalEvaluator; print(f'✓ Loaded {DeepEvalEvaluator.metadata.name}')" + +echo "Starting server with DeepEval evaluator..." +echo "PYTHONPATH: $PYTHONPATH" +echo "" + +# Navigate to repository root and start server +cd "$DIR/../.." +./demo.sh start