From 4b13a188f62243159e3eb07ddc518af477169848 Mon Sep 17 00:00:00 2001 From: Abhinav Gupta Date: Wed, 28 Jan 2026 19:33:30 +0530 Subject: [PATCH 1/6] refactor: rename plugin terminology to evaluator throughout codebase MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename `plugins/` package to `evaluators/` with updated class names: - RegexPlugin → RegexEvaluator - ListPlugin → ListEvaluator - JSONControlEvaluatorPlugin → JSONEvaluator - SQLControlEvaluatorPlugin → SQLEvaluator - Luna2Plugin → Luna2Evaluator - Rename config classes to follow XEvaluatorConfig pattern: - RegexConfig → RegexEvaluatorConfig - ListConfig → ListEvaluatorConfig - JSONControlEvaluatorPluginConfig → JSONEvaluatorConfig - SQLControlEvaluatorPluginConfig → SQLEvaluatorConfig - Luna2Config → Luna2EvaluatorConfig - Update models package: - Rename plugin.py to evaluator.py - PluginMetadata → EvaluatorMetadata - PluginEvaluator → Evaluator - register_plugin → register_evaluator - EvaluatorConfig.plugin field → EvaluatorConfig.name - Update engine package: - discover_plugins → discover_evaluators - list_plugins → list_evaluators - Entry point: agent_control.plugins → agent_control.evaluators - get_evaluator → get_evaluator_instance (to avoid name collision) - Update server: - /api/v1/plugins endpoint → /api/v1/evaluators - Add migration to rename plugin column to evaluator in evaluator_configs table - Remove all backwards compatibility aliases - Update all documentation, examples, and tests --- .github/workflows/pr-title.yaml | 2 +- AGENTS.md | 18 +- CONTRIBUTING.md | 127 ++- README.md | 34 +- docs/OVERVIEW.md | 110 +- docs/REFERENCE.md | 112 +- docs/{plugins => evaluators}/json.md | 16 +- docs/{plugins => evaluators}/sql.md | 32 +- docs/observability.md | 2 +- engine/README.md | 38 +- engine/pyproject.toml | 4 +- engine/src/agent_control_engine/__init__.py | 18 +- engine/src/agent_control_engine/core.py | 10 +- 
engine/src/agent_control_engine/discovery.py | 68 +- engine/src/agent_control_engine/evaluators.py | 46 +- engine/tests/conftest.py | 12 +- engine/tests/test_core.py | 138 +-- engine/tests/test_discovery.py | 230 ++--- ...tors.py => test_evaluator_integrations.py} | 163 ++- engine/tests/test_evaluators.py | 140 +-- evaluators/README.md | 23 + {plugins => evaluators}/pyproject.toml | 17 +- .../src/agent_control_evaluators/__init__.py | 32 + .../builtin/__init__.py | 11 + .../agent_control_evaluators}/builtin/json.py | 23 +- .../agent_control_evaluators}/builtin/list.py | 22 +- .../builtin/regex.py | 22 +- .../agent_control_evaluators}/builtin/sql.py | 30 +- .../luna2/__init__.py | 14 +- .../agent_control_evaluators}/luna2/client.py | 1 - .../agent_control_evaluators}/luna2/config.py | 13 +- .../luna2/evaluator.py | 38 +- .../src/agent_control_evaluators}/py.typed | 0 evaluators/tests/__init__.py | 1 + evaluators/tests/test_base.py | 141 +++ {plugins => evaluators}/tests/test_json.py | 339 +++--- {plugins => evaluators}/tests/test_sql.py | 968 +++++++++--------- examples/README.md | 4 +- examples/agent_control_demo/setup_controls.py | 6 +- .../agent_control_demo/update_controls.py | 4 +- examples/customer_support_agent/README.md | 14 +- .../setup_demo_controls.py | 16 +- examples/galileo/README.md | 8 +- examples/galileo/luna2_demo.py | 8 +- examples/galileo/pyproject.toml | 2 +- examples/langchain/README.md | 16 +- examples/langchain/pyproject.toml | 4 +- examples/langchain/setup_sql_controls.py | 2 +- examples/langchain/sql_agent_protection.py | 2 +- models/src/agent_control_models/__init__.py | 48 +- models/src/agent_control_models/controls.py | 60 +- models/src/agent_control_models/errors.py | 4 +- .../{plugin.py => evaluator.py} | 112 +- .../src/agent_control_models/observability.py | 10 +- models/src/agent_control_models/server.py | 14 +- plugins/README.md | 144 --- plugins/src/agent_control_plugins/__init__.py | 30 - plugins/src/agent_control_plugins/base.py | 
9 - .../agent_control_plugins/builtin/__init__.py | 11 - plugins/tests/__init__.py | 2 - plugins/tests/test_base.py | 145 --- pyproject.toml | 6 +- scripts/build.py | 12 +- sdks/python/pyproject.toml | 6 +- sdks/python/src/agent_control/__init__.py | 6 +- sdks/python/src/agent_control/agents.py | 6 +- .../src/agent_control/control_decorators.py | 2 +- sdks/python/src/agent_control/controls.py | 2 +- sdks/python/src/agent_control/evaluation.py | 22 +- .../src/agent_control/evaluators/__init__.py | 56 + .../src/agent_control/evaluators/base.py | 9 + .../src/agent_control/plugins/__init__.py | 57 -- sdks/python/src/agent_control/plugins/base.py | 10 - sdks/python/tests/test_evaluators.py | 260 +++++ sdks/python/tests/test_local_evaluation.py | 40 +- ...una2_plugin.py => test_luna2_evaluator.py} | 336 +++--- sdks/python/tests/test_plugins.py | 260 ----- server/Dockerfile | 2 +- server/README.md | 10 +- ...c8d9e0f1a2b3_rename_plugin_to_evaluator.py | 48 + server/pyproject.toml | 12 +- server/src/agent_control_server/auth.py | 2 +- .../agent_control_server/endpoints/agents.py | 42 +- .../endpoints/controls.py | 28 +- .../endpoints/evaluation.py | 8 +- .../endpoints/evaluator_configs.py | 54 +- .../endpoints/evaluators.py | 55 + .../agent_control_server/endpoints/plugins.py | 53 - server/src/agent_control_server/main.py | 14 +- server/src/agent_control_server/models.py | 2 +- .../services/evaluator_utils.py | 16 +- server/tests/conftest.py | 6 +- server/tests/test_auth.py | 28 +- server/tests/test_controls.py | 2 +- server/tests/test_controls_validation.py | 20 +- server/tests/test_error_handling.py | 2 +- server/tests/test_evaluation_e2e.py | 16 +- .../test_evaluation_e2e_list_evaluator.py | 26 +- .../test_evaluation_e2e_sql_evaluator.py | 20 +- .../tests/test_evaluation_error_handling.py | 20 +- server/tests/test_evaluator_configs.py | 56 +- server/tests/test_evaluator_schemas.py | 4 +- server/tests/test_evaluator_utils.py | 8 +- server/tests/test_new_features.py | 44 
+- server/tests/test_observability_endpoints.py | 2 +- server/tests/test_observability_models.py | 4 +- server/tests/utils.py | 2 +- 107 files changed, 2645 insertions(+), 2741 deletions(-) rename docs/{plugins => evaluators}/json.md (96%) rename docs/{plugins => evaluators}/sql.md (91%) rename engine/tests/{test_plugin_evaluators.py => test_evaluator_integrations.py} (59%) create mode 100644 evaluators/README.md rename {plugins => evaluators}/pyproject.toml (55%) create mode 100644 evaluators/src/agent_control_evaluators/__init__.py create mode 100644 evaluators/src/agent_control_evaluators/builtin/__init__.py rename {plugins/src/agent_control_plugins => evaluators/src/agent_control_evaluators}/builtin/json.py (97%) rename {plugins/src/agent_control_plugins => evaluators/src/agent_control_evaluators}/builtin/list.py (90%) rename {plugins/src/agent_control_plugins => evaluators/src/agent_control_evaluators}/builtin/regex.py (82%) rename {plugins/src/agent_control_plugins => evaluators/src/agent_control_evaluators}/builtin/sql.py (98%) rename {plugins/src/agent_control_plugins => evaluators/src/agent_control_evaluators}/luna2/__init__.py (71%) rename {plugins/src/agent_control_plugins => evaluators/src/agent_control_evaluators}/luna2/client.py (99%) rename {plugins/src/agent_control_plugins => evaluators/src/agent_control_evaluators}/luna2/config.py (93%) rename plugins/src/agent_control_plugins/luna2/plugin.py => evaluators/src/agent_control_evaluators/luna2/evaluator.py (91%) rename {plugins/src/agent_control_plugins => evaluators/src/agent_control_evaluators}/py.typed (100%) create mode 100644 evaluators/tests/__init__.py create mode 100644 evaluators/tests/test_base.py rename {plugins => evaluators}/tests/test_json.py (63%) rename {plugins => evaluators}/tests/test_sql.py (70%) rename models/src/agent_control_models/{plugin.py => evaluator.py} (60%) delete mode 100644 plugins/README.md delete mode 100644 plugins/src/agent_control_plugins/__init__.py delete mode 
100644 plugins/src/agent_control_plugins/base.py delete mode 100644 plugins/src/agent_control_plugins/builtin/__init__.py delete mode 100644 plugins/tests/__init__.py delete mode 100644 plugins/tests/test_base.py create mode 100644 sdks/python/src/agent_control/evaluators/__init__.py create mode 100644 sdks/python/src/agent_control/evaluators/base.py delete mode 100644 sdks/python/src/agent_control/plugins/__init__.py delete mode 100644 sdks/python/src/agent_control/plugins/base.py create mode 100644 sdks/python/tests/test_evaluators.py rename sdks/python/tests/{test_luna2_plugin.py => test_luna2_evaluator.py} (68%) delete mode 100644 sdks/python/tests/test_plugins.py create mode 100644 server/alembic/versions/c8d9e0f1a2b3_rename_plugin_to_evaluator.py create mode 100644 server/src/agent_control_server/endpoints/evaluators.py delete mode 100644 server/src/agent_control_server/endpoints/plugins.py diff --git a/.github/workflows/pr-title.yaml b/.github/workflows/pr-title.yaml index 216a3484..04f5186a 100644 --- a/.github/workflows/pr-title.yaml +++ b/.github/workflows/pr-title.yaml @@ -32,7 +32,7 @@ jobs: server models engine - plugins + evaluators ci docs infra diff --git a/AGENTS.md b/AGENTS.md index 8309395d..32d05971 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -22,11 +22,11 @@ Forwarded targets: ## Repo layout (uv workspace members) -- `models/`: shared Pydantic v2 models and plugin base classes (`models/src/agent_control_models/`) -- `engine/`: **control evaluation engine and plugin system** — all evaluation logic, plugin discovery, and plugin orchestration lives here (`engine/src/agent_control_engine/`) +- `models/`: shared Pydantic v2 models and evaluator base classes (`models/src/agent_control_models/`) +- `engine/`: **control evaluation engine and evaluator system** — all evaluation logic, evaluator discovery, and evaluator orchestration lives here (`engine/src/agent_control_engine/`) - `server/`: FastAPI server (`server/src/agent_control_server/`) - 
`sdks/python/`: Python SDK — uses engine for evaluation (`sdks/python/src/agent_control/`) -- `plugins/`: plugin implementations (`plugins/src/agent_control_plugins/`) +- `evaluators/`: evaluator implementations (`evaluators/src/agent_control_evaluators/`) - `ui/`: Nextjs based web app to manage agent controls - `examples/`: runnable examples (ruff has relaxed import rules here) @@ -66,12 +66,12 @@ All testing guidance (including “behavior changes require tests”) lives in ` 4) add SDK wrapper in `sdks/python/src/agent_control/` 5) add tests (server + SDK) and update docs/examples if user-facing -- Add a new evaluator plugin: - 1) implement plugin class extending `PluginEvaluator` in `plugins/src/agent_control_plugins/` - 2) use `@register_plugin` decorator (from `agent_control_models`) - 3) add entry point in `plugins/pyproject.toml` for auto-discovery - 4) add tests in the plugins package - 5) plugin is automatically available to server and SDK via `discover_plugins()` +- Add a new evaluator: + 1) implement evaluator class extending `Evaluator` in `evaluators/src/agent_control_evaluators/` + 2) use `@register_evaluator` decorator (from `agent_control_models`) + 3) add entry point in `evaluators/pyproject.toml` for auto-discovery + 4) add tests in the evaluators package + 5) evaluator is automatically available to server and SDK via `discover_evaluators()` ## Git/PR workflow diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8b9f10b7..be940723 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,7 +12,7 @@ agent-control/ ├── server/ # FastAPI server (agent-control-server) ├── sdks/python/ # Python SDK (agent-control) ├── engine/ # Control evaluation engine (agent-control-engine) -├── plugins/ # Plugin implementations (agent-control-plugins) +├── evaluators/ # Evaluator implementations (agent-control-evaluators) └── examples/ # Usage examples ``` @@ -20,7 +20,7 @@ agent-control/ ``` SDK ──────────────────────────────────────┐ ▼ -Server ──► Engine ──► Models 
◄── Plugins +Server ──► Engine ──► Models ◄── Evaluators ``` --- @@ -151,7 +151,7 @@ sdks/python/src/agent_control/ ├── control_sets.py # Control set management ├── evaluation.py # Evaluation checks ├── control_decorators.py # @control decorator -└── plugins/ # Plugin system +└── evaluators/ # Evaluator system ``` **Key exports:** @@ -186,7 +186,7 @@ make test # Starts server automatically ### Engine (`engine/`) -Core control evaluation logic. The engine loads plugins and executes evaluations. +Core control evaluation logic. The engine loads evaluators and executes evaluations. ```bash # Location @@ -194,13 +194,13 @@ engine/src/agent_control_engine/ # Key files ├── core.py # Main ControlEngine class -├── evaluators.py # Plugin loader and caching +├── evaluators.py # Evaluator loader and caching └── selectors.py # Data selection from payloads ``` **How it works:** -- The engine uses the plugin registry to find evaluators -- Plugins are cached for performance (LRU cache) +- The engine uses the evaluator registry to find evaluators +- Evaluators are cached for performance (LRU cache) - Selectors extract data from payloads before evaluation **Testing:** @@ -209,78 +209,77 @@ cd engine make test ``` -> **Note:** To add new evaluators, create a plugin in `plugins/` rather than modifying the engine directly. See the Plugins section above. +> **Note:** To add new evaluators, create an evaluator in `evaluators/` rather than modifying the engine directly. See the Evaluators section below. --- -### Plugins (`plugins/`) +### Evaluators (`evaluators/`) -Extensible evaluator plugins for custom detection logic. +Extensible evaluators for custom detection logic. 
```bash # Location -plugins/src/agent_control_plugins/ +evaluators/src/agent_control_evaluators/ # Key directories -├── base.py # PluginEvaluator base class ├── builtin/ # Built-in evaluators -│ ├── regex.py # RegexPlugin - pattern matching -│ └── list.py # ListPlugin - value matching +│ ├── regex.py # RegexEvaluator - pattern matching +│ └── list.py # ListEvaluator - value matching └── luna2/ # Galileo Luna-2 integration - ├── plugin.py # Luna2Plugin implementation - ├── config.py # Luna2Config model - └── client.py # Direct HTTP client (no SDK dependency) + ├── evaluator.py # Luna2Evaluator implementation + ├── config.py # Luna2EvaluatorConfig model + └── client.py # Direct HTTP client (no SDK dependency) ``` -**Adding a new plugin:** +**Adding a new evaluator:** -1. **Create plugin directory:** +1. **Create evaluator directory:** ```bash - mkdir plugins/src/agent_control_plugins/my_plugin/ + mkdir evaluators/src/agent_control_evaluators/my_evaluator/ ``` 2. **Define configuration model (`config.py`):** ```python from pydantic import BaseModel, Field - class MyPluginConfig(BaseModel): - """Configuration for MyPlugin.""" + class MyEvaluatorConfig(BaseModel): + """Configuration for MyEvaluator.""" threshold: float = Field(0.5, ge=0.0, le=1.0) api_endpoint: str = Field(default="https://api.example.com") ``` -3. **Implement plugin (`plugin.py`):** +3. 
**Implement evaluator (`evaluator.py`):** ```python from typing import Any from agent_control_models import ( EvaluatorResult, - PluginEvaluator, - PluginMetadata, - register_plugin, + Evaluator, + EvaluatorMetadata, + register_evaluator, ) - from .config import MyPluginConfig - - @register_plugin - class MyPlugin(PluginEvaluator[MyPluginConfig]): - """My custom evaluator plugin.""" - - metadata = PluginMetadata( - name="my-plugin", + from .config import MyEvaluatorConfig + + @register_evaluator + class MyEvaluator(Evaluator[MyEvaluatorConfig]): + """My custom evaluator.""" + + metadata = EvaluatorMetadata( + name="my-evaluator", version="1.0.0", description="Custom detection logic", requires_api_key=False, timeout_ms=5000, ) - config_model = MyPluginConfig + config_model = MyEvaluatorConfig - def __init__(self, config: MyPluginConfig) -> None: + def __init__(self, config: MyEvaluatorConfig) -> None: super().__init__(config) # Initialize any clients or resources async def evaluate(self, data: Any) -> EvaluatorResult: # Your detection logic here score = await self._analyze(str(data)) - + return EvaluatorResult( matched=score > self.config.threshold, confidence=score, @@ -291,28 +290,28 @@ plugins/src/agent_control_plugins/ 4. **Export in `__init__.py`:** ```python - from .config import MyPluginConfig - from .plugin import MyPlugin + from .config import MyEvaluatorConfig + from .evaluator import MyEvaluator - __all__ = ["MyPlugin", "MyPluginConfig"] + __all__ = ["MyEvaluator", "MyEvaluatorConfig"] ``` -5. **Add optional dependencies in `plugins/pyproject.toml`:** +5. **Add optional dependencies in `evaluators/pyproject.toml`:** ```toml [project.optional-dependencies] - my-plugin = ["httpx>=0.24.0"] # Add your dependencies - all = ["httpx>=0.24.0", ...] # Include in 'all' extra + my-evaluator = ["httpx>=0.24.0"] # Add your dependencies + all = ["httpx>=0.24.0", ...] # Include in 'all' extra ``` -6. **Add tests in `plugins/tests/`** +6. 
**Add tests in `evaluators/tests/`** -**Plugin Best Practices:** +**Evaluator Best Practices:** - Use Pydantic for config validation - Make API calls async with httpx - Return confidence scores (0.0-1.0) - Include helpful metadata for debugging - Handle errors gracefully (respect `on_error` config) -- Avoid storing request-scoped state (plugins are cached) +- Avoid storing request-scoped state (evaluators are cached) --- @@ -411,7 +410,7 @@ Update `version` in respective `pyproject.toml` files: - `server/pyproject.toml` - `sdks/python/pyproject.toml` - `engine/pyproject.toml` -- `plugins/pyproject.toml` +- `evaluators/pyproject.toml` --- @@ -455,22 +454,22 @@ test: add control set integration tests 5. Add tests for both server and SDK 6. Update examples if user-facing -### Add a new evaluator plugin +### Add a new evaluator -1. Create plugin directory in `plugins/src/agent_control_plugins/` -2. Implement `PluginEvaluator` interface (see Plugins section above) -3. Add `@register_plugin` decorator to your plugin class -4. Add optional dependencies in `plugins/pyproject.toml` -5. Export from `plugins/src/agent_control_plugins/__init__.py` -6. Add tests in `plugins/tests/` +1. Create evaluator directory in `evaluators/src/agent_control_evaluators/` +2. Implement `Evaluator` interface (see Evaluators section above) +3. Add `@register_evaluator` decorator to your evaluator class +4. Add optional dependencies in `evaluators/pyproject.toml` +5. Export from `evaluators/src/agent_control_evaluators/__init__.py` +6. Add tests in `evaluators/tests/` 7. Update `docs/OVERVIEW.md` with usage examples ### Add a built-in evaluator (regex/list style) -1. Add evaluator class in `plugins/src/agent_control_plugins/builtin/` +1. Add evaluator class in `evaluators/src/agent_control_evaluators/builtin/` 2. Add config model in `models/src/agent_control_models/controls.py` -3. Register with `@register_plugin` decorator -4. Add comprehensive tests in `plugins/tests/` +3. 
Register with `@register_evaluator` decorator +4. Add comprehensive tests in `evaluators/tests/` ### Update shared models @@ -496,18 +495,18 @@ test: add control set integration tests --- -## Plugin Development Quick Reference +## Evaluator Development Quick Reference | Task | Location | |------|----------| -| Plugin base class | `agent_control_models.PluginEvaluator` | -| Plugin metadata | `agent_control_models.PluginMetadata` | +| Evaluator base class | `agent_control_models.Evaluator` | +| Evaluator metadata | `agent_control_models.EvaluatorMetadata` | | Evaluator result | `agent_control_models.EvaluatorResult` | -| Register decorator | `@agent_control_models.register_plugin` | -| Built-in plugins | `plugins/src/agent_control_plugins/builtin/` | -| Plugin tests | `plugins/tests/` | +| Register decorator | `@agent_control_models.register_evaluator` | +| Built-in evaluators | `evaluators/src/agent_control_evaluators/builtin/` | +| Evaluator tests | `evaluators/tests/` | -**Plugin config model fields:** +**Evaluator config model fields:** ```python from pydantic import BaseModel, Field diff --git a/README.md b/README.md index f45cd98d..2f0e5b96 100644 --- a/README.md +++ b/README.md @@ -40,8 +40,8 @@ except ControlViolationError as e: - **Centralized Policies** — Define controls once, apply to multiple agents - **Web Dashboard** — Manage agents and controls through the UI - **API Key Authentication** — Secure your control server in production -- **Pluggable Evaluators** — Regex, list matching, AI-powered detection (Luna-2), or custom plugins -- **Fail-Safe Defaults** — Deny controls fail closed on error; plugins like Luna-2 support configurable error handling +- **Pluggable Evaluators** — Regex, list matching, AI-powered detection (Luna-2), or custom evaluators +- **Fail-Safe Defaults** — Deny controls fail closed on error; evaluators like Luna-2 support configurable error handling --- @@ -133,7 +133,7 @@ async def main(): | `AGENT_CONTROL_URL` | 
`http://localhost:8000` | Server URL for SDK | | `AGENT_CONTROL_API_KEY` | — | API key for authentication (if enabled) | | `DB_URL` | `sqlite+aiosqlite:///./agent_control.db` | Database connection string | -| `GALILEO_API_KEY` | — | Required for Luna-2 AI evaluator plugin | +| `GALILEO_API_KEY` | — | Required for Luna-2 AI evaluator | ### Server Configuration @@ -161,7 +161,7 @@ Controls are defined via the API or dashboard. Each control specifies what to ch "scope": { "step_types": ["llm_inference"], "stages": ["post"] }, "selector": { "path": "output" }, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { "pattern": "\\b\\d{3}-\\d{2}-\\d{4}\\b" } }, "action": { "decision": "deny" } @@ -179,7 +179,7 @@ Controls are defined via the API or dashboard. Each control specifies what to ch "scope": { "step_types": ["llm_inference"], "stages": ["pre"] }, "selector": { "path": "input" }, "evaluator": { - "plugin": "galileo-luna2", + "name": "galileo-luna2", "config": { "metric": "input_toxicity", "operator": "gt", @@ -200,11 +200,11 @@ Agent Control is built as a monorepo with these components: ``` ┌──────────────────────────────────────────────────────────────────┐ -│ Your Application │ +│ Your Application │ │ ┌────────────────────────────────────────────────────────────┐ │ -│ │ @control() decorator │ │ -│ │ │ │ │ -│ │ ▼ │ │ +│ │ @control() decorator │ │ +│ │ │ │ │ +│ │ ▼ │ │ │ │ ┌──────────┐ ┌─────────────────┐ ┌──────────────┐ │ │ │ │ │ Input │───▶│ Agent Control │───▶│ Output │ │ │ │ │ │ │ │ Engine │ │ │ │ │ @@ -214,19 +214,19 @@ Agent Control is built as a monorepo with these components: │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ Agent Control Server │ +│ Agent Control Server │ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ -│ │ Controls │ │ Policies │ │ Plugins │ │ Agents │ │ +│ │ Controls │ │ Policies │ │ Evaluators │ │ Agents │ │ │ │ API │ │ API │ │ Registry │ │ API │ │ │ └────────────┘ └────────────┘ 
└────────────┘ └────────────┘ │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ Plugin Ecosystem │ +│ Evaluator Ecosystem │ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │ │ Regex │ │ List │ │ Luna-2 │ │ Custom │ │ -│ │ Evaluator │ │ Evaluator │ │ Plugin │ │ Plugins │ │ +│ │ Evaluator │ │ Evaluator │ │ Evaluator │ │ Evaluators │ │ │ └────────────┘ └────────────┘ └────────────┘ └────────────┘ │ └──────────────────────────────────────────────────────────────────┘ ``` @@ -235,9 +235,9 @@ Agent Control is built as a monorepo with these components: |:--------|:------------| | `agent-control` | Python SDK with `@control()` decorator | | `agent-control-server` | FastAPI server with Control Management API | -| `agent-control-engine` | Core evaluation logic and plugin system | +| `agent-control-engine` | Core evaluation logic and evaluator system | | `agent-control-models` | Shared Pydantic v2 models | -| `agent-control-plugins` | Built-in evaluator plugins | +| `agent-control-evaluators` | Built-in evaluators | | `ui` | Next.js web dashboard | --- @@ -252,7 +252,7 @@ agent-control/ ├── server/ # FastAPI server (agent-control-server) ├── engine/ # Evaluation engine (agent-control-engine) ├── models/ # Shared models (agent-control-models) -├── plugins/ # Plugin implementations (agent-control-plugins) +├── evaluators/ # Evaluator implementations (agent-control-evaluators) ├── ui/ # Next.js dashboard └── examples/ # Usage examples ``` @@ -291,7 +291,7 @@ For detailed development workflows, see [CONTRIBUTING.md](CONTRIBUTING.md). 
- **[Python SDK](sdks/python/README.md)** — SDK installation, usage, and API reference - **[Server](server/README.md)** — Server setup, configuration, and deployment - **[UI Dashboard](ui/README.md)** — Web dashboard setup and usage -- **[Plugins](plugins/README.md)** — Available evaluator plugins and custom plugin development +- **[Evaluators](evaluators/README.md)** — Available evaluators and custom evaluator development ### Examples diff --git a/docs/OVERVIEW.md b/docs/OVERVIEW.md index e9e526d9..ad4646c2 100644 --- a/docs/OVERVIEW.md +++ b/docs/OVERVIEW.md @@ -40,7 +40,7 @@ Example: *"If the output contains an SSN pattern, block the response."* "scope": { "step_types": ["llm_inference"], "stages": ["post"] }, "selector": { "path": "output" }, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { "pattern": "\\b\\d{3}-\\d{2}-\\d{4}\\b" } }, "action": { "decision": "deny" } @@ -95,7 +95,7 @@ A **Selector** defines *what data* to extract from the payload for evaluation. ### 🔍 Evaluators -An **Evaluator** defines *how* to analyze the selected data. Agent Control provides built-in evaluators and supports custom plugins. +An **Evaluator** defines *how* to analyze the selected data. Agent Control provides built-in evaluators and supports custom evaluators. ### ⚡ Actions @@ -138,7 +138,7 @@ Pattern matching using Google RE2 (safe from ReDoS attacks). ```json // Block Social Security Numbers { - "plugin": "regex", + "name": "regex", "config": { "pattern": "\\b\\d{3}-\\d{2}-\\d{4}\\b" } @@ -146,7 +146,7 @@ Pattern matching using Google RE2 (safe from ReDoS attacks). // Block credit card numbers (case-insensitive "card" + digits) { - "plugin": "regex", + "name": "regex", "config": { "pattern": "card.*\\d{4}[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}", "flags": ["IGNORECASE"] @@ -155,7 +155,7 @@ Pattern matching using Google RE2 (safe from ReDoS attacks). 
// Block AWS access keys { - "plugin": "regex", + "name": "regex", "config": { "pattern": "AKIA[0-9A-Z]{16}" } @@ -188,7 +188,7 @@ Flexible value matching with multiple modes and logic options. ```json // Block admin/root keywords (any match, contains, case-insensitive) { - "plugin": "list", + "name": "list", "config": { "values": ["admin", "root", "sudo", "superuser"], "logic": "any", @@ -199,7 +199,7 @@ Flexible value matching with multiple modes and logic options. // Require approval keyword (trigger if NOT found) { - "plugin": "list", + "name": "list", "config": { "values": ["APPROVED", "VERIFIED"], "match_on": "no_match", @@ -209,7 +209,7 @@ Flexible value matching with multiple modes and logic options. // Block competitor mentions { - "plugin": "list", + "name": "list", "config": { "values": ["CompetitorA", "CompetitorB", "CompetitorC"], "match_mode": "contains", @@ -219,7 +219,7 @@ Flexible value matching with multiple modes and logic options. // Allowlist: only permit specific tools { - "plugin": "list", + "name": "list", "config": { "values": ["search", "calculate", "lookup"], "match_on": "no_match" @@ -236,7 +236,7 @@ Flexible value matching with multiple modes and logic options. --- -### 3. Luna-2 Plugin (`galileo-luna2`) +### 3. Luna-2 Evaluator (`galileo-luna2`) AI-powered detection using Galileo's Luna-2 small language models. Provides real-time, low-latency evaluation for complex patterns that can't be caught with regex or lists. @@ -267,7 +267,7 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real ```json // Block toxic inputs (score > 0.5) { - "plugin": "galileo-luna2", + "name": "galileo-luna2", "config": { "metric": "input_toxicity", "operator": "gt", @@ -278,7 +278,7 @@ AI-powered detection using Galileo's Luna-2 small language models. 
Provides real // Block prompt injection attempts { - "plugin": "galileo-luna2", + "name": "galileo-luna2", "config": { "metric": "prompt_injection", "operator": "gt", @@ -289,7 +289,7 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real // Flag potential hallucinations (warn but allow) { - "plugin": "galileo-luna2", + "name": "galileo-luna2", "config": { "metric": "hallucination", "operator": "gt", @@ -299,7 +299,7 @@ AI-powered detection using Galileo's Luna-2 small language models. Provides real // Using a central stage (pre-defined server-side rules) { - "plugin": "galileo-luna2", + "name": "galileo-luna2", "config": { "stage_type": "central", "stage_name": "production-safety", @@ -365,7 +365,7 @@ Every evaluation is logged with: Answer questions like: *"Why was this blocked?"* or *"What threats did we stop this week?"* ### 🔌 Pluggable Architecture -Use built-in evaluators or bring your own. The plugin system supports: +Use built-in evaluators or bring your own. 
The evaluator system supports: - Simple pattern matching (regex, word lists) - AI-powered detection (toxicity, prompt injection, hallucination) - Custom business logic @@ -397,17 +397,17 @@ Choose how to handle failures: ┌──────────────────────────────────────────────────────────────────┐ │ Agent Control Server │ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ -│ │ Controls │ │ Policies │ │ Plugins │ │ Agents │ │ +│ │ Controls │ │ Policies │ │ Evaluators │ │ Agents │ │ │ │ API │ │ API │ │ Registry │ │ API │ │ │ └────────────┘ └────────────┘ └────────────┘ └────────────┘ │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ Plugin Ecosystem │ +│ Evaluator Ecosystem │ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │ │ Regex │ │ List │ │ Luna-2 │ │ Custom │ │ -│ │ Evaluator │ │ Evaluator │ │ Plugin │ │ Plugins │ │ +│ │ Evaluator │ │ Evaluator │ │ Evaluator │ │ Evaluators │ │ │ └────────────┘ └────────────┘ └────────────┘ └────────────┘ │ └──────────────────────────────────────────────────────────────────┘ ``` @@ -419,45 +419,45 @@ Choose how to handle failures: | **SDK** | Python client library with `@control()` decorator | | **Server** | FastAPI service that stores and evaluates controls | | **Engine** | Core evaluation logic (can run locally or server-side) | -| **Plugins** | Extensible evaluators for different detection methods | +| **Evaluators** | Extensible evaluators for different detection methods | | **Models** | Shared Pydantic models for type-safe communication | --- -## Creating Custom Plugins +## Creating Custom Evaluators -Partners and developers can create custom plugins to extend Agent Control with their own detection capabilities. +Partners and developers can create custom evaluators to extend Agent Control with their own detection capabilities. 
-### Plugin Interface +### Evaluator Interface -Every plugin implements the `PluginEvaluator` base class: +Every evaluator implements the `Evaluator` base class: ```python from typing import Any from pydantic import BaseModel -from agent_control_models import EvaluatorResult, PluginEvaluator, PluginMetadata, register_plugin +from agent_control_models import EvaluatorResult, Evaluator, EvaluatorMetadata, register_evaluator -class MyPluginConfig(BaseModel): - """Configuration schema for your plugin.""" +class MyEvaluatorConfig(BaseModel): + """Configuration schema for your evaluator.""" threshold: float = 0.5 custom_option: str = "default" -@register_plugin -class MyCustomPlugin(PluginEvaluator[MyPluginConfig]): - """Your custom evaluator plugin.""" - - metadata = PluginMetadata( - name="my-custom-plugin", +@register_evaluator +class MyCustomEvaluator(Evaluator[MyEvaluatorConfig]): + """Your custom evaluator.""" + + metadata = EvaluatorMetadata( + name="my-custom-evaluator", version="1.0.0", description="Detects custom patterns using proprietary logic", requires_api_key=True, # Set to True if you need credentials timeout_ms=5000, ) - config_model = MyPluginConfig + config_model = MyEvaluatorConfig - def __init__(self, config: MyPluginConfig) -> None: + def __init__(self, config: MyEvaluatorConfig) -> None: """Initialize with validated configuration.""" super().__init__(config) # Set up any clients, load models, etc. @@ -495,28 +495,28 @@ class MyCustomPlugin(PluginEvaluator[MyPluginConfig]): return 0.0 ``` -### Plugin Registration +### Evaluator Registration -Plugins are discovered automatically via Python entry points. To make your plugin available: +Evaluators are discovered automatically via Python entry points. To make your evaluator available: -1. **Create a Python package** with your plugin class decorated with `@register_plugin` +1. **Create a Python package** with your evaluator class decorated with `@register_evaluator` 2. 
**Register as an entry point** in your `pyproject.toml`: ```toml - [project.entry-points."agent_control.plugins"] - my-plugin = "my_package.plugin:MyPlugin" + [project.entry-points."agent_control.evaluators"] + my-evaluator = "my_package.evaluator:MyEvaluator" ``` 3. **Install it** in the Agent Control environment ```bash -# Install your plugin -pip install my-custom-plugin +# Install your evaluator +pip install my-custom-evaluator -# It's now available for use in controls +# It's now available ``` ### Optional Dependencies -If your plugin has optional dependencies, override `is_available()`: +If your evaluator has optional dependencies, override `is_available()`: ```python try: @@ -525,21 +525,21 @@ try: except ImportError: AVAILABLE = False -@register_plugin -class MyPlugin(PluginEvaluator[MyConfig]): +@register_evaluator +class MyEvaluator(Evaluator[MyConfig]): @classmethod def is_available(cls) -> bool: return AVAILABLE ``` -When `is_available()` returns `False`, the plugin is silently skipped during registration. +When `is_available()` returns `False`, the evaluator is silently skipped during registration. 
-### Plugin Best Practices +### Evaluator Best Practices | Practice | Why | |----------|-----| | **Use Pydantic for config** | Automatic validation and documentation | -| **Implement timeouts** | Prevent slow plugins from blocking agents | +| **Implement timeouts** | Prevent slow evaluators from blocking agents | | **Return confidence scores** | Enable threshold-based filtering | | **Include metadata** | Helps with debugging and observability | | **Handle errors gracefully** | Respect the `on_error` configuration | @@ -550,11 +550,11 @@ When `is_available()` returns `False`, the plugin is silently skipped during reg Here's how a partner might integrate their content moderation API: ```python -@register_plugin -class ContentModerationPlugin(PluginEvaluator[ContentModConfig]): +@register_evaluator +class ContentModerationEvaluator(Evaluator[ContentModConfig]): """Integration with Acme Content Moderation API.""" - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="acme-content-mod", version="1.0.0", description="Acme Inc. content moderation", @@ -641,7 +641,7 @@ docker-compose up ## Roadmap - [ ] Web UI for control management -- [ ] More built-in plugins (OpenAI Moderation, Perspective API, etc.) +- [ ] More built-in evaluators (OpenAI Moderation, Perspective API, etc.) - [ ] Metrics and analytics dashboard - [ ] Multi-language SDK support (TypeScript, Go) - [ ] Webhook notifications for violations @@ -652,11 +652,11 @@ docker-compose up We welcome contributions! See [CONTRIBUTING.md](../CONTRIBUTING.md) for guidelines. -### Adding a Plugin +### Adding an Evaluator 1. Fork the repository -2. Create your plugin in `plugins/src/agent_control_plugins/` -3. Add tests in `plugins/tests/` +2. Create your evaluator in `evaluators/src/agent_control_evaluators/` +3. Add tests in `evaluators/tests/` 4. 
Submit a pull request --- diff --git a/docs/REFERENCE.md b/docs/REFERENCE.md index 7afd474c..486829d9 100644 --- a/docs/REFERENCE.md +++ b/docs/REFERENCE.md @@ -57,7 +57,7 @@ Example: *"If the output contains an SSN pattern, block the response."* "scope": { "step_types": ["llm_inference"], "stages": ["post"] }, "selector": { "path": "output" }, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { "pattern": "\\b\\d{3}-\\d{2}-\\d{4}\\b" } }, "action": { "decision": "deny" } @@ -141,11 +141,11 @@ Agent Control is built as a monorepo with these components: ``` ┌──────────────────────────────────────────────────────────────────┐ -│ Your Application │ +│ Your Application │ │ ┌────────────────────────────────────────────────────────────┐ │ -│ │ @control() decorator │ │ -│ │ │ │ │ -│ │ ▼ │ │ +│ │ @control() decorator │ │ +│ │ │ │ │ +│ │ ▼ │ │ │ │ ┌──────────┐ ┌─────────────────┐ ┌──────────────┐ │ │ │ │ │ Input │───▶│ Agent Control │───▶│ Output │ │ │ │ │ │ │ │ Engine │ │ │ │ │ @@ -155,19 +155,19 @@ Agent Control is built as a monorepo with these components: │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ Agent Control Server │ +│ Agent Control Server │ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ -│ │ Controls │ │ Policies │ │ Plugins │ │ Agents │ │ +│ │ Controls │ │ Policies │ │ Evaluators │ │ Agents │ │ │ │ API │ │ API │ │ Registry │ │ API │ │ │ └────────────┘ └────────────┘ └────────────┘ └────────────┘ │ └──────────────────────────────────────────────────────────────────┘ │ ▼ ┌──────────────────────────────────────────────────────────────────┐ -│ Plugin Ecosystem │ +│ Evaluator Ecosystem │ │ ┌────────────┐ ┌────────────┐ ┌────────────┐ ┌────────────┐ │ │ │ Regex │ │ List │ │ Luna-2 │ │ Custom │ │ -│ │ Evaluator │ │ Evaluator │ │ Plugin │ │ Plugins │ │ +│ │ Evaluator │ │ Evaluator │ │ Evaluator │ │ Evaluators │ │ │ └────────────┘ └────────────┘ └────────────┘ └────────────┘ │ 
└──────────────────────────────────────────────────────────────────┘ ``` @@ -179,7 +179,7 @@ Agent Control is built as a monorepo with these components: | SDK | `agent-control` | Python client library with `@control()` decorator | | Server | `agent-control-server` | FastAPI service that stores and evaluates controls | | Engine | `agent-control-engine` | Core evaluation logic (can run locally or server-side) | -| Plugins | `agent-control-plugins` | Extensible evaluators for different detection methods | +| Evaluators | `agent-control-evaluators` | Extensible evaluators for different detection methods | | Models | `agent-control-models` | Shared Pydantic models for type-safe communication | | UI | `ui/` | Next.js web dashboard for control management | @@ -202,7 +202,7 @@ Agent Control includes powerful evaluators out of the box. Pattern matching using Google RE2 (safe from ReDoS attacks). -**Plugin name**: `regex` +**Evaluator name**: `regex` **Configuration**: @@ -216,7 +216,7 @@ Pattern matching using Google RE2 (safe from ReDoS attacks). ```json // Block Social Security Numbers { - "plugin": "regex", + "name": "regex", "config": { "pattern": "\\b\\d{3}-\\d{2}-\\d{4}\\b" } @@ -224,7 +224,7 @@ Pattern matching using Google RE2 (safe from ReDoS attacks). // Block credit card numbers (case-insensitive) { - "plugin": "regex", + "name": "regex", "config": { "pattern": "card.*\\d{4}[- ]?\\d{4}[- ]?\\d{4}[- ]?\\d{4}", "flags": ["IGNORECASE"] @@ -233,7 +233,7 @@ Pattern matching using Google RE2 (safe from ReDoS attacks). // Block AWS access keys { - "plugin": "regex", + "name": "regex", "config": { "pattern": "AKIA[0-9A-Z]{16}" } @@ -248,7 +248,7 @@ Pattern matching using Google RE2 (safe from ReDoS attacks). Flexible value matching with multiple modes and logic options. -**Plugin name**: `list` +**Evaluator name**: `list` **Configuration**: @@ -267,7 +267,7 @@ Flexible value matching with multiple modes and logic options. 
```json // Block admin/root keywords { - "plugin": "list", + "name": "list", "config": { "values": ["admin", "root", "sudo", "superuser"], "logic": "any", @@ -278,7 +278,7 @@ Flexible value matching with multiple modes and logic options. // Require approval keyword (trigger if NOT found) { - "plugin": "list", + "name": "list", "config": { "values": ["APPROVED", "VERIFIED"], "match_on": "no_match", @@ -288,7 +288,7 @@ Flexible value matching with multiple modes and logic options. // Allowlist: only permit specific tools { - "plugin": "list", + "name": "list", "config": { "values": ["search", "calculate", "lookup"], "match_on": "no_match" @@ -300,16 +300,16 @@ Flexible value matching with multiple modes and logic options. --- -### Luna-2 Plugin +### Luna-2 Evaluator AI-powered detection using Galileo's Luna-2 small language models. Provides real-time, low-latency evaluation for complex patterns that can't be caught with regex or lists. -**Plugin name**: `galileo-luna2` +**Evaluator name**: `galileo-luna2` **Installation**: Luna-2 requires an optional dependency: ```bash -pip install agent-control-plugins[luna2] +pip install agent-control-evaluators[luna2] ``` **Requirements**: Set `GALILEO_API_KEY` environment variable where evaluations run (on the server for server-side controls, or in the client environment for local controls). 
@@ -348,7 +348,7 @@ pip install agent-control-plugins[luna2] ```json // Block toxic inputs (score > 0.5) { - "plugin": "galileo-luna2", + "name": "galileo-luna2", "config": { "metric": "input_toxicity", "operator": "gt", @@ -359,7 +359,7 @@ pip install agent-control-plugins[luna2] // Block prompt injection attempts { - "plugin": "galileo-luna2", + "name": "galileo-luna2", "config": { "metric": "prompt_injection", "operator": "gt", @@ -370,7 +370,7 @@ pip install agent-control-plugins[luna2] // Flag potential hallucinations (warn but allow) { - "plugin": "galileo-luna2", + "name": "galileo-luna2", "config": { "metric": "hallucination", "operator": "gt", @@ -380,7 +380,7 @@ pip install agent-control-plugins[luna2] // Using central stage (pre-defined in Galileo) { - "plugin": "galileo-luna2", + "name": "galileo-luna2", "config": { "stage_type": "central", "stage_name": "production-safety", @@ -393,43 +393,43 @@ pip install agent-control-plugins[luna2] --- -### Custom Plugins +### Custom Evaluators -You can create custom plugins to extend Agent Control with your own detection capabilities. +You can create custom evaluators to extend Agent Control with your own detection capabilities. 
-**Plugin Interface**: +**Evaluator Interface**: ```python from typing import Any from pydantic import BaseModel from agent_control_models import ( EvaluatorResult, - PluginEvaluator, - PluginMetadata, - register_plugin, + Evaluator, + EvaluatorMetadata, + register_evaluator, ) -class MyPluginConfig(BaseModel): - """Configuration schema for your plugin.""" +class MyEvaluatorConfig(BaseModel): + """Configuration schema for your evaluator.""" threshold: float = 0.5 custom_option: str = "default" -@register_plugin -class MyCustomPlugin(PluginEvaluator[MyPluginConfig]): - """Your custom evaluator plugin.""" +@register_evaluator +class MyCustomEvaluator(Evaluator[MyEvaluatorConfig]): + """Your custom evaluator.""" - metadata = PluginMetadata( - name="my-custom-plugin", + metadata = EvaluatorMetadata( + name="my-custom-evaluator", version="1.0.0", description="Detects custom patterns using proprietary logic", requires_api_key=True, timeout_ms=5000, ) - config_model = MyPluginConfig + config_model = MyEvaluatorConfig - def __init__(self, config: MyPluginConfig) -> None: + def __init__(self, config: MyEvaluatorConfig) -> None: super().__init__(config) # Set up clients, load models, etc. @@ -454,18 +454,18 @@ class MyCustomPlugin(PluginEvaluator[MyPluginConfig]): ) ``` -**Registration**: Plugins are discovered via Python entry points. Add to your `pyproject.toml`: +**Registration**: Evaluators are discovered via Python entry points. 
Add to your `pyproject.toml`: ```toml -[project.entry-points."agent_control.plugins"] -my-plugin = "my_package.plugin:MyCustomPlugin" +[project.entry-points."agent_control.evaluators"] +my-evaluator = "my_package.evaluator:MyCustomEvaluator" ``` -**Optional Dependencies**: Override `is_available()` if your plugin has optional dependencies: +**Optional Dependencies**: Override `is_available()` if your evaluator has optional dependencies: ```python -@register_plugin -class MyPlugin(PluginEvaluator[MyConfig]): +@register_evaluator +class MyEvaluator(Evaluator[MyConfig]): @classmethod def is_available(cls) -> bool: try: @@ -475,14 +475,14 @@ class MyPlugin(PluginEvaluator[MyConfig]): return False ``` -When `is_available()` returns `False`, the plugin is silently skipped during registration. +When `is_available()` returns `False`, the evaluator is silently skipped during registration. **Best Practices**: | Practice | Why | |----------|-----| | Use Pydantic for config | Automatic validation and documentation | -| Implement timeouts | Prevent slow plugins from blocking agents | +| Implement timeouts | Prevent slow evaluators from blocking agents | | Return confidence scores | Enable threshold-based filtering | | Include metadata | Helps with debugging and observability | | Handle errors gracefully | Respect the `on_error` configuration | @@ -581,7 +581,7 @@ async with AgentControlClient() as client: "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": r"\d{3}-\d{2}-\d{4}"} }, "action": {"decision": "deny"} @@ -603,7 +603,7 @@ async with AgentControlClient() as client: "path": "input.path" }, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": r"^/(etc|var|usr|root)/"} }, "action": {"decision": "deny"} @@ -791,11 +791,11 @@ Agent Control supports multiple API keys for zero-downtime rotation: | `AGENT_CONTROL_API_KEYS` | — | Valid 
API keys (comma-separated) | | `AGENT_CONTROL_ADMIN_API_KEYS` | — | Admin API keys (comma-separated) | -**Plugins**: +**Evaluators**: | Variable | Default | Description | |----------|---------|-------------| -| `GALILEO_API_KEY` | — | API key for Luna-2 plugin | +| `GALILEO_API_KEY` | — | API key for Luna-2 evaluator | ### SDK Environment Variables @@ -877,14 +877,14 @@ make alembic-upgrade 4. Verify the selector path matches your data structure 5. Test the evaluator pattern/values independently -### Luna-2 Plugin Errors +### Luna-2 Evaluator Errors -1. Ensure `httpx` is installed: `pip install agent-control-plugins[luna2]` +1. Ensure `httpx` is installed: `pip install agent-control-evaluators[luna2]` 2. Ensure `GALILEO_API_KEY` is set 3. Check network connectivity to Galileo API 4. Verify the metric name is valid 5. Check `on_error` setting if failures are silently allowed -**Plugin Not Found**: If `galileo-luna2` doesn't appear in `list_plugins()`: +**Evaluator Not Found**: If `galileo-luna2` doesn't appear in `list_evaluators()`: - Verify `httpx` is installed (Luna-2's `is_available()` returns `False` without it) -- Check server logs for plugin discovery messages +- Check server logs for evaluator discovery messages diff --git a/docs/plugins/json.md b/docs/evaluators/json.md similarity index 96% rename from docs/plugins/json.md rename to docs/evaluators/json.md index 444f173f..010dccff 100644 --- a/docs/plugins/json.md +++ b/docs/evaluators/json.md @@ -1,18 +1,18 @@ -# JSON Plugin Quickstart Guide +# JSON Evaluator Quickstart Guide A practical guide for configuring JSON validation controls for LLM outputs and tool steps. --- -## What is the JSON Plugin? +## What is the JSON Evaluator? -The JSON Validator Plugin validates JSON data from LLM responses and tool steps before they're used or executed. It acts as a quality and safety layer, ensuring structured outputs meet your requirements, preventing malformed data, and enforcing business rules. 
+The JSON Validator Evaluator validates JSON data from LLM responses and tool steps before they're used or executed. It acts as a quality and safety layer, ensuring structured outputs meet your requirements, preventing malformed data, and enforcing business rules. **Technical Foundation**: Uses [jsonschema](https://python-jsonschema.readthedocs.io/) for JSON Schema validation (Draft 7+) with custom field-level validators for simpler checks. > **💡 JSON Schema vs Field-Level Validation** > -> **JSON Schema can handle ALL the validation checks** this plugin provides: +> **JSON Schema can handle ALL the validation checks** this evaluator provides: > - Required fields → `"required": ["field1"]` > - Type checking → `"type": "string"` > - Numeric ranges → `"minimum": 0, "maximum": 100` @@ -435,7 +435,7 @@ One powerful use case for the JSON Validator is enabling **LLM retry loops** whe ### How It Works 1. **LLM generates response** → JSON output -2. **Plugin validates** → Returns error if validation fails +2. **Evaluator validates** → Returns error if validation fails 3. **Error fed back to LLM** → Clear error message explains what's wrong 4. **LLM retries** → Generates corrected output 5. **Repeat** → Until validation passes or max retries reached @@ -720,14 +720,14 @@ Checks execute in fixed order (cannot be changed). 
## See Also -- **regex plugin** - Simple pattern matching without JSON structure validation -- **list plugin** - Check if values are in/not in a list +- **regex evaluator** - Simple pattern matching without JSON structure validation +- **list evaluator** - Check if values are in/not in a list - [JSON Schema specification](https://json-schema.org/draft-07/schema) - [RE2 syntax](https://github.com/google/re2/wiki/Syntax) --- -**Plugin Version:** 1.0.0 +**Evaluator Version:** 1.0.0 **Timeout:** 15 seconds (default) **Thread Safe:** Yes **ReDoS Safe:** Yes (uses RE2) diff --git a/docs/plugins/sql.md b/docs/evaluators/sql.md similarity index 91% rename from docs/plugins/sql.md rename to docs/evaluators/sql.md index aad8605e..5ca18ae3 100644 --- a/docs/plugins/sql.md +++ b/docs/evaluators/sql.md @@ -1,12 +1,12 @@ -# SQL Plugin Quickstart Guide +# SQL Evaluator Quickstart Guide A practical guide for configuring SQL validation controls in your AI agent. --- -## What is the SQL Plugin? +## What is the SQL Evaluator? -The SQL Plugin validates SQL query strings (e.g., from LLM responses) before they execute against your database. It acts as a security and safety layer, preventing dangerous operations, enforcing access policies, and ensuring data isolation. +The SQL Evaluator validates SQL query strings (e.g., from LLM responses) before they execute against your database. It acts as a security and safety layer, preventing dangerous operations, enforcing access policies, and ensuring data isolation. **Technical Foundation**: Uses [sqlglot](https://github.com/tobymao/sqlglot) with the Rust-accelerated parser (`sqlglot[rs]`) for high-performance SQL parsing and AST-based validation. @@ -36,9 +36,9 @@ The SQL Plugin validates SQL query strings (e.g., from LLM responses) before the > **⚠️ Important Security Note** > -> This plugin validates query structure and enforces access rules, but **it is not a complete defense against SQL injection**. 
The primary defense against SQL injection is using **prepared statements** (parameterized queries) at the database layer. This plugin provides an additional security layer by validating query syntax and enforcing policies, but should not be relied upon as the sole protection mechanism. +> This evaluator validates query structure and enforces access rules, but **it is not a complete defense against SQL injection**. The primary defense against SQL injection is using **prepared statements** (parameterized queries) at the database layer. This evaluator provides an additional security layer by validating query syntax and enforcing policies, but should not be relied upon as the sole protection mechanism. > -> **Best Practice**: Always use prepared statements/parameterized queries when executing SQL from untrusted sources. This plugin complements, but does not replace, proper database security practices. +> **Best Practice**: Always use prepared statements/parameterized queries when executing SQL from untrusted sources. This evaluator complements, but does not replace, proper database security practices. **Key Benefits:** - **Security**: Block dangerous operations (DROP, DELETE, TRUNCATE) @@ -65,7 +65,7 @@ The SQL Plugin validates SQL query strings (e.g., from LLM responses) before the ## Configuration Options -The SQL Plugin validates queries in this order: +The SQL Evaluator validates queries in this order: 1. **Syntax** - SQL must be parseable (invalid SQL returns error, not block) 2. **Multi-Statement** - Control multi-statement queries @@ -75,7 +75,7 @@ The SQL Plugin validates queries in this order: 6. **Limits** - Enforce LIMIT clauses and max values 7. **Query Complexity** - Limit subquery depth, JOINs, and set operations -> **Note**: When SQL cannot be parsed, the plugin returns `matched=True` (validation fails). See [Error Handling Behavior](#error-handling-behavior) for details. 
+> **Note**: When SQL cannot be parsed, the evaluator returns `matched=True` (validation fails). See [Error Handling Behavior](#error-handling-behavior) for details. > **🔒 Security**: All checks recursively validate subqueries, CTEs, and nested SELECT statements to prevent security bypasses. @@ -154,7 +154,7 @@ The SQL Plugin validates queries in this order: > **💡 Complete List of Supported Operations** > -> The plugin recognizes and can control the following SQL operations: +> The evaluator recognizes and can control the following SQL operations: > > **DML (Data Manipulation)**: > - `SELECT`, `INSERT`, `UPDATE`, `DELETE`, `MERGE` @@ -320,7 +320,7 @@ The SQL Plugin validates queries in this order: > **💡 Deep Pagination Protection**: `max_result_window` limits the sum of LIMIT + OFFSET to prevent expensive queries. Similar to Elasticsearch's `index.max_result_window`, this stops attackers from using large OFFSET values like `LIMIT 10 OFFSET 1000000` which can cause severe database performance degradation. -> **⚠️ Indeterminate LIMIT Values**: The plugin cannot determine LIMIT values for `LIMIT ALL`, `LIMIT (SELECT ...)`, or parameter placeholders (`LIMIT $1`, `LIMIT ?`). See the "Fail-Safe Behavior" section below for how these cases are handled. +> **⚠️ Indeterminate LIMIT Values**: The evaluator cannot determine LIMIT values for `LIMIT ALL`, `LIMIT (SELECT ...)`, or parameter placeholders (`LIMIT $1`, `LIMIT ?`). See the "Fail-Safe Behavior" section below for how these cases are handled. --- @@ -491,15 +491,15 @@ Quick reference of all configuration options: ## Error Handling Behavior -The plugin handles edge cases as follows: +The evaluator handles edge cases as follows: -> **Note**: The `error` field is only set for plugin errors (crashes, timeouts, missing dependencies), not for validation failures. Invalid SQL is a validation failure, not a plugin error. 
+> **Note**: The `error` field is only set for evaluator errors (crashes, timeouts, missing dependencies), not for validation failures. Invalid SQL is a validation failure, not an evaluator error. ### Parse Failures When SQL cannot be parsed (malformed SQL, unsupported syntax): - Returns `matched=True` - invalid SQL fails validation -- No `error` field - this is a validation result, not a plugin error +- No `error` field - this is a validation result, not an evaluator error - The `message` field contains the parse error details ```sql @@ -532,7 +532,7 @@ When SQL parses but operation type cannot be determined: ### Confidence Levels -The plugin returns a `confidence` score with each validation result: +The evaluator returns a `confidence` score with each validation result: - **`confidence=1.0`**: Definite result - Used for all validation outcomes (pass or fail) @@ -611,8 +611,8 @@ Validate with representative queries before deploying to production. Ensure your ✅ **Monitor blocked queries** Review what's being blocked to tune your rules. Too restrictive = agent can't work; too permissive = security gaps. Use the `query_hash` field from metadata to track queries without exposing SQL content in logs. -✅ **Check the error field for plugin failures** -The `error` field is only set for plugin errors (crashes, timeouts), not validation failures. If `error` is set, the plugin couldn't complete evaluation. +✅ **Check the error field for evaluator failures** +The `error` field is only set for evaluator errors (crashes, timeouts), not validation failures. If `error` is set, the evaluator couldn't complete evaluation. ✅ **Understand validation order** Options are evaluated sequentially: @@ -621,7 +621,7 @@ Options are evaluated sequentially: Earlier checks (like multi-statement) happen before later ones (like complexity). A query blocked by operations won't reach limit or complexity checks. 
✅ **Invalid SQL is blocked** -Unparseable SQL returns `matched=True` - invalid SQL fails validation. This is a validation result, not a plugin error. +Unparseable SQL returns `matched=True` - invalid SQL fails validation. This is a validation result, not an evaluator error. ✅ **Understand allow/block interaction** - For tables/schemas: Use `allowed_*` **OR** `blocked_*`, not both (mutually exclusive) diff --git a/docs/observability.md b/docs/observability.md index 1c51fc35..40ae0b76 100644 --- a/docs/observability.md +++ b/docs/observability.md @@ -450,7 +450,7 @@ curl -X POST "http://localhost:8000/api/v1/observability/events/query" \ "confidence": 1.0, "timestamp": "2026-01-20T16:04:59.004038Z", "execution_duration_ms": null, - "evaluator_plugin": "regex", + "evaluator": "regex", "selector_path": null, "error_message": null, "metadata": { diff --git a/engine/README.md b/engine/README.md index bdeea08f..8a232f21 100644 --- a/engine/README.md +++ b/engine/README.md @@ -4,40 +4,40 @@ Core evaluation logic for Agent Control. 
## Responsibilities -- **Plugin Discovery**: Auto-discover plugins via Python entry points +- **Evaluator Discovery**: Auto-discover evaluators via Python entry points - **Selector Evaluation**: Extract data from payloads using selector paths -- **Evaluator Execution**: Run plugin evaluators against selected data -- **Caching**: Cache plugin instances for performance +- **Evaluator Execution**: Run evaluators against selected data +- **Caching**: Cache evaluator instances for performance -## Plugin Discovery +## Evaluator Discovery -The engine provides the public API for plugin discovery: +The engine provides the public API for evaluator discovery: ```python -from agent_control_engine import discover_plugins, list_plugins +from agent_control_engine import discover_evaluators, list_evaluators -# Discover all plugins (runs once, safe to call multiple times) -discover_plugins() +# Discover all evaluators (runs once, safe to call multiple times) +discover_evaluators() -# Get all available plugins -plugins = list_plugins() # Returns dict[str, PluginClass] +# Get all available evaluators +evaluators = list_evaluators() # Returns dict[str, EvaluatorClass] -# Access a specific plugin -regex_plugin = plugins.get("regex") +# Access a specific evaluator +regex_evaluator = evaluators.get("regex") ``` -Plugins are discovered via the `agent_control.plugins` entry point group. Discovery: +Evaluators are discovered via the `agent_control.evaluators` entry point group. Discovery: 1. Scans all installed packages for the entry point -2. Loads each plugin class +2. Loads each evaluator class 3. Checks `is_available()` to verify dependencies -4. Registers available plugins +4. 
Registers available evaluators ## Key Functions | Function | Description | |----------|-------------| -| `discover_plugins()` | Scan entry points and register plugins | -| `list_plugins()` | Get all registered plugins (triggers discovery) | -| `ensure_plugins_discovered()` | Ensure discovery has run | -| `get_evaluator(config)` | Get cached evaluator instance | +| `discover_evaluators()` | Scan entry points and register evaluators | +| `list_evaluators()` | Get all registered evaluators (triggers discovery) | +| `ensure_evaluators_discovered()` | Ensure discovery has run | +| `get_evaluator_instance(config)` | Get cached evaluator instance | | `evaluate_control(control, payload)` | Evaluate a single control | diff --git a/engine/pyproject.toml b/engine/pyproject.toml index 4a7440d7..d585ddd1 100644 --- a/engine/pyproject.toml +++ b/engine/pyproject.toml @@ -5,7 +5,7 @@ description = "Control execution engine for Agent Control" requires-python = ">=3.12" dependencies = [ "agent-control-models>=0.1.0", - "agent-control-plugins>=0.1.0", + "agent-control-evaluators>=0.1.0", "google-re2>=1.1", ] authors = [ @@ -31,4 +31,4 @@ packages = ["src/agent_control_engine"] [tool.uv.sources] agent-control-models = { workspace = true } -agent-control-plugins = { workspace = true } +agent-control-evaluators = { workspace = true } diff --git a/engine/src/agent_control_engine/__init__.py b/engine/src/agent_control_engine/__init__.py index 2c298111..08b8b9b2 100644 --- a/engine/src/agent_control_engine/__init__.py +++ b/engine/src/agent_control_engine/__init__.py @@ -1,17 +1,17 @@ -"""Agent Control Engine - Rule execution logic and plugin system.""" +"""Agent Control Engine - Rule execution logic and evaluator system.""" from .discovery import ( - discover_plugins, - ensure_plugins_discovered, - list_plugins, - reset_discovery, + discover_evaluators, + ensure_evaluators_discovered, + list_evaluators, + reset_evaluator_discovery, ) __version__ = "0.1.0" __all__ = [ - 
"discover_plugins", - "ensure_plugins_discovered", - "list_plugins", - "reset_discovery", + "discover_evaluators", + "ensure_evaluators_discovered", + "list_evaluators", + "reset_evaluator_discovery", ] diff --git a/engine/src/agent_control_engine/core.py b/engine/src/agent_control_engine/core.py index e3e4a811..94f13c20 100644 --- a/engine/src/agent_control_engine/core.py +++ b/engine/src/agent_control_engine/core.py @@ -20,7 +20,7 @@ EvaluatorResult, ) -from .evaluators import get_evaluator +from .evaluators import get_evaluator_instance from .selectors import select_data logger = logging.getLogger(__name__) @@ -159,8 +159,8 @@ async def evaluate_control(eval_task: _EvalTask) -> None: """Evaluate a single control, respecting cancellation and timeout.""" async with semaphore: try: - evaluator = get_evaluator(eval_task.item.control.evaluator) - # Use plugin's timeout or fall back to default + evaluator = get_evaluator_instance(eval_task.item.control.evaluator) + # Use evaluator's timeout or fall back to default timeout = evaluator.get_timeout_seconds() if timeout <= 0: timeout = DEFAULT_EVALUATOR_TIMEOUT @@ -184,7 +184,7 @@ async def evaluate_control(eval_task: _EvalTask) -> None: error_msg = f"TimeoutError: Evaluator exceeded {timeout}s timeout" logger.warning( f"Evaluator timeout for control '{eval_task.item.name}' " - f"(plugin: {eval_task.item.control.evaluator.plugin}): {error_msg}" + f"(evaluator: {eval_task.item.control.evaluator.name}): {error_msg}" ) eval_task.result = EvaluatorResult( matched=False, @@ -198,7 +198,7 @@ async def evaluate_control(eval_task: _EvalTask) -> None: error_msg = f"{type(e).__name__}: {e}" logger.warning( f"Evaluator error for control '{eval_task.item.name}' " - f"(plugin: {eval_task.item.control.evaluator.plugin}): {error_msg}" + f"(evaluator: {eval_task.item.control.evaluator.name}): {error_msg}" ) eval_task.result = EvaluatorResult( matched=False, diff --git a/engine/src/agent_control_engine/discovery.py 
b/engine/src/agent_control_engine/discovery.py index 67cb3f7c..dd4563f4 100644 --- a/engine/src/agent_control_engine/discovery.py +++ b/engine/src/agent_control_engine/discovery.py @@ -1,4 +1,4 @@ -"""Plugin discovery via entry points.""" +"""Evaluator discovery via entry points.""" from __future__ import annotations @@ -8,10 +8,10 @@ from typing import Any from agent_control_models import ( - PluginEvaluator, - get_all_plugins, - get_plugin, - register_plugin, + Evaluator, + get_all_evaluators, + get_evaluator, + register_evaluator, ) logger = logging.getLogger(__name__) @@ -20,18 +20,18 @@ _DISCOVERY_LOCK = threading.Lock() -def discover_plugins() -> int: - """Discover and register plugins via entry points. +def discover_evaluators() -> int: + """Discover and register evaluators via entry points. - All plugins (built-in and third-party) are discovered via the - 'agent_control.plugins' entry point group. Plugins are only registered + All evaluators (built-in and third-party) are discovered via the + 'agent_control.evaluators' entry point group. Evaluators are only registered if their `is_available()` method returns True. Safe to call multiple times - only runs discovery once. Thread-safe via lock. Returns: - Number of plugins discovered + Number of evaluators discovered """ global _DISCOVERY_COMPLETE @@ -46,44 +46,44 @@ def discover_plugins() -> int: discovered = 0 - # Discover ALL plugins (built-in and third-party) via entry points. - # Only register plugins where is_available() returns True. + # Discover ALL evaluators (built-in and third-party) via entry points. + # Only register evaluators where is_available() returns True. 
try: - eps = entry_points(group="agent_control.plugins") + eps = entry_points(group="agent_control.evaluators") for ep in eps: try: - plugin_class = ep.load() - name = plugin_class.metadata.name + evaluator_class = ep.load() + name = evaluator_class.metadata.name # Skip if already registered - if get_plugin(name) is not None: + if get_evaluator(name) is not None: continue - # Check if plugin dependencies are satisfied - if not plugin_class.is_available(): - logger.debug(f"Plugin '{name}' not available, skipping") + # Check if evaluator dependencies are satisfied + if not evaluator_class.is_available(): + logger.debug(f"Evaluator '{name}' not available, skipping") continue - register_plugin(plugin_class) - logger.debug(f"Registered plugin: {name}") + register_evaluator(evaluator_class) + logger.debug(f"Registered evaluator: {name}") discovered += 1 except Exception as e: - logger.warning(f"Failed to load plugin '{ep.name}': {e}") + logger.warning(f"Failed to load evaluator '{ep.name}': {e}") except Exception as e: logger.debug(f"Entry point discovery not available: {e}") _DISCOVERY_COMPLETE = True - logger.debug(f"Plugin discovery complete: {discovered} new plugins") + logger.debug(f"Evaluator discovery complete: {discovered} new evaluators") return discovered -def ensure_plugins_discovered() -> None: - """Ensure plugin discovery has run. Call this before using plugins.""" +def ensure_evaluators_discovered() -> None: + """Ensure evaluator discovery has run. Call this before using evaluators.""" if not _DISCOVERY_COMPLETE: - discover_plugins() + discover_evaluators() -def reset_discovery() -> None: +def reset_evaluator_discovery() -> None: """Reset discovery state. 
Useful for testing.""" global _DISCOVERY_COMPLETE with _DISCOVERY_LOCK: @@ -91,17 +91,17 @@ def reset_discovery() -> None: # ============================================================================= -# Public plugin API +# Public evaluator API # ============================================================================= -def list_plugins() -> dict[str, type[PluginEvaluator[Any]]]: - """List all registered plugins. +def list_evaluators() -> dict[str, type[Evaluator[Any]]]: + """List all registered evaluators. - This function ensures plugin discovery has run before returning results. + This function ensures evaluator discovery has run before returning results. Returns: - Dictionary mapping plugin names to plugin classes + Dictionary mapping evaluator names to evaluator classes """ - ensure_plugins_discovered() - return get_all_plugins() + ensure_evaluators_discovered() + return get_all_evaluators() diff --git a/engine/src/agent_control_engine/evaluators.py b/engine/src/agent_control_engine/evaluators.py index 022efe77..c9c43717 100644 --- a/engine/src/agent_control_engine/evaluators.py +++ b/engine/src/agent_control_engine/evaluators.py @@ -1,4 +1,4 @@ -"""Unified evaluator factory using plugin registry with caching.""" +"""Unified evaluator factory using evaluator registry with caching.""" import json import logging @@ -6,9 +6,9 @@ from collections import OrderedDict from typing import Any -from agent_control_models import EvaluatorConfig, PluginEvaluator +from agent_control_models import Evaluator, EvaluatorConfig -from .discovery import list_plugins +from .discovery import list_evaluators logger = logging.getLogger(__name__) @@ -33,8 +33,8 @@ def _parse_cache_size() -> int: EVALUATOR_CACHE_SIZE = max(_parse_cache_size(), MIN_CACHE_SIZE) -# LRU cache for evaluator instances: cache_key -> PluginEvaluator instance -_EVALUATOR_CACHE: OrderedDict[str, PluginEvaluator[Any]] = OrderedDict() +# LRU cache for evaluator instances: cache_key -> Evaluator instance 
+_EVALUATOR_CACHE: OrderedDict[str, Evaluator[Any]] = OrderedDict() def _config_hash(config: dict[str, Any]) -> str: @@ -42,48 +42,48 @@ def _config_hash(config: dict[str, Any]) -> str: return json.dumps(config, sort_keys=True, default=str) -def get_evaluator(evaluator_config: EvaluatorConfig) -> PluginEvaluator[Any]: +def get_evaluator_instance(evaluator_config: EvaluatorConfig) -> Evaluator[Any]: """Get or create a cached evaluator instance from configuration. Uses LRU caching to reuse evaluator instances with the same config. - Cache key is: {plugin_name}:{config_hash} + Cache key is: {evaluator_name}:{config_hash} - WARNING: Plugin instances are cached and reused across requests! - Plugin implementations MUST be stateless - do not store mutable - request-scoped state on the plugin instance. See PluginEvaluator + WARNING: Evaluator instances are cached and reused across requests! + Evaluator implementations MUST be stateless - do not store mutable + request-scoped state on the evaluator instance. See Evaluator docstring for details on safe patterns. 
Args: - evaluator_config: The evaluator configuration with plugin name and config + evaluator_config: The evaluator configuration with evaluator name and config Returns: - PluginEvaluator instance (cached or new) + Evaluator instance (cached or new) Raises: - ValueError: If plugin not found + ValueError: If evaluator not found """ # Build cache key - cache_key = f"{evaluator_config.plugin}:{_config_hash(evaluator_config.config)}" + cache_key = f"{evaluator_config.name}:{_config_hash(evaluator_config.config)}" # Check cache if cache_key in _EVALUATOR_CACHE: # Move to end (most recently used) _EVALUATOR_CACHE.move_to_end(cache_key) - logger.debug(f"Cache hit for evaluator: {evaluator_config.plugin}") + logger.debug(f"Cache hit for evaluator: {evaluator_config.name}") return _EVALUATOR_CACHE[cache_key] # Cache miss - create new instance - plugins = list_plugins() - plugin_cls = plugins.get(evaluator_config.plugin) + evaluators = list_evaluators() + evaluator_cls = evaluators.get(evaluator_config.name) - if plugin_cls is None: + if evaluator_cls is None: raise ValueError( - f"Plugin '{evaluator_config.plugin}' not found. " - f"Available plugins: {', '.join(plugins.keys())}" + f"Evaluator '{evaluator_config.name}' not found. " + f"Available evaluators: {', '.join(evaluators.keys())}" ) - logger.debug(f"Cache miss, creating evaluator: {evaluator_config.plugin}") - instance = plugin_cls.from_dict(evaluator_config.config) + logger.debug(f"Cache miss, creating evaluator: {evaluator_config.name}") + instance = evaluator_cls.from_dict(evaluator_config.config) # Evict oldest if cache is full while len(_EVALUATOR_CACHE) >= EVALUATOR_CACHE_SIZE: @@ -98,3 +98,5 @@ def get_evaluator(evaluator_config: EvaluatorConfig) -> PluginEvaluator[Any]: def clear_evaluator_cache() -> None: """Clear all cached evaluator instances. 
Useful for testing.""" _EVALUATOR_CACHE.clear() + + diff --git a/engine/tests/conftest.py b/engine/tests/conftest.py index 8592dc97..5cd71b1d 100644 --- a/engine/tests/conftest.py +++ b/engine/tests/conftest.py @@ -2,18 +2,18 @@ import pytest -from agent_control_engine.discovery import reset_discovery +from agent_control_engine.discovery import reset_evaluator_discovery from agent_control_engine.evaluators import clear_evaluator_cache -from agent_control_models import clear_plugins +from agent_control_models import clear_evaluators @pytest.fixture(autouse=True) -def clean_plugin_state() -> None: - """Clean up plugin registry and discovery state before each test. +def clean_evaluator_state() -> None: + """Clean up evaluator registry and discovery state before each test. This fixture runs automatically for all tests to ensure isolation. Tests that mock entry_points won't pollute the registry for other tests. """ - clear_plugins() - reset_discovery() + clear_evaluators() + reset_evaluator_discovery() clear_evaluator_cache() diff --git a/engine/tests/test_core.py b/engine/tests/test_core.py index b4066ed6..3f3356eb 100644 --- a/engine/tests/test_core.py +++ b/engine/tests/test_core.py @@ -16,12 +16,12 @@ from agent_control_models import ( ControlDefinition, EvaluationRequest, + Evaluator, EvaluatorConfig, + EvaluatorMetadata, EvaluatorResult, - PluginEvaluator, - PluginMetadata, Step, - register_plugin, + register_evaluator, ) from pydantic import BaseModel @@ -31,12 +31,12 @@ class SimpleConfig(BaseModel): - """Simple config for test plugins.""" + """Simple config for test evaluators.""" value: str = "default" -# Shared state for coordination between test plugins +# Shared state for coordination between test evaluators _execution_log: list[str] = [] _blocker_event: asyncio.Event | None = None @@ -48,10 +48,10 @@ def reset_test_state() -> None: _blocker_event = asyncio.Event() -class AllowPlugin(PluginEvaluator[SimpleConfig]): - """Plugin that always allows 
(matched=False).""" +class AllowEvaluator(Evaluator[SimpleConfig]): + """Evaluator that always allows (matched=False).""" - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="test-allow", version="1.0.0", description="Always allows", @@ -69,10 +69,10 @@ async def evaluate(self, data: Any) -> EvaluatorResult: return result -class DenyPlugin(PluginEvaluator[SimpleConfig]): - """Plugin that always denies (matched=True).""" +class DenyEvaluator(Evaluator[SimpleConfig]): + """Evaluator that always denies (matched=True).""" - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="test-deny", version="1.0.0", description="Always denies", @@ -90,13 +90,13 @@ async def evaluate(self, data: Any) -> EvaluatorResult: return result -class BlockerPlugin(PluginEvaluator[SimpleConfig]): - """Plugin that blocks until cancelled or event is set. +class BlockerEvaluator(Evaluator[SimpleConfig]): + """Evaluator that blocks until cancelled or event is set. Used to test cancellation behavior. 
""" - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="test-blocker", version="1.0.0", description="Blocks until cancelled", @@ -119,10 +119,10 @@ async def evaluate(self, data: Any) -> EvaluatorResult: raise -class SlowPlugin(PluginEvaluator[SimpleConfig]): - """Plugin that sleeps briefly before returning.""" +class SlowEvaluator(Evaluator[SimpleConfig]): + """Evaluator that sleeps briefly before returning.""" - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="test-slow", version="1.0.0", description="Sleeps then allows", @@ -150,15 +150,15 @@ class MockControlWithIdentity: @pytest.fixture(autouse=True) -def setup_test_plugins(): - """Register test plugins and reset state before each test.""" +def setup_test_evaluators(): + """Register test evaluators and reset state before each test.""" reset_test_state() clear_evaluator_cache() - # Register plugins (may already be registered) - for plugin_cls in [AllowPlugin, DenyPlugin, BlockerPlugin, SlowPlugin]: + # Register evaluators (may already be registered) + for evaluator_cls in [AllowEvaluator, DenyEvaluator, BlockerEvaluator, SlowEvaluator]: try: - register_plugin(plugin_cls) + register_evaluator(evaluator_cls) except ValueError: pass # Already registered @@ -171,7 +171,7 @@ def setup_test_plugins(): def make_control( control_id: int, name: str, - plugin: str, + evaluator: str, action: str = "deny", config_value: str = "default", *, @@ -209,7 +209,7 @@ def make_control( scope=scope, selector=selector or {"path": "*"}, evaluator=EvaluatorConfig( - plugin=plugin, + name=evaluator, config={"value": config_value}, ), action={"decision": action}, @@ -465,10 +465,10 @@ async def test_no_matches_when_all_allow(self): # ============================================================================= -class ErrorPlugin(PluginEvaluator[SimpleConfig]): - """Plugin that always raises an exception.""" +class ErrorEvaluator(Evaluator[SimpleConfig]): + """Evaluator that always raises an exception.""" - 
metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="test-error", version="1.0.0", description="Always raises an error", @@ -481,16 +481,16 @@ async def evaluate(self, data: Any) -> EvaluatorResult: class TimeoutConfig(BaseModel): - """Config for timeout plugin with custom timeout.""" + """Config for timeout evaluator with custom timeout.""" value: str = "default" timeout_ms: int = 100 # Very short timeout for testing -class TimeoutPlugin(PluginEvaluator[TimeoutConfig]): - """Plugin that sleeps longer than its timeout.""" +class TimeoutEvaluator(Evaluator[TimeoutConfig]): + """Evaluator that sleeps longer than its timeout.""" - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="test-timeout", version="1.0.0", description="Sleeps longer than timeout", @@ -514,10 +514,10 @@ class TestErrorHandling: """Tests for error handling - fail-closed for deny controls, error field.""" @pytest.fixture(autouse=True) - def register_error_plugin(self): - """Register ErrorPlugin for these tests.""" + def register_error_evaluator(self): + """Register ErrorEvaluator for these tests.""" try: - register_plugin(ErrorPlugin) + register_evaluator(ErrorEvaluator) except ValueError: pass # Already registered @@ -525,11 +525,11 @@ def register_error_plugin(self): async def test_evaluator_error_fails_closed_for_deny(self): """Test that deny controls fail closed when they error. 
- Given: A deny control with a plugin that throws an exception + Given: A deny control with an evaluator that throws an exception When: The engine processes the request Then: The request is marked unsafe (fail-closed) and confidence is 0 """ - # Given: A deny control with an error-throwing plugin + # Given: A deny control with an error-throwing evaluator controls = [ make_control(1, "error_control", "test-error", action="deny", config_value="err"), ] @@ -552,7 +552,7 @@ async def test_evaluator_error_fails_closed_for_deny(self): # Error should be captured assert result.errors is not None assert len(result.errors) == 1 - # The plugin should have started + # The evaluator should have started assert "error:err:start" in _execution_log @pytest.mark.asyncio @@ -626,17 +626,17 @@ async def test_error_with_log_action_fails_open(self): assert result.errors is not None @pytest.mark.asyncio - async def test_missing_plugin_error_sets_error_field(self): - """Test that missing plugin error sets error field in result. + async def test_missing_evaluator_error_sets_error_field(self): + """Test that missing evaluator error sets error field in result. 
- Given: A deny control with a plugin that doesn't exist + Given: A deny control with an evaluator that doesn't exist When: The engine processes the request Then: The error field is set, is_safe=False (deny fails closed) """ - # Given: A deny control with non-existent plugin + # Given: A deny control with non-existent evaluator controls = [ make_control( - 1, "missing_plugin", "nonexistent-plugin", action="deny", config_value="m" + 1, "missing_evaluator", "nonexistent-evaluator", action="deny", config_value="m" ), ] engine = ControlEngine(controls) @@ -657,9 +657,9 @@ async def test_missing_plugin_error_sets_error_field(self): # Error should be captured assert result.errors is not None assert len(result.errors) == 1 - assert result.errors[0].control_name == "missing_plugin" + assert result.errors[0].control_name == "missing_evaluator" assert result.errors[0].result.error is not None - assert "nonexistent-plugin" in result.errors[0].result.error.lower() + assert "nonexistent-evaluator" in result.errors[0].result.error.lower() @pytest.mark.asyncio async def test_errors_array_exposes_evaluator_failures(self): @@ -887,10 +887,10 @@ async def test_confidence_zero_when_deny_errors_despite_other_successes(self): # ============================================================================= -class PayloadEchoPlugin(PluginEvaluator[SimpleConfig]): - """Plugin that inspects full payload when path is omitted ("*").""" +class PayloadEchoEvaluator(Evaluator[SimpleConfig]): + """Evaluator that inspects full payload when path is omitted ("*").""" - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="test-payload-echo", version="1.0.0", description="Echo payload info", @@ -910,9 +910,9 @@ async def evaluate(self, data: Any) -> EvaluatorResult: class TestSelectorStepScoping: @pytest.fixture(autouse=True) - def register_payload_plugin(self): + def register_payload_evaluator(self): try: - register_plugin(PayloadEchoPlugin) + register_evaluator(PayloadEchoEvaluator) 
except ValueError: pass @@ -1018,7 +1018,7 @@ async def test_or_semantics_names_or_regex(self): @pytest.mark.asyncio async def test_path_optional_defaults_to_star(self): - # Given: path omitted; plugin should receive full payload + # Given: path omitted; evaluator should receive full payload controls = [ make_control( 1, @@ -1049,7 +1049,7 @@ def test_invalid_step_name_regex_rejected(self): execution="server", scope={"step_types": ["tool"], "stages": ["pre"], "step_name_regex": "("}, selector={"path": "input"}, - evaluator=EvaluatorConfig(plugin="test-allow", config={"value": "x"}), + evaluator=EvaluatorConfig(name="test-allow", config={"value": "x"}), action={"decision": "log"}, ) @@ -1058,10 +1058,10 @@ class TestTimeoutEnforcement: """Tests for per-evaluator timeout enforcement.""" @pytest.fixture(autouse=True) - def register_timeout_plugin(self): - """Register TimeoutPlugin for these tests.""" + def register_timeout_evaluator(self): + """Register TimeoutEvaluator for these tests.""" try: - register_plugin(TimeoutPlugin) + register_evaluator(TimeoutEvaluator) except ValueError: pass # Already registered @@ -1069,13 +1069,13 @@ def register_timeout_plugin(self): async def test_evaluator_timeout_is_enforced(self): """Test that evaluators are killed after their timeout expires. 
- Given: A control with a plugin that sleeps longer than its timeout + Given: A control with an evaluator that sleeps longer than its timeout When: The engine processes the request Then: The evaluation times out and error is captured """ import time - # Given: A control with a timeout plugin (100ms timeout, 5s sleep) + # Given: A control with a timeout evaluator (100ms timeout, 5s sleep) controls = [ MockControlWithIdentity( id=1, @@ -1087,7 +1087,7 @@ async def test_evaluator_timeout_is_enforced(self): scope={"step_types": ["llm_inference"], "stages": ["pre"]}, selector={"path": "input"}, evaluator=EvaluatorConfig( - plugin="test-timeout", + name="test-timeout", config={"value": "t1", "timeout_ms": 100}, ), action={"decision": "deny"}, @@ -1109,7 +1109,7 @@ async def test_evaluator_timeout_is_enforced(self): # Then: Should complete quickly (timeout, not full 5s sleep) assert elapsed < 1.0, f"Expected timeout ~0.1s but took {elapsed:.2f}s" - # And: Plugin should have started + # And: Evaluator should have started assert "timeout:t1:start" in _execution_log # But not finished (was killed) assert "timeout:t1:end" not in _execution_log @@ -1125,8 +1125,8 @@ async def test_evaluator_timeout_is_enforced(self): assert result.confidence == 0.0 @pytest.mark.asyncio - async def test_timeout_does_not_affect_fast_plugins(self): - """Test that fast plugins complete normally without timeout issues. + async def test_timeout_does_not_affect_fast_evaluators(self): + """Test that fast evaluators complete normally without timeout issues. 
Given: A mix of fast and slow (timing out) controls When: The engine processes the request @@ -1145,7 +1145,7 @@ async def test_timeout_does_not_affect_fast_plugins(self): scope={"step_types": ["llm_inference"], "stages": ["pre"]}, selector={"path": "input"}, evaluator=EvaluatorConfig( - plugin="test-timeout", + name="test-timeout", config={"value": "slow", "timeout_ms": 100}, ), action={"decision": "log"}, # Log, not deny - so fails open @@ -1162,11 +1162,11 @@ async def test_timeout_does_not_affect_fast_plugins(self): ) result = await engine.process(request) - # Then: Fast plugin should have completed normally + # Then: Fast evaluator should have completed normally assert "allow:f1:start" in _execution_log assert "allow:f1:end" in _execution_log - # And: Slow plugin should have timed out + # And: Slow evaluator should have timed out assert "timeout:slow:start" in _execution_log assert "timeout:slow:end" not in _execution_log @@ -1207,10 +1207,10 @@ async def test_concurrency_limited_to_max(self, monkeypatch: pytest.MonkeyPatch) _max_concurrent = 0 _lock = asyncio.Lock() - class ConcurrencyTracker(PluginEvaluator[SimpleConfig]): - """Plugin that tracks concurrent execution count.""" + class ConcurrencyTracker(Evaluator[SimpleConfig]): + """Evaluator that tracks concurrent execution count.""" - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="test-concurrency", version="1.0.0", description="Tracks concurrency", @@ -1228,7 +1228,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: return EvaluatorResult(matched=False, confidence=1.0, message="ok") try: - register_plugin(ConcurrencyTracker) + register_evaluator(ConcurrencyTracker) except ValueError: pass @@ -1259,7 +1259,7 @@ async def evaluate(self, data: Any) -> EvaluatorResult: def make_control_with_execution( control_id: int, name: str, - plugin: str, + evaluator: str, action: str = "deny", config_value: str = "default", *, @@ -1291,7 +1291,7 @@ def make_control_with_execution( 
scope=scope, selector={"path": path}, evaluator=EvaluatorConfig( - plugin=plugin, + name=evaluator, config={"value": config_value}, ), action={"decision": action}, diff --git a/engine/tests/test_discovery.py b/engine/tests/test_discovery.py index 855fec9e..6e0af44f 100644 --- a/engine/tests/test_discovery.py +++ b/engine/tests/test_discovery.py @@ -1,48 +1,48 @@ -"""Tests for plugin auto-discovery.""" +"""Tests for evaluator auto-discovery.""" from typing import Any from unittest.mock import MagicMock, patch from pydantic import BaseModel -from agent_control_engine import discover_plugins, ensure_plugins_discovered, list_plugins -from agent_control_engine.discovery import reset_discovery +from agent_control_engine import discover_evaluators, ensure_evaluators_discovered, list_evaluators +from agent_control_engine.discovery import reset_evaluator_discovery from agent_control_models import ( + Evaluator, + EvaluatorMetadata, EvaluatorResult, - PluginEvaluator, - PluginMetadata, - clear_plugins, - get_plugin, - register_plugin, + clear_evaluators, + get_evaluator, + register_evaluator, ) -class TestDiscoverPlugins: - """Tests for discover_plugins() function.""" +class TestDiscoverEvaluators: + """Tests for discover_evaluators() function.""" - def test_discover_plugins_loads_builtins(self) -> None: - """Test that built-in plugins are loaded.""" - discover_plugins() + def test_discover_evaluators_loads_builtins(self) -> None: + """Test that built-in evaluators are loaded.""" + discover_evaluators() - plugins = list_plugins() - assert "regex" in plugins - assert "list" in plugins + evaluators = list_evaluators() + assert "regex" in evaluators + assert "list" in evaluators @patch("agent_control_engine.discovery.entry_points") - def test_discover_plugins_loads_entry_points( + def test_discover_evaluators_loads_entry_points( self, mock_entry_points: MagicMock ) -> None: - """Test that entry point plugins are discovered.""" + """Test that entry point evaluators are 
discovered.""" - # Create mock plugin + # Create mock evaluator class MockConfig(BaseModel): pass - class MockPlugin(PluginEvaluator[MockConfig]): - metadata = PluginMetadata( - name="mock-ep-plugin", + class MockEvaluator(Evaluator[MockConfig]): + metadata = EvaluatorMetadata( + name="mock-ep-evaluator", version="1.0.0", - description="Test plugin", + description="Test evaluator", ) config_model = MockConfig @@ -50,82 +50,82 @@ async def evaluate(self, data: Any) -> EvaluatorResult: return EvaluatorResult(matched=False, confidence=0.0, message="test") mock_ep = MagicMock() - mock_ep.name = "mock-ep-plugin" - mock_ep.load.return_value = MockPlugin + mock_ep.name = "mock-ep-evaluator" + mock_ep.load.return_value = MockEvaluator mock_entry_points.return_value = [mock_ep] - count = discover_plugins() + count = discover_evaluators() - mock_entry_points.assert_called_once_with(group="agent_control.plugins") - plugins = list_plugins() - assert "mock-ep-plugin" in plugins + mock_entry_points.assert_called_once_with(group="agent_control.evaluators") + evaluators = list_evaluators() + assert "mock-ep-evaluator" in evaluators # Count only includes entry-point registrations (not built-ins loaded via import) assert count >= 1 @patch("agent_control_engine.discovery.entry_points") - def test_discover_plugins_handles_load_error( + def test_discover_evaluators_handles_load_error( self, mock_entry_points: MagicMock ) -> None: - """Test graceful handling of plugin load errors.""" + """Test graceful handling of evaluator load errors.""" mock_ep = MagicMock() - mock_ep.name = "bad-plugin" + mock_ep.name = "bad-evaluator" mock_ep.load.side_effect = ImportError("Missing dependency") mock_entry_points.return_value = [mock_ep] # Should not raise - discover_plugins() + discover_evaluators() - def test_discover_plugins_only_runs_once(self) -> None: + def test_discover_evaluators_only_runs_once(self) -> None: """Test that discovery only runs once.""" - count1 = discover_plugins() - count2 = 
discover_plugins() + count1 = discover_evaluators() + count2 = discover_evaluators() - # First call loads plugins, second call returns 0 (already discovered) + # First call loads evaluators, second call returns 0 (already discovered) assert count2 == 0 - # Verify plugins are available (count may be 0 if no entry-point plugins) - plugins = list_plugins() - assert "regex" in plugins - assert "list" in plugins + # Verify evaluators are available (count may be 0 if no entry-point evaluators) + evaluators = list_evaluators() + assert "regex" in evaluators + assert "list" in evaluators - def test_ensure_plugins_discovered_triggers_discovery(self) -> None: - """Test that ensure_plugins_discovered triggers discovery.""" - ensure_plugins_discovered() + def test_ensure_evaluators_discovered_triggers_discovery(self) -> None: + """Test that ensure_evaluators_discovered triggers discovery.""" + ensure_evaluators_discovered() - plugins = list_plugins() - # Should have at least built-in plugins - assert isinstance(plugins, dict) - assert "regex" in plugins - assert "list" in plugins + evaluators = list_evaluators() + # Should have at least built-in evaluators + assert isinstance(evaluators, dict) + assert "regex" in evaluators + assert "list" in evaluators def test_reset_discovery_allows_rediscovery(self) -> None: - """Test that reset_discovery allows discovery to run again.""" - discover_plugins() - plugins1 = list_plugins() - assert "regex" in plugins1 + """Test that reset_evaluator_discovery allows discovery to run again.""" + discover_evaluators() + evaluators1 = list_evaluators() + assert "regex" in evaluators1 # After reset, discovery should run again - reset_discovery() - clear_plugins() + reset_evaluator_discovery() + clear_evaluators() - discover_plugins() - plugins2 = list_plugins() - assert "regex" in plugins2 - assert "list" in plugins2 + discover_evaluators() + evaluators2 = list_evaluators() + assert "regex" in evaluators2 + assert "list" in evaluators2 
@patch("agent_control_engine.discovery.entry_points") - def test_discover_plugins_skips_unavailable( + def test_discover_evaluators_skips_unavailable( self, mock_entry_points: MagicMock ) -> None: - """Test that plugins with is_available() returning False are skipped.""" + """Test that evaluators with is_available() returning False are skipped.""" class MockConfig(BaseModel): pass - class UnavailablePlugin(PluginEvaluator[MockConfig]): - metadata = PluginMetadata( - name="unavailable-plugin", + class UnavailableEvaluator(Evaluator[MockConfig]): + metadata = EvaluatorMetadata( + name="unavailable-evaluator", version="1.0.0", - description="Plugin with missing deps", + description="Evaluator with missing deps", ) config_model = MockConfig @@ -137,31 +137,31 @@ async def evaluate(self, data: Any) -> EvaluatorResult: return EvaluatorResult(matched=False, confidence=0.0, message="test") mock_ep = MagicMock() - mock_ep.name = "unavailable-plugin" - mock_ep.load.return_value = UnavailablePlugin + mock_ep.name = "unavailable-evaluator" + mock_ep.load.return_value = UnavailableEvaluator mock_entry_points.return_value = [mock_ep] - count = discover_plugins() + count = discover_evaluators() - # Plugin should NOT be registered - plugins = list_plugins() - assert "unavailable-plugin" not in plugins + # Evaluator should NOT be registered + evaluators = list_evaluators() + assert "unavailable-evaluator" not in evaluators assert count == 0 @patch("agent_control_engine.discovery.entry_points") - def test_discover_plugins_registers_available( + def test_discover_evaluators_registers_available( self, mock_entry_points: MagicMock ) -> None: - """Test that plugins with is_available() returning True are registered.""" + """Test that evaluators with is_available() returning True are registered.""" class MockConfig(BaseModel): pass - class AvailablePlugin(PluginEvaluator[MockConfig]): - metadata = PluginMetadata( - name="available-plugin", + class 
AvailableEvaluator(Evaluator[MockConfig]): + metadata = EvaluatorMetadata( + name="available-evaluator", version="1.0.0", - description="Plugin with all deps", + description="Evaluator with all deps", ) config_model = MockConfig @@ -173,30 +173,30 @@ async def evaluate(self, data: Any) -> EvaluatorResult: return EvaluatorResult(matched=False, confidence=0.0, message="test") mock_ep = MagicMock() - mock_ep.name = "available-plugin" - mock_ep.load.return_value = AvailablePlugin + mock_ep.name = "available-evaluator" + mock_ep.load.return_value = AvailableEvaluator mock_entry_points.return_value = [mock_ep] - count = discover_plugins() + count = discover_evaluators() - # Plugin should be registered - plugins = list_plugins() - assert "available-plugin" in plugins + # Evaluator should be registered + evaluators = list_evaluators() + assert "available-evaluator" in evaluators assert count == 1 class TestIsAvailable: - """Tests for the is_available() plugin method.""" + """Tests for the is_available() evaluator method.""" def test_base_class_is_available_returns_true(self) -> None: - """Test that base PluginEvaluator.is_available() returns True by default.""" + """Test that base Evaluator.is_available() returns True by default.""" class MockConfig(BaseModel): pass - class TestPlugin(PluginEvaluator[MockConfig]): - metadata = PluginMetadata( - name="test-plugin", + class TestEvaluator(Evaluator[MockConfig]): + metadata = EvaluatorMetadata( + name="test-evaluator", version="1.0.0", description="Test", ) @@ -206,24 +206,24 @@ async def evaluate(self, data: Any) -> EvaluatorResult: return EvaluatorResult(matched=False, confidence=0.0, message="test") # Default is_available() should return True - assert TestPlugin.is_available() is True + assert TestEvaluator.is_available() is True -class TestRegisterPluginRespectsIsAvailable: - """Tests that @register_plugin decorator respects is_available().""" +class TestRegisterEvaluatorRespectsIsAvailable: + """Tests that 
@register_evaluator decorator respects is_available().""" - def test_register_plugin_skips_unavailable(self) -> None: - """Test that @register_plugin skips plugins where is_available() returns False.""" + def test_register_evaluator_skips_unavailable(self) -> None: + """Test that @register_evaluator skips evaluators where is_available() returns False.""" class MockConfig(BaseModel): pass - @register_plugin - class UnavailablePlugin(PluginEvaluator[MockConfig]): - metadata = PluginMetadata( + @register_evaluator + class UnavailableEvaluator(Evaluator[MockConfig]): + metadata = EvaluatorMetadata( name="test-unavailable-decorated", version="1.0.0", - description="Plugin with unavailable deps", + description="Evaluator with unavailable deps", ) config_model = MockConfig @@ -234,21 +234,21 @@ def is_available(cls) -> bool: async def evaluate(self, data: Any) -> EvaluatorResult: return EvaluatorResult(matched=False, confidence=0.0, message="test") - # Plugin should NOT be registered despite using @register_plugin - assert get_plugin("test-unavailable-decorated") is None + # Evaluator should NOT be registered despite using @register_evaluator + assert get_evaluator("test-unavailable-decorated") is None - def test_register_plugin_registers_available(self) -> None: - """Test that @register_plugin registers plugins where is_available() returns True.""" + def test_register_evaluator_registers_available(self) -> None: + """Test that @register_evaluator registers evaluators where is_available() returns True.""" class MockConfig(BaseModel): pass - @register_plugin - class AvailablePlugin(PluginEvaluator[MockConfig]): - metadata = PluginMetadata( + @register_evaluator + class AvailableEvaluator(Evaluator[MockConfig]): + metadata = EvaluatorMetadata( name="test-available-decorated", version="1.0.0", - description="Plugin with all deps", + description="Evaluator with all deps", ) config_model = MockConfig @@ -259,26 +259,26 @@ def is_available(cls) -> bool: async def evaluate(self, 
data: Any) -> EvaluatorResult: return EvaluatorResult(matched=False, confidence=0.0, message="test") - # Plugin should be registered - assert get_plugin("test-available-decorated") is not None + # Evaluator should be registered + assert get_evaluator("test-available-decorated") is not None - def test_register_plugin_default_is_available(self) -> None: - """Test that @register_plugin works when is_available() is not overridden.""" + def test_register_evaluator_default_is_available(self) -> None: + """Test that @register_evaluator works when is_available() is not overridden.""" class MockConfig(BaseModel): pass - @register_plugin - class DefaultPlugin(PluginEvaluator[MockConfig]): - metadata = PluginMetadata( + @register_evaluator + class DefaultEvaluator(Evaluator[MockConfig]): + metadata = EvaluatorMetadata( name="test-default-available", version="1.0.0", - description="Plugin with default is_available", + description="Evaluator with default is_available", ) config_model = MockConfig async def evaluate(self, data: Any) -> EvaluatorResult: return EvaluatorResult(matched=False, confidence=0.0, message="test") - # Plugin should be registered (default is_available returns True) - assert get_plugin("test-default-available") is not None + # Evaluator should be registered (default is_available returns True) + assert get_evaluator("test-default-available") is not None diff --git a/engine/tests/test_plugin_evaluators.py b/engine/tests/test_evaluator_integrations.py similarity index 59% rename from engine/tests/test_plugin_evaluators.py rename to engine/tests/test_evaluator_integrations.py index 3f0e08fe..cdf3bc65 100644 --- a/engine/tests/test_plugin_evaluators.py +++ b/engine/tests/test_evaluator_integrations.py @@ -1,37 +1,37 @@ -"""Tests for plugin system integration with the unified architecture. +"""Tests for evaluator system integration with the unified architecture. -These tests verify the plugin system works correctly with the engine. 
+These tests verify the evaluator system works correctly with the engine. """ from typing import Any -# Import to ensure built-in plugins are registered -import agent_control_plugins # noqa: F401 +# Import to ensure built-in evaluators are registered +import agent_control_evaluators # noqa: F401 import pytest -from agent_control_engine.evaluators import get_evaluator +from agent_control_engine.evaluators import get_evaluator_instance from agent_control_models import ( + Evaluator, EvaluatorConfig, + EvaluatorMetadata, EvaluatorResult, - PluginEvaluator, - PluginMetadata, - register_plugin, + register_evaluator, ) from pydantic import BaseModel class MockConfig(BaseModel): - """Config for mock plugin.""" + """Config for mock evaluator.""" threshold: float = 0.5 -class MockTestPlugin(PluginEvaluator[MockConfig]): - """Mock plugin for engine testing.""" +class MockTestEvaluator(Evaluator[MockConfig]): + """Mock evaluator for engine testing.""" - metadata = PluginMetadata( - name="test-mock-plugin", + metadata = EvaluatorMetadata( + name="test-mock-evaluator", version="1.0.0", - description="Test plugin for engine tests", + description="Test evaluator for engine tests", ) config_model = MockConfig @@ -48,51 +48,51 @@ async def evaluate(self, data: Any) -> EvaluatorResult: ) -class TestPluginArchitecture: - """Tests verifying the plugin architecture.""" +class TestEvaluatorArchitecture: + """Tests verifying the evaluator architecture.""" - def test_plugin_is_abc_subclass(self): - """Test PluginEvaluator is an ABC.""" - # Given/When: Checking PluginEvaluator base class + def test_evaluator_is_abc_subclass(self): + """Test Evaluator is an ABC.""" + # Given/When: Checking Evaluator base class from abc import ABC # Then: Should be subclass of ABC - assert issubclass(PluginEvaluator, ABC) + assert issubclass(Evaluator, ABC) - def test_plugin_has_required_attributes(self): - """Test plugins have required class attributes.""" - # Given/When: Checking MockTestPlugin + def 
test_evaluator_has_required_attributes(self): + """Test evaluators have required class attributes.""" + # Given/When: Checking MockTestEvaluator # Then: Should have required attributes - assert hasattr(MockTestPlugin, "metadata") - assert hasattr(MockTestPlugin, "config_model") - assert MockTestPlugin.metadata.name == "test-mock-plugin" + assert hasattr(MockTestEvaluator, "metadata") + assert hasattr(MockTestEvaluator, "config_model") + assert MockTestEvaluator.metadata.name == "test-mock-evaluator" - def test_plugin_from_dict(self): - """Test creating plugin from dict config.""" - # Given/When: Creating plugin from dict - plugin = MockTestPlugin.from_dict({"threshold": 0.7}) + def test_evaluator_from_dict(self): + """Test creating evaluator from dict config.""" + # Given/When: Creating evaluator from dict + evaluator = MockTestEvaluator.from_dict({"threshold": 0.7}) # Then: Config should be parsed correctly - assert isinstance(plugin.config, MockConfig) - assert plugin.config.threshold == 0.7 + assert isinstance(evaluator.config, MockConfig) + assert evaluator.config.threshold == 0.7 -class TestMockPluginEvaluation: - """Tests for mock plugin evaluation.""" +class TestMockEvaluatorEvaluation: + """Tests for mock evaluator evaluation.""" @pytest.fixture(autouse=True) def register_mock(self): - """Register mock plugin for tests.""" - register_plugin(MockTestPlugin) + """Register mock evaluator for tests.""" + register_evaluator(MockTestEvaluator) yield - # Don't clear - other tests need built-in plugins + # Don't clear - other tests need built-in evaluators @pytest.mark.asyncio async def test_evaluate_matched(self): """Test evaluation when threshold exceeded.""" - # Given: Mock plugin with threshold 0.5 - config = EvaluatorConfig(plugin="test-mock-plugin", config={"threshold": 0.5}) - evaluator = get_evaluator(config) + # Given: Mock evaluator with threshold 0.5 + config = EvaluatorConfig(name="test-mock-evaluator", config={"threshold": 0.5}) + evaluator = 
get_evaluator_instance(config) # When: Evaluating value above threshold result = await evaluator.evaluate(0.8) @@ -106,9 +106,9 @@ async def test_evaluate_matched(self): @pytest.mark.asyncio async def test_evaluate_not_matched(self): """Test evaluation when below threshold.""" - # Given: Mock plugin with threshold 0.9 - config = EvaluatorConfig(plugin="test-mock-plugin", config={"threshold": 0.9}) - evaluator = get_evaluator(config) + # Given: Mock evaluator with threshold 0.9 + config = EvaluatorConfig(name="test-mock-evaluator", config={"threshold": 0.9}) + evaluator = get_evaluator_instance(config) # When: Evaluating value below threshold result = await evaluator.evaluate(0.3) @@ -118,10 +118,10 @@ async def test_evaluate_not_matched(self): @pytest.mark.asyncio async def test_multiple_evaluations(self): - """Test multiple evaluations with same plugin.""" - # Given: Mock plugin with threshold 0.5 - config = EvaluatorConfig(plugin="test-mock-plugin", config={"threshold": 0.5}) - evaluator = get_evaluator(config) + """Test multiple evaluations with same evaluator.""" + # Given: Mock evaluator with threshold 0.5 + config = EvaluatorConfig(name="test-mock-evaluator", config={"threshold": 0.5}) + evaluator = get_evaluator_instance(config) # When: Evaluating multiple values results = [ @@ -136,53 +136,53 @@ async def test_multiple_evaluations(self): assert results[2].matched is True # 0.9 > 0.5 -class TestPluginMetadata: - """Tests for plugin metadata.""" +class TestEvaluatorMetadata: + """Tests for evaluator metadata.""" def test_access_metadata(self): - """Test that plugin metadata is accessible.""" - # Given/When: Accessing MockTestPlugin metadata + """Test that evaluator metadata is accessible.""" + # Given/When: Accessing MockTestEvaluator metadata # Then: All fields should be correct - assert MockTestPlugin.metadata.name == "test-mock-plugin" - assert MockTestPlugin.metadata.version == "1.0.0" - assert MockTestPlugin.metadata.description == "Test plugin for 
engine tests" + assert MockTestEvaluator.metadata.name == "test-mock-evaluator" + assert MockTestEvaluator.metadata.version == "1.0.0" + assert MockTestEvaluator.metadata.description == "Test evaluator for engine tests" def test_config_schema(self): """Test that config model provides JSON schema.""" # Given/When: Getting JSON schema from config model - schema = MockTestPlugin.config_model.model_json_schema() + schema = MockTestEvaluator.config_model.model_json_schema() # Then: Schema should include threshold property assert "properties" in schema assert "threshold" in schema["properties"] -class TestBuiltInPlugins: - """Tests for built-in plugins.""" +class TestBuiltInEvaluators: + """Tests for built-in evaluators.""" - def test_regex_plugin_registered(self): - """Test regex plugin is registered.""" - # Given/When: Getting regex plugin - from agent_control_engine import list_plugins - plugin = list_plugins().get("regex") + def test_regex_evaluator_registered(self): + """Test regex evaluator is registered.""" + # Given/When: Getting regex evaluator + from agent_control_engine import list_evaluators + evaluator = list_evaluators().get("regex") # Then: Should be registered with correct name - assert plugin is not None - assert plugin.metadata.name == "regex" + assert evaluator is not None + assert evaluator.metadata.name == "regex" - def test_list_plugin_registered(self): - """Test list plugin is registered.""" - # Given/When: Getting list plugin - from agent_control_engine import list_plugins - plugin = list_plugins().get("list") + def test_list_evaluator_registered(self): + """Test list evaluator is registered.""" + # Given/When: Getting list evaluator + from agent_control_engine import list_evaluators + evaluator = list_evaluators().get("list") # Then: Should be registered with correct name - assert plugin is not None - assert plugin.metadata.name == "list" + assert evaluator is not None + assert evaluator.metadata.name == "list" -class TestRegexPluginFlags: - 
"""Tests for regex plugin flag handling.""" +class TestRegexEvaluatorFlags: + """Tests for regex evaluator flag handling.""" @pytest.mark.asyncio async def test_regex_case_sensitive_by_default(self): @@ -194,10 +194,10 @@ async def test_regex_case_sensitive_by_default(self): """ # Given: Regex for "SECRET" without flags config = EvaluatorConfig( - plugin="regex", + name="regex", config={"pattern": "SECRET"} ) - evaluator = get_evaluator(config) + evaluator = get_evaluator_instance(config) # When/Then: Exact case matches result = await evaluator.evaluate("the SECRET is here") @@ -220,10 +220,10 @@ async def test_regex_ignorecase_flag(self): """ # Given: Regex for "SECRET" with IGNORECASE flag config = EvaluatorConfig( - plugin="regex", + name="regex", config={"pattern": "SECRET", "flags": ["IGNORECASE"]} ) - evaluator = get_evaluator(config) + evaluator = get_evaluator_instance(config) # When/Then: All case variations should match result = await evaluator.evaluate("the SECRET is here") @@ -248,10 +248,10 @@ async def test_regex_short_i_flag(self): """ # Given: Regex with short "I" flag config = EvaluatorConfig( - plugin="regex", + name="regex", config={"pattern": "password", "flags": ["I"]} ) - evaluator = get_evaluator(config) + evaluator = get_evaluator_instance(config) # When/Then: All case variations should match result = await evaluator.evaluate("PASSWORD") @@ -273,10 +273,10 @@ async def test_regex_ignorecase_lowercase_flag(self): """ # Given: Regex with lowercase flag variant config = EvaluatorConfig( - plugin="regex", + name="regex", config={"pattern": "admin", "flags": ["ignorecase"]} ) - evaluator = get_evaluator(config) + evaluator = get_evaluator_instance(config) # When/Then: Should work with lowercase flag result = await evaluator.evaluate("ADMIN") @@ -284,4 +284,3 @@ async def test_regex_ignorecase_lowercase_flag(self): result = await evaluator.evaluate("admin") assert result.matched is True - diff --git a/engine/tests/test_evaluators.py 
b/engine/tests/test_evaluators.py index 6700e4e2..49bb5c61 100644 --- a/engine/tests/test_evaluators.py +++ b/engine/tests/test_evaluators.py @@ -1,27 +1,27 @@ """Tests for unified evaluator factory.""" import pytest -from agent_control_engine import list_plugins +from agent_control_engine import list_evaluators from agent_control_engine.evaluators import ( clear_evaluator_cache, - get_evaluator, + get_evaluator_instance, ) from agent_control_models import ( EvaluatorConfig, - RegexConfig, + RegexEvaluatorConfig, ) -from agent_control_plugins import ListPlugin, RegexPlugin +from agent_control_evaluators import ListEvaluator, RegexEvaluator -class TestRegexPlugin: - """Tests for the regex plugin via the evaluator factory.""" +class TestRegexEvaluator: + """Tests for the regex evaluator via the evaluator factory.""" @pytest.mark.asyncio async def test_basic_match(self): """Test regex matches SSN pattern.""" # Given: A regex evaluator with SSN pattern - config = EvaluatorConfig(plugin="regex", config={"pattern": r"\d{3}-\d{2}-\d{4}"}) - evaluator = get_evaluator(config) + config = EvaluatorConfig(name="regex", config={"pattern": r"\d{3}-\d{2}-\d{4}"}) + evaluator = get_evaluator_instance(config) # When: Evaluating text containing SSN result = await evaluator.evaluate("My SSN is 123-45-6789") @@ -34,8 +34,8 @@ async def test_basic_match(self): async def test_no_match(self): """Test regex doesn't match when pattern not found.""" # Given: A regex evaluator with SSN pattern - config = EvaluatorConfig(plugin="regex", config={"pattern": r"\d{3}-\d{2}-\d{4}"}) - evaluator = get_evaluator(config) + config = EvaluatorConfig(name="regex", config={"pattern": r"\d{3}-\d{2}-\d{4}"}) + evaluator = get_evaluator_instance(config) # When: Evaluating text without pattern result = await evaluator.evaluate("No numbers here") @@ -48,8 +48,8 @@ async def test_no_match(self): async def test_non_string_input(self): """Test non-string input is converted to string.""" # Given: A regex 
evaluator - config = EvaluatorConfig(plugin="regex", config={"pattern": r"123"}) - evaluator = get_evaluator(config) + config = EvaluatorConfig(name="regex", config={"pattern": r"123"}) + evaluator = get_evaluator_instance(config) # When: Evaluating non-string input result = await evaluator.evaluate(12345) @@ -61,8 +61,8 @@ async def test_non_string_input(self): async def test_none_input(self): """Test handling of None input.""" # Given: A regex evaluator - config = EvaluatorConfig(plugin="regex", config={"pattern": r".*"}) - evaluator = get_evaluator(config) + config = EvaluatorConfig(name="regex", config={"pattern": r".*"}) + evaluator = get_evaluator_instance(config) # When: Evaluating None result = await evaluator.evaluate(None) @@ -76,14 +76,14 @@ def test_invalid_regex_pattern(self): # Given/When: Creating config with invalid pattern # Then: Should raise ValueError with pytest.raises(ValueError): - RegexConfig(pattern="[") + RegexEvaluatorConfig(pattern="[") @pytest.mark.asyncio async def test_empty_pattern_matches_everything(self): """Test empty pattern matches everything.""" # Given: A regex evaluator with empty pattern - config = EvaluatorConfig(plugin="regex", config={"pattern": ""}) - evaluator = get_evaluator(config) + config = EvaluatorConfig(name="regex", config={"pattern": ""}) + evaluator = get_evaluator_instance(config) # When: Evaluating any text result = await evaluator.evaluate("something") @@ -92,18 +92,18 @@ async def test_empty_pattern_matches_everything(self): assert result.matched is True -class TestListPlugin: - """Tests for the list plugin via the evaluator factory.""" +class TestListEvaluator: + """Tests for the list evaluator via the evaluator factory.""" @pytest.mark.asyncio async def test_any_match(self): """Test list evaluator with any/match logic.""" # Given: A list evaluator with blocklist items config = EvaluatorConfig( - plugin="list", + name="list", config={"values": ["bad", "evil"], "logic": "any", "match_on": "match"}, ) - 
evaluator = get_evaluator(config) + evaluator = get_evaluator_instance(config) # When/Then: Blocklist items match, others don't assert (await evaluator.evaluate("bad")).matched is True @@ -115,10 +115,10 @@ async def test_any_no_match(self): """Test list evaluator as allowlist (any/no_match).""" # Given: A list evaluator as allowlist config = EvaluatorConfig( - plugin="list", + name="list", config={"values": ["safe", "ok"], "logic": "any", "match_on": "no_match"}, ) - evaluator = get_evaluator(config) + evaluator = get_evaluator_instance(config) # When/Then: Allowlist items don't match, others do assert (await evaluator.evaluate("safe")).matched is False @@ -130,10 +130,10 @@ async def test_all_match(self): """Test list evaluator with all/match logic.""" # Given: A list evaluator with all/match logic config = EvaluatorConfig( - plugin="list", + name="list", config={"values": ["valid1", "valid2"], "logic": "all", "match_on": "match"}, ) - evaluator = get_evaluator(config) + evaluator = get_evaluator_instance(config) # When/Then: Matches only when all values present assert (await evaluator.evaluate(["valid1", "valid2"])).matched is True @@ -145,47 +145,47 @@ async def test_case_insensitive(self): """Test case-insensitive matching.""" # Given: A case-insensitive list evaluator config = EvaluatorConfig( - plugin="list", + name="list", config={"values": ["MixedCase"], "case_sensitive": False, "match_on": "match"}, ) - evaluator = get_evaluator(config) + evaluator = get_evaluator_instance(config) # When/Then: Matches regardless of case assert (await evaluator.evaluate("mixedcase")).matched is True assert (await evaluator.evaluate("MIXEDCASE")).matched is True -class TestGetEvaluator: - """Tests for the get_evaluator factory function.""" +class TestGetEvaluatorInstance: + """Tests for the get_evaluator_instance factory function.""" - def test_get_evaluator_returns_plugin_instance(self): - """Test factory returns correct plugin type.""" + def 
test_get_evaluator_instance_returns_correct_type(self): + """Test factory returns correct evaluator type.""" # Given: An evaluator config - config = EvaluatorConfig(plugin="regex", config={"pattern": "abc"}) + config = EvaluatorConfig(name="regex", config={"pattern": "abc"}) # When: Getting evaluator - evaluator = get_evaluator(config) + evaluator = get_evaluator_instance(config) - # Then: Returns correct plugin type - assert isinstance(evaluator, RegexPlugin) + # Then: Returns correct evaluator type + assert isinstance(evaluator, RegexEvaluator) assert evaluator.config.pattern == "abc" - def test_get_evaluator_unknown_plugin(self): - """Test error when plugin not found.""" - # Given: Config for nonexistent plugin - config = EvaluatorConfig(plugin="nonexistent", config={}) + def test_get_evaluator_instance_unknown_evaluator(self): + """Test error when evaluator not found.""" + # Given: Config for nonexistent evaluator + config = EvaluatorConfig(name="nonexistent", config={}) # When/Then: Should raise ValueError with pytest.raises(ValueError, match="not found"): - get_evaluator(config) + get_evaluator_instance(config) - def test_list_plugins(self): - """Test listing available plugins.""" - # Given/When: Getting available plugins - plugins = list_plugins() + def test_list_evaluators(self): + """Test listing available evaluators.""" + # Given/When: Getting available evaluators + evaluators = list_evaluators() - # Then: Should include built-in plugins - assert "regex" in plugins - assert "list" in plugins + # Then: Should include built-in evaluators + assert "regex" in evaluators + assert "list" in evaluators class TestEvaluatorCache: @@ -202,12 +202,12 @@ def teardown_method(self): def test_evaluator_cache_hit(self): """Test that same config returns same cached instance.""" # Given: An evaluator config - config = EvaluatorConfig(plugin="regex", config={"pattern": "test"}) + config = EvaluatorConfig(name="regex", config={"pattern": "test"}) # When: First call creates 
instance - evaluator1 = get_evaluator(config) + evaluator1 = get_evaluator_instance(config) # When: Second call with same config - evaluator2 = get_evaluator(config) + evaluator2 = get_evaluator_instance(config) # Then: Should return same cached instance assert evaluator1 is evaluator2, "Same config should return cached instance" @@ -215,45 +215,45 @@ def test_evaluator_cache_hit(self): def test_evaluator_cache_miss_different_config(self): """Test that different configs return different instances.""" # Given: Two different configs - config1 = EvaluatorConfig(plugin="regex", config={"pattern": "test1"}) - config2 = EvaluatorConfig(plugin="regex", config={"pattern": "test2"}) + config1 = EvaluatorConfig(name="regex", config={"pattern": "test1"}) + config2 = EvaluatorConfig(name="regex", config={"pattern": "test2"}) # When: Getting evaluators - evaluator1 = get_evaluator(config1) - evaluator2 = get_evaluator(config2) + evaluator1 = get_evaluator_instance(config1) + evaluator2 = get_evaluator_instance(config2) # Then: Should return different instances assert evaluator1 is not evaluator2, "Different configs should return different instances" - def test_evaluator_cache_miss_different_plugin(self): - """Test that same config but different plugins return different instances.""" - # Given: Two configs with different plugins - config1 = EvaluatorConfig(plugin="regex", config={"pattern": "bad"}) - config2 = EvaluatorConfig(plugin="list", config={"values": ["bad"]}) + def test_evaluator_cache_miss_different_evaluator(self): + """Test that same config but different evaluators return different instances.""" + # Given: Two configs with different evaluators + config1 = EvaluatorConfig(name="regex", config={"pattern": "bad"}) + config2 = EvaluatorConfig(name="list", config={"values": ["bad"]}) # When: Getting evaluators - evaluator1 = get_evaluator(config1) - evaluator2 = get_evaluator(config2) + evaluator1 = get_evaluator_instance(config1) + evaluator2 = 
get_evaluator_instance(config2) - # Then: Should return different plugin types + # Then: Should return different evaluator types assert evaluator1 is not evaluator2 - assert isinstance(evaluator1, RegexPlugin) - assert isinstance(evaluator2, ListPlugin) + assert isinstance(evaluator1, RegexEvaluator) + assert isinstance(evaluator2, ListEvaluator) def test_evaluator_cache_clear_all(self): """Test that clear_evaluator_cache clears all entries.""" # Given: Two cached evaluators - config1 = EvaluatorConfig(plugin="regex", config={"pattern": "test1"}) - config2 = EvaluatorConfig(plugin="list", config={"values": ["test"]}) - evaluator1a = get_evaluator(config1) - evaluator2a = get_evaluator(config2) + config1 = EvaluatorConfig(name="regex", config={"pattern": "test1"}) + config2 = EvaluatorConfig(name="list", config={"values": ["test"]}) + evaluator1a = get_evaluator_instance(config1) + evaluator2a = get_evaluator_instance(config2) # When: Clearing cache clear_evaluator_cache() # When: Getting instances again - evaluator1b = get_evaluator(config1) - evaluator2b = get_evaluator(config2) + evaluator1b = get_evaluator_instance(config1) + evaluator2b = get_evaluator_instance(config2) # Then: Both should be new instances assert evaluator1a is not evaluator1b, "Should be new instance after clear" diff --git a/evaluators/README.md b/evaluators/README.md new file mode 100644 index 00000000..aa806cef --- /dev/null +++ b/evaluators/README.md @@ -0,0 +1,23 @@ +# agent-control-evaluators + +Evaluator implementations for agent-control. 
+ +## Built-in Evaluators + +- **regex** - Pattern matching using regular expressions +- **list** - Value matching against allow/deny lists +- **json** - JSON schema validation +- **sql** - SQL query validation using sqlglot + +## Optional Evaluators + +- **luna2** - Galileo Luna-2 integration (requires `luna2` extra) + +## Installation + +```bash +pip install agent-control-evaluators + +# With Luna-2 support +pip install agent-control-evaluators[luna2] +``` diff --git a/plugins/pyproject.toml b/evaluators/pyproject.toml similarity index 55% rename from plugins/pyproject.toml rename to evaluators/pyproject.toml index 8d86b847..fde84fa0 100644 --- a/plugins/pyproject.toml +++ b/evaluators/pyproject.toml @@ -1,7 +1,7 @@ [project] -name = "agent-control-plugins" +name = "agent-control-evaluators" version = "0.1.0" -description = "Plugin implementations for agent-control" +description = "Evaluator implementations for agent-control" readme = "README.md" requires-python = ">=3.10" license = { text = "MIT" } @@ -19,18 +19,19 @@ luna2 = ["httpx>=0.24.0"] all = ["httpx>=0.24.0"] dev = ["pytest>=8.0.0", "pytest-asyncio>=0.23.0"] -[project.entry-points."agent_control.plugins"] -regex = "agent_control_plugins.builtin.regex:RegexPlugin" -list = "agent_control_plugins.builtin.list:ListPlugin" -luna2 = "agent_control_plugins.luna2.plugin:Luna2Plugin" +[project.entry-points."agent_control.evaluators"] +regex = "agent_control_evaluators.builtin.regex:RegexEvaluator" +list = "agent_control_evaluators.builtin.list:ListEvaluator" +json = "agent_control_evaluators.builtin.json:JSONEvaluator" +sql = "agent_control_evaluators.builtin.sql:SQLEvaluator" +luna2 = "agent_control_evaluators.luna2.evaluator:Luna2Evaluator" [build-system] requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["src/agent_control_plugins"] +packages = ["src/agent_control_evaluators"] [tool.uv.sources] agent-control-models = { workspace = true } - diff --git 
a/evaluators/src/agent_control_evaluators/__init__.py b/evaluators/src/agent_control_evaluators/__init__.py new file mode 100644 index 00000000..ef9ba126 --- /dev/null +++ b/evaluators/src/agent_control_evaluators/__init__.py @@ -0,0 +1,32 @@ +"""Agent Control Evaluators. + +This package contains evaluator implementations for agent-control. +Built-in evaluators (regex, list, json, sql) are registered automatically on import. + +Available evaluators: + - regex: Regular expression matching (built-in) + - list: List-based value matching (built-in) + - json: JSON validation (built-in) + - sql: SQL query validation (built-in) + - galileo-luna2: Galileo Luna-2 runtime protection (pip install agent-control-evaluators[luna2]) + +Custom evaluators are Evaluator classes deployed with the engine. +Their schemas are registered via initAgent for validation purposes. +""" + +from agent_control_models import Evaluator, EvaluatorMetadata, register_evaluator + +# Import built-in evaluators to auto-register them +from .builtin import JSONEvaluator, ListEvaluator, RegexEvaluator, SQLEvaluator + +__version__ = "0.1.0" + +__all__ = [ + "Evaluator", + "EvaluatorMetadata", + "register_evaluator", + "RegexEvaluator", + "ListEvaluator", + "JSONEvaluator", + "SQLEvaluator", +] diff --git a/evaluators/src/agent_control_evaluators/builtin/__init__.py b/evaluators/src/agent_control_evaluators/builtin/__init__.py new file mode 100644 index 00000000..6b82f363 --- /dev/null +++ b/evaluators/src/agent_control_evaluators/builtin/__init__.py @@ -0,0 +1,11 @@ +"""Built-in evaluators for agent-control. + +These evaluators are automatically registered when this module is imported. 
+""" + +from .json import JSONEvaluator +from .list import ListEvaluator +from .regex import RegexEvaluator +from .sql import SQLEvaluator + +__all__ = ["JSONEvaluator", "ListEvaluator", "RegexEvaluator", "SQLEvaluator"] diff --git a/plugins/src/agent_control_plugins/builtin/json.py b/evaluators/src/agent_control_evaluators/builtin/json.py similarity index 97% rename from plugins/src/agent_control_plugins/builtin/json.py rename to evaluators/src/agent_control_evaluators/builtin/json.py index fcff4521..5e3eb15f 100644 --- a/plugins/src/agent_control_plugins/builtin/json.py +++ b/evaluators/src/agent_control_evaluators/builtin/json.py @@ -1,4 +1,4 @@ -"""JSON validation plugin with schema, type, required field, constraint, and pattern checks.""" +"""JSON validation evaluator with schema, type, required field, constraint, and pattern checks.""" import asyncio import json @@ -6,18 +6,18 @@ import re2 from agent_control_models import ( + Evaluator, + EvaluatorMetadata, EvaluatorResult, - JSONControlEvaluatorPluginConfig, - PluginEvaluator, - PluginMetadata, - register_plugin, + JSONEvaluatorConfig, + register_evaluator, ) from jsonschema import Draft7Validator -@register_plugin -class JSONControlEvaluatorPlugin(PluginEvaluator[JSONControlEvaluatorPluginConfig]): - """Comprehensive JSON validation plugin. +@register_evaluator +class JSONEvaluator(Evaluator[JSONEvaluatorConfig]): + """Comprehensive JSON validation evaluator. Validates JSON data in this order (fail-fast, simple to complex): 1. 
JSON syntax/validity - Parse and validate JSON structure @@ -53,7 +53,7 @@ class JSONControlEvaluatorPlugin(PluginEvaluator[JSONControlEvaluatorPluginConfi {"field_patterns": {"email": "^[a-z0-9._%+-]+@[a-z0-9.-]+\\\\.[a-z]+$"}} """ - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="json", version="1.0.0", description=( @@ -62,9 +62,9 @@ class JSONControlEvaluatorPlugin(PluginEvaluator[JSONControlEvaluatorPluginConfi ), timeout_ms=15000, # Longer timeout for schema validation ) - config_model = JSONControlEvaluatorPluginConfig + config_model = JSONEvaluatorConfig - def __init__(self, config: JSONControlEvaluatorPluginConfig) -> None: + def __init__(self, config: JSONEvaluatorConfig) -> None: super().__init__(config) # Pre-compile schema validator (thread-safe, immutable) @@ -193,7 +193,6 @@ def _handle_parse_error(self, error: str) -> EvaluatorResult: matched=True, confidence=1.0, message=f"Invalid JSON blocked: {error}", - error=error, ) def _check_schema(self, data: dict | list) -> EvaluatorResult | None: diff --git a/plugins/src/agent_control_plugins/builtin/list.py b/evaluators/src/agent_control_evaluators/builtin/list.py similarity index 90% rename from plugins/src/agent_control_plugins/builtin/list.py rename to evaluators/src/agent_control_evaluators/builtin/list.py index 9e2ad484..227e448f 100644 --- a/plugins/src/agent_control_plugins/builtin/list.py +++ b/evaluators/src/agent_control_evaluators/builtin/list.py @@ -1,21 +1,21 @@ -"""List plugin for value matching.""" +"""List evaluator for value matching.""" import re from typing import Any import re2 from agent_control_models import ( + Evaluator, + EvaluatorMetadata, EvaluatorResult, - ListConfig, - PluginEvaluator, - PluginMetadata, - register_plugin, + ListEvaluatorConfig, + register_evaluator, ) -@register_plugin -class ListPlugin(PluginEvaluator[ListConfig]): - """List-based value matching plugin. 
+@register_evaluator +class ListEvaluator(Evaluator[ListEvaluatorConfig]): + """List-based value matching evaluator. Checks if data matches values in a list. Supports: - any/all logic (match any value vs match all values) @@ -28,14 +28,14 @@ class ListPlugin(PluginEvaluator[ListConfig]): {"values": ["approved"], "match_on": "no_match"} # Require approval """ - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="list", version="1.0.0", description="List-based value matching with flexible logic", ) - config_model = ListConfig + config_model = ListEvaluatorConfig - def __init__(self, config: ListConfig) -> None: + def __init__(self, config: ListEvaluatorConfig) -> None: super().__init__(config) self._values = [str(v) for v in config.values] self._regex: Any = self._build_regex() diff --git a/plugins/src/agent_control_plugins/builtin/regex.py b/evaluators/src/agent_control_evaluators/builtin/regex.py similarity index 82% rename from plugins/src/agent_control_plugins/builtin/regex.py rename to evaluators/src/agent_control_evaluators/builtin/regex.py index ecabfe42..7c3d04ae 100644 --- a/plugins/src/agent_control_plugins/builtin/regex.py +++ b/evaluators/src/agent_control_evaluators/builtin/regex.py @@ -1,20 +1,20 @@ -"""Regex plugin for pattern matching.""" +"""Regex evaluator for pattern matching.""" from typing import Any import re2 from agent_control_models import ( + Evaluator, + EvaluatorMetadata, EvaluatorResult, - PluginEvaluator, - PluginMetadata, - RegexConfig, - register_plugin, + RegexEvaluatorConfig, + register_evaluator, ) -@register_plugin -class RegexPlugin(PluginEvaluator[RegexConfig]): - """Regular expression pattern matching plugin. +@register_evaluator +class RegexEvaluator(Evaluator[RegexEvaluatorConfig]): + """Regular expression pattern matching evaluator. Matches data against a regex pattern using Google RE2 for safety (protects against ReDoS attacks). 
@@ -27,14 +27,14 @@ class RegexPlugin(PluginEvaluator[RegexConfig]): {"pattern": "secret", "flags": ["IGNORECASE"]} # Case-insensitive """ - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="regex", version="1.0.0", description="Regular expression pattern matching (RE2)", ) - config_model = RegexConfig + config_model = RegexEvaluatorConfig - def __init__(self, config: RegexConfig) -> None: + def __init__(self, config: RegexEvaluatorConfig) -> None: super().__init__(config) # Build pattern with flags pattern = config.pattern diff --git a/plugins/src/agent_control_plugins/builtin/sql.py b/evaluators/src/agent_control_evaluators/builtin/sql.py similarity index 98% rename from plugins/src/agent_control_plugins/builtin/sql.py rename to evaluators/src/agent_control_evaluators/builtin/sql.py index 6c2af493..b0bc5ebb 100644 --- a/plugins/src/agent_control_plugins/builtin/sql.py +++ b/evaluators/src/agent_control_evaluators/builtin/sql.py @@ -1,4 +1,4 @@ -"""Comprehensive SQL validation plugin. +"""Comprehensive SQL validation evaluator. Supports multi-statement, operation, table, column, and limit checking. """ @@ -11,11 +11,11 @@ import sqlglot from agent_control_models import ( + Evaluator, + EvaluatorMetadata, EvaluatorResult, - PluginEvaluator, - PluginMetadata, - SQLControlEvaluatorPluginConfig, - register_plugin, + SQLEvaluatorConfig, + register_evaluator, ) from sqlglot import exp @@ -53,9 +53,9 @@ class QueryAnalysis: defined_ctes: set[str] = field(default_factory=set) -@register_plugin -class SQLControlEvaluatorPlugin(PluginEvaluator[SQLControlEvaluatorPluginConfig]): - """Comprehensive SQL validation plugin. +@register_evaluator +class SQLEvaluator(Evaluator[SQLEvaluatorConfig]): + """Comprehensive SQL validation evaluator. Validates SQL queries in this order: 1. 
Multi-Statement: Control whether multiple SQL statements are allowed @@ -95,7 +95,7 @@ class SQLControlEvaluatorPlugin(PluginEvaluator[SQLControlEvaluatorPluginConfig] } """ - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="sql", version="1.0.0", description=( @@ -104,7 +104,7 @@ class SQLControlEvaluatorPlugin(PluginEvaluator[SQLControlEvaluatorPluginConfig] ), timeout_ms=10000, ) - config_model = SQLControlEvaluatorPluginConfig + config_model = SQLEvaluatorConfig # SQL operation type mappings DDL_OPERATIONS = { @@ -134,7 +134,7 @@ class SQLControlEvaluatorPlugin(PluginEvaluator[SQLControlEvaluatorPluginConfig] "SET TRANSACTION", } - def __init__(self, config: SQLControlEvaluatorPluginConfig) -> None: + def __init__(self, config: SQLEvaluatorConfig) -> None: super().__init__(config) # Pre-process operation controls @@ -698,16 +698,16 @@ async def evaluate(self, data: Any) -> EvaluatorResult: try: return await asyncio.to_thread(self._evaluate_sync, data) except Exception as e: - # Unexpected plugin error - fail open with error field set + # Unexpected evaluator error - fail open with error field set logger.error( - "SQL plugin unexpected error", + "SQL evaluator unexpected error", exc_info=True, extra={"error_type": type(e).__name__}, ) return EvaluatorResult( matched=False, confidence=0.0, - message="SQL plugin encountered an unexpected error", + message="SQL evaluator encountered an unexpected error", error=f"{type(e).__name__}: {str(e)[:200]}", ) @@ -786,7 +786,7 @@ def _evaluate_sync(self, data: Any) -> EvaluatorResult: }, ) - # Invalid SQL fails validation (not a plugin error, just bad input) + # Invalid SQL fails validation (not an evaluator error, just bad input) return EvaluatorResult( matched=True, confidence=1.0, diff --git a/plugins/src/agent_control_plugins/luna2/__init__.py b/evaluators/src/agent_control_evaluators/luna2/__init__.py similarity index 71% rename from plugins/src/agent_control_plugins/luna2/__init__.py rename to 
evaluators/src/agent_control_evaluators/luna2/__init__.py index 32bfc5b2..1ead7014 100644 --- a/plugins/src/agent_control_plugins/luna2/__init__.py +++ b/evaluators/src/agent_control_evaluators/luna2/__init__.py @@ -1,10 +1,10 @@ -"""Galileo Luna-2 plugin for agent-control. +"""Galileo Luna-2 evaluator for agent-control. -This plugin integrates with Galileo's Luna-2 enterprise runtime protection system +This evaluator integrates with Galileo's Luna-2 enterprise runtime protection system using direct HTTP API calls (no SDK dependency required). Installation: - pip install agent-control-plugins[luna2] + pip install agent-control-evaluators[luna2] Environment Variables: GALILEO_API_KEY: Your Galileo API key (required) @@ -15,14 +15,14 @@ https://v2docs.galileo.ai/sdk-api/python/reference/protect """ -from .config import Luna2Config, Luna2Metric, Luna2Operator -from .plugin import LUNA2_AVAILABLE, Luna2Plugin +from .config import Luna2EvaluatorConfig, Luna2Metric, Luna2Operator +from .evaluator import LUNA2_AVAILABLE, Luna2Evaluator __all__ = [ - "Luna2Config", + "Luna2EvaluatorConfig", "Luna2Metric", "Luna2Operator", - "Luna2Plugin", + "Luna2Evaluator", "LUNA2_AVAILABLE", ] diff --git a/plugins/src/agent_control_plugins/luna2/client.py b/evaluators/src/agent_control_evaluators/luna2/client.py similarity index 99% rename from plugins/src/agent_control_plugins/luna2/client.py rename to evaluators/src/agent_control_evaluators/luna2/client.py index c05c69a6..192fd868 100644 --- a/plugins/src/agent_control_plugins/luna2/client.py +++ b/evaluators/src/agent_control_evaluators/luna2/client.py @@ -380,4 +380,3 @@ async def __aenter__(self) -> "GalileoProtectClient": async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: """Async context manager exit.""" await self.close() - diff --git a/plugins/src/agent_control_plugins/luna2/config.py b/evaluators/src/agent_control_evaluators/luna2/config.py similarity index 93% rename from 
plugins/src/agent_control_plugins/luna2/config.py rename to evaluators/src/agent_control_evaluators/luna2/config.py index 1b871427..44e4563a 100644 --- a/plugins/src/agent_control_plugins/luna2/config.py +++ b/evaluators/src/agent_control_evaluators/luna2/config.py @@ -1,4 +1,4 @@ -"""Configuration models for Luna-2 plugin.""" +"""Configuration models for Luna-2 evaluator.""" from typing import Any, Literal, Union @@ -20,8 +20,8 @@ Luna2Operator = Literal["gt", "lt", "gte", "lte", "eq", "contains", "any"] -class Luna2Config(BaseModel): - """Configuration for Luna-2 plugin. +class Luna2EvaluatorConfig(BaseModel): + """Configuration for Luna-2 evaluator. Two stage types are supported: - local: Define rules at runtime (requires metric, operator, target_value) @@ -29,7 +29,7 @@ class Luna2Config(BaseModel): Example (local stage with numeric threshold - recommended): ```python - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="local", metric="input_toxicity", operator="gt", @@ -40,7 +40,7 @@ class Luna2Config(BaseModel): Example (central stage - recommended for production): ```python - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="central", stage_name="production-guard", galileo_project="my-project", @@ -105,7 +105,7 @@ class Luna2Config(BaseModel): ) @model_validator(mode="after") - def validate_stage_config(self) -> "Luna2Config": + def validate_stage_config(self) -> "Luna2EvaluatorConfig": """Validate that required fields are present based on stage_type.""" if self.stage_type == "local": if not self.metric: @@ -118,4 +118,3 @@ def validate_stage_config(self) -> "Luna2Config": if not self.stage_name: raise ValueError("'stage_name' is required for central stage") return self - diff --git a/plugins/src/agent_control_plugins/luna2/plugin.py b/evaluators/src/agent_control_evaluators/luna2/evaluator.py similarity index 91% rename from plugins/src/agent_control_plugins/luna2/plugin.py rename to 
evaluators/src/agent_control_evaluators/luna2/evaluator.py index e40c90d2..5efaf321 100644 --- a/plugins/src/agent_control_plugins/luna2/plugin.py +++ b/evaluators/src/agent_control_evaluators/luna2/evaluator.py @@ -1,6 +1,6 @@ -"""Luna-2 plugin implementation using direct API calls. +"""Luna-2 evaluator implementation using direct API calls. -This plugin calls the Galileo Protect API directly via HTTP, without requiring +This evaluator calls the Galileo Protect API directly via HTTP, without requiring the full galileo-sdk package. Only httpx is needed as a dependency. """ @@ -9,13 +9,13 @@ from typing import Any from agent_control_models import ( + Evaluator, + EvaluatorMetadata, EvaluatorResult, - PluginEvaluator, - PluginMetadata, - register_plugin, + register_evaluator, ) -from .config import Luna2Config +from .config import Luna2EvaluatorConfig logger = logging.getLogger(__name__) @@ -44,11 +44,11 @@ Ruleset = None # type: ignore -@register_plugin -class Luna2Plugin(PluginEvaluator[Luna2Config]): - """Galileo Luna-2 runtime protection plugin. +@register_evaluator +class Luna2Evaluator(Evaluator[Luna2EvaluatorConfig]): + """Galileo Luna-2 runtime protection evaluator. - This plugin uses Galileo's Luna-2 enterprise model for real-time + This evaluator uses Galileo's Luna-2 enterprise model for real-time safety and quality checks on agent inputs and outputs. It calls the Galileo Protect API directly via HTTP. 
@@ -67,9 +67,9 @@ class Luna2Plugin(PluginEvaluator[Luna2Config]): Example: ```python - from agent_control_plugins.luna2 import Luna2Plugin, Luna2Config + from agent_control_evaluators.luna2 import Luna2Evaluator, Luna2EvaluatorConfig - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="local", metric="input_toxicity", operator="gt", @@ -77,8 +77,8 @@ class Luna2Plugin(PluginEvaluator[Luna2Config]): galileo_project="my-project", ) - plugin = Luna2Plugin(config) - result = await plugin.evaluate("some text") + evaluator = Luna2Evaluator(config) + result = await evaluator.evaluate("some text") ``` Environment Variables: @@ -86,25 +86,25 @@ class Luna2Plugin(PluginEvaluator[Luna2Config]): GALILEO_CONSOLE_URL: Galileo Console URL (optional). """ - metadata = PluginMetadata( + metadata = EvaluatorMetadata( name="galileo-luna2", version="2.0.0", description="Galileo Luna-2 enterprise runtime protection (direct API)", requires_api_key=True, timeout_ms=10000, ) - config_model = Luna2Config + config_model = Luna2EvaluatorConfig @classmethod def is_available(cls) -> bool: """Check if httpx dependency is installed.""" return LUNA2_AVAILABLE - def __init__(self, config: Luna2Config) -> None: - """Initialize Luna-2 plugin with configuration. + def __init__(self, config: Luna2EvaluatorConfig) -> None: + """Initialize Luna-2 evaluator with configuration. Args: - config: Validated Luna2Config instance. + config: Validated Luna2EvaluatorConfig instance. Raises: ValueError: If GALILEO_API_KEY is not set. 
diff --git a/plugins/src/agent_control_plugins/py.typed b/evaluators/src/agent_control_evaluators/py.typed similarity index 100% rename from plugins/src/agent_control_plugins/py.typed rename to evaluators/src/agent_control_evaluators/py.typed diff --git a/evaluators/tests/__init__.py b/evaluators/tests/__init__.py new file mode 100644 index 00000000..9cf66e72 --- /dev/null +++ b/evaluators/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for agent_control_evaluators package.""" diff --git a/evaluators/tests/test_base.py b/evaluators/tests/test_base.py new file mode 100644 index 00000000..e5bdc5da --- /dev/null +++ b/evaluators/tests/test_base.py @@ -0,0 +1,141 @@ +"""Tests for evaluator base classes. + +Architecture: Evaluators take config at __init__, evaluate() only takes data. +""" + +import pytest +from typing import Any + +from pydantic import BaseModel + +from agent_control_models import EvaluatorResult, Evaluator, EvaluatorMetadata + + +class MockConfig(BaseModel): + """Config model for mock evaluator.""" + + should_match: bool = False + timeout_ms: int = 5000 + + +class MockEvaluator(Evaluator[MockConfig]): + """A mock evaluator for testing.""" + + metadata = EvaluatorMetadata( + name="mock-evaluator", + version="1.0.0", + description="A mock evaluator for testing", + requires_api_key=False, + timeout_ms=5000, + ) + config_model = MockConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + """Simple mock evaluation.""" + return EvaluatorResult( + matched=self.config.should_match, + confidence=1.0, + message="Mock evaluation", + metadata={"data": str(data)}, + ) + + +class TestEvaluatorMetadata: + """Tests for EvaluatorMetadata dataclass.""" + + def test_metadata_with_defaults(self): + """Test metadata with default values.""" + metadata = EvaluatorMetadata( + name="test-evaluator", + version="1.0.0", + description="Test evaluator", + ) + + assert metadata.name == "test-evaluator" + assert metadata.version == "1.0.0" + assert metadata.description == 
"Test evaluator" + assert metadata.requires_api_key is False + assert metadata.timeout_ms == 10000 + + def test_metadata_with_all_fields(self): + """Test metadata with all fields specified.""" + metadata = EvaluatorMetadata( + name="full-evaluator", + version="2.0.0", + description="Full evaluator", + requires_api_key=True, + timeout_ms=15000, + ) + + assert metadata.name == "full-evaluator" + assert metadata.version == "2.0.0" + assert metadata.requires_api_key is True + assert metadata.timeout_ms == 15000 + + +class TestEvaluator: + """Tests for Evaluator base class.""" + + def test_evaluator_is_abstract(self): + """Test that Evaluator is an ABC.""" + from abc import ABC + assert issubclass(Evaluator, ABC) + + def test_mock_evaluator_metadata(self): + """Test that mock evaluator has correct metadata.""" + assert MockEvaluator.metadata.name == "mock-evaluator" + assert MockEvaluator.metadata.version == "1.0.0" + assert MockEvaluator.metadata.timeout_ms == 5000 + + @pytest.mark.asyncio + async def test_mock_evaluator_evaluate(self): + """Test mock evaluator evaluation.""" + evaluator = MockEvaluator.from_dict({"should_match": True}) + + result = await evaluator.evaluate("test data") + + assert result.matched is True + assert result.confidence == 1.0 + assert result.metadata["data"] == "test data" + + @pytest.mark.asyncio + async def test_mock_evaluator_evaluate_no_match(self): + """Test mock evaluator evaluation without match.""" + evaluator = MockEvaluator.from_dict({"should_match": False}) + + result = await evaluator.evaluate("test data") + + assert result.matched is False + + def test_evaluator_config_stored(self): + """Test that evaluator stores config.""" + evaluator = MockEvaluator.from_dict({"should_match": True}) + + assert isinstance(evaluator.config, MockConfig) + assert evaluator.config.should_match is True + + def test_get_timeout_seconds_from_config(self): + """Test timeout conversion from config.""" + evaluator = 
MockEvaluator.from_dict({"timeout_ms": 3000}) + + assert evaluator.get_timeout_seconds() == 3.0 + + def test_get_timeout_seconds_different_values(self): + """Test timeout with different values.""" + evaluator1 = MockEvaluator.from_dict({"timeout_ms": 7500}) + evaluator2 = MockEvaluator.from_dict({"timeout_ms": 1000}) + + assert evaluator1.get_timeout_seconds() == 7.5 + assert evaluator2.get_timeout_seconds() == 1.0 + + def test_get_timeout_seconds_from_default(self): + """Test timeout uses metadata default when not in config.""" + evaluator = MockEvaluator.from_dict({}) # No timeout_ms in config + + # MockConfig has default timeout_ms=5000 + assert evaluator.get_timeout_seconds() == 5.0 + + def test_cannot_instantiate_abstract_class(self): + """Test that Evaluator cannot be instantiated directly.""" + with pytest.raises(TypeError, match="abstract"): + Evaluator({}) # type: ignore diff --git a/plugins/tests/test_json.py b/evaluators/tests/test_json.py similarity index 63% rename from plugins/tests/test_json.py rename to evaluators/tests/test_json.py index 1ae49e8a..77d52243 100644 --- a/plugins/tests/test_json.py +++ b/evaluators/tests/test_json.py @@ -1,8 +1,8 @@ -"""Tests for JSON validation plugin.""" +"""Tests for JSON validation evaluator.""" import pytest -from agent_control_models import JSONControlEvaluatorPluginConfig -from agent_control_plugins.builtin.json import JSONControlEvaluatorPlugin +from agent_control_models import JSONEvaluatorConfig +from agent_control_evaluators.builtin.json import JSONEvaluator class TestJSONParsing: @@ -11,40 +11,40 @@ class TestJSONParsing: @pytest.mark.asyncio async def test_dict_input(self): """Test that dict input is accepted as-is.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(required_fields=["id"])) - result = await plugin.evaluate({"id": 123}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(required_fields=["id"])) + result = await evaluator.evaluate({"id": 123}) assert result.matched is 
False # Validation passed @pytest.mark.asyncio async def test_json_string_input(self): """Test that JSON string input is parsed correctly.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(required_fields=["id"])) - result = await plugin.evaluate('{"id": 123}') + evaluator = JSONEvaluator(JSONEvaluatorConfig(required_fields=["id"])) + result = await evaluator.evaluate('{"id": 123}') assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_invalid_json_blocked_by_default(self): """Test that invalid JSON is blocked by default.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(required_fields=["id"])) - result = await plugin.evaluate("{invalid json") + evaluator = JSONEvaluator(JSONEvaluatorConfig(required_fields=["id"])) + result = await evaluator.evaluate("{invalid json") assert result.matched is True # Blocked by default assert "Invalid JSON blocked" in result.message @pytest.mark.asyncio async def test_invalid_json_allowed_when_configured(self): """Test that invalid JSON is allowed when allow_invalid_json=True.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(required_fields=["id"], allow_invalid_json=True) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(required_fields=["id"], allow_invalid_json=True) ) - result = await plugin.evaluate("{invalid json") + result = await evaluator.evaluate("{invalid json") assert result.matched is False assert "Invalid JSON allowed" in result.message @pytest.mark.asyncio async def test_none_input(self): """Test that None input is handled gracefully.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(required_fields=["id"])) - result = await plugin.evaluate(None) + evaluator = JSONEvaluator(JSONEvaluatorConfig(required_fields=["id"])) + result = await evaluator.evaluate(None) assert result.matched is True assert "None" in result.message @@ -60,16 +60,16 @@ async def test_valid_schema(self): "required": 
["id", "name"], "properties": {"id": {"type": "integer"}, "name": {"type": "string"}}, } - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(json_schema=schema)) - result = await plugin.evaluate({"id": 1, "name": "test"}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(json_schema=schema)) + result = await evaluator.evaluate({"id": 1, "name": "test"}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_invalid_schema_missing_required(self): """Test that missing required fields fail schema validation.""" schema = {"type": "object", "required": ["id", "name"]} - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(json_schema=schema)) - result = await plugin.evaluate({"id": 1}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(json_schema=schema)) + result = await evaluator.evaluate({"id": 1}) assert result.matched is True # Failed assert "Schema validation failed" in result.message assert "'name' is a required property" in result.message @@ -78,8 +78,8 @@ async def test_invalid_schema_missing_required(self): async def test_invalid_schema_wrong_type(self): """Test that wrong type fails schema validation.""" schema = {"type": "object", "properties": {"id": {"type": "integer"}}} - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(json_schema=schema)) - result = await plugin.evaluate({"id": "not-an-int"}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(json_schema=schema)) + result = await evaluator.evaluate({"id": "not-an-int"}) assert result.matched is True # Failed assert "Schema validation failed" in result.message @@ -96,19 +96,19 @@ async def test_nested_object_validation(self): } }, } - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(json_schema=schema)) - result = await plugin.evaluate({"user": {"id": 123}}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(json_schema=schema)) + result = await evaluator.evaluate({"user": {"id": 123}}) assert result.matched 
is False # Validation passed @pytest.mark.asyncio async def test_array_validation(self): """Test schema validation on arrays.""" schema = {"type": "array", "items": {"type": "integer"}} - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(json_schema=schema)) - result = await plugin.evaluate([1, 2, 3]) + evaluator = JSONEvaluator(JSONEvaluatorConfig(json_schema=schema)) + result = await evaluator.evaluate([1, 2, 3]) assert result.matched is False # Validation passed - result = await plugin.evaluate([1, "not-int", 3]) + result = await evaluator.evaluate([1, "not-int", 3]) assert result.matched is True # Failed @@ -118,42 +118,42 @@ class TestRequiredFieldsValidation: @pytest.mark.asyncio async def test_all_present(self): """Test that all required fields present passes validation.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(required_fields=["id", "name", "email"])) - result = await plugin.evaluate({"id": 1, "name": "test", "email": "test@example.com"}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(required_fields=["id", "name", "email"])) + result = await evaluator.evaluate({"id": 1, "name": "test", "email": "test@example.com"}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_missing_field(self): """Test that missing required field fails validation.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(required_fields=["id", "name"])) - result = await plugin.evaluate({"id": 1}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(required_fields=["id", "name"])) + result = await evaluator.evaluate({"id": 1}) assert result.matched is True # Failed assert "Missing required fields: name" in result.message @pytest.mark.asyncio async def test_null_allowed(self): """Test that null values are allowed when configured.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(required_fields=["id"], allow_null_required=True) + evaluator = JSONEvaluator( + 
JSONEvaluatorConfig(required_fields=["id"], allow_null_required=True) ) - result = await plugin.evaluate({"id": None}) + result = await evaluator.evaluate({"id": None}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_null_disallowed(self): """Test that null values fail when disallowed.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(required_fields=["id"], allow_null_required=False) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(required_fields=["id"], allow_null_required=False) ) - result = await plugin.evaluate({"id": None}) + result = await evaluator.evaluate({"id": None}) assert result.matched is True # Failed assert "null not allowed" in result.message @pytest.mark.asyncio async def test_nested_required_fields(self): """Test required fields validation on nested paths.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(required_fields=["user.id", "user.email"])) - result = await plugin.evaluate({"user": {"id": 123, "email": "test@example.com"}}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(required_fields=["user.id", "user.email"])) + result = await evaluator.evaluate({"user": {"id": 123, "email": "test@example.com"}}) assert result.matched is False # Validation passed @@ -163,8 +163,8 @@ class TestTypesValidation: @pytest.mark.asyncio async def test_all_types_match(self): """Test that all types matching passes validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_types={ "id": "string", "age": "integer", @@ -176,7 +176,7 @@ async def test_all_types_match(self): } ) ) - result = await plugin.evaluate( + result = await evaluator.evaluate( { "id": "123", "age": 25, @@ -192,85 +192,80 @@ async def test_all_types_match(self): @pytest.mark.asyncio async def test_type_mismatch(self): """Test that type mismatch fails validation.""" - plugin = 
JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(field_types={"id": "string"})) - result = await plugin.evaluate({"id": 123}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(field_types={"id": "string"})) + result = await evaluator.evaluate({"id": 123}) assert result.matched is True # Failed assert "expected string, got integer" in result.message @pytest.mark.asyncio async def test_missing_field(self): """Test that missing field fails type validation.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(field_types={"id": "string"})) - result = await plugin.evaluate({"name": "test"}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(field_types={"id": "string"})) + result = await evaluator.evaluate({"name": "test"}) assert result.matched is True # Failed assert "field not found" in result.message @pytest.mark.asyncio async def test_nested_field_types(self): """Test type checking on nested fields.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_types={"user.id": "integer", "user.name": "string"}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_types={"user.id": "integer", "user.name": "string"}) ) - result = await plugin.evaluate({"user": {"id": 123, "name": "test"}}) + result = await evaluator.evaluate({"user": {"id": 123, "name": "test"}}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_extra_fields_allowed(self): """Test that extra fields are allowed by default.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_types={"id": "string"}, allow_extra_fields=True) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_types={"id": "string"}, allow_extra_fields=True) ) - result = await plugin.evaluate({"id": "123", "extra": "field"}) + result = await evaluator.evaluate({"id": "123", "extra": "field"}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_extra_fields_denied(self): """Test 
that extra fields can be denied.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_types={"id": "string"}, allow_extra_fields=False) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_types={"id": "string"}, allow_extra_fields=False) ) - result = await plugin.evaluate({"id": "123", "extra": "field"}) + result = await evaluator.evaluate({"id": "123", "extra": "field"}) assert result.matched is True # Failed assert "Extra fields not allowed" in result.message @pytest.mark.asyncio async def test_array_input_fails_type_check(self): """Test that array input fails type checking gracefully.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(field_types={"id": "string"})) - result = await plugin.evaluate([1, 2, 3]) + evaluator = JSONEvaluator(JSONEvaluatorConfig(field_types={"id": "string"})) + result = await evaluator.evaluate([1, 2, 3]) assert result.matched is True # Failed assert "requires a JSON object, got array" in result.message @pytest.mark.asyncio async def test_nested_fields_with_strict_mode_no_extra_fields(self): - """Test P1 fix: Nested fields with allow_extra_fields=False should not flag parent containers. - - This was a critical bug where specifying field_types={"user.id": "string"} with - allow_extra_fields=False would incorrectly flag "user" as an extra field, even though - it's just a container for the typed field "user.id". 
- """ - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + """Test P1 fix: Nested fields with allow_extra_fields=False should not flag parent containers.""" + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_types={"user.id": "string"}, allow_extra_fields=False, ) ) # Should pass: "user" is a container, "user.id" is the typed leaf field - result = await plugin.evaluate({"user": {"id": "123"}}) + result = await evaluator.evaluate({"user": {"id": "123"}}) assert result.matched is False # Validation passed assert "Extra fields" not in result.message @pytest.mark.asyncio async def test_nested_fields_strict_mode_detects_actual_extra_leaf_fields(self): """Test that strict mode still catches actual extra leaf fields in nested objects.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_types={"user.id": "string"}, allow_extra_fields=False, ) ) # Should fail: "user.name" is an extra leaf field not in field_types - result = await plugin.evaluate({"user": {"id": "123", "name": "test"}}) + result = await evaluator.evaluate({"user": {"id": "123", "name": "test"}}) assert result.matched is True # Failed assert "Extra fields not allowed" in result.message assert "user.name" in result.message @@ -278,45 +273,41 @@ async def test_nested_fields_strict_mode_detects_actual_extra_leaf_fields(self): @pytest.mark.asyncio async def test_multiple_nested_levels_strict_mode(self): """Test strict mode with multiple levels of nesting.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_types={"user.profile.email": "string"}, allow_extra_fields=False, ) ) # Should pass: "user" and "user.profile" are containers - result = await plugin.evaluate({"user": {"profile": {"email": "test@example.com"}}}) + result = await evaluator.evaluate({"user": {"profile": {"email": "test@example.com"}}}) assert 
result.matched is False # Validation passed @pytest.mark.asyncio async def test_nested_fields_with_required_and_strict_mode(self): - """Test nested fields with both required_fields and strict mode. - - When both required_fields and field_types are specified with allow_extra_fields=False, - the plugin should allow fields that are either in field_types OR required_fields. - """ - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + """Test nested fields with both required_fields and strict mode.""" + evaluator = JSONEvaluator( + JSONEvaluatorConfig( required_fields=["user.id", "user.email"], field_types={"user.id": "string"}, allow_extra_fields=False, ) ) # Should pass: both user.id and user.email are allowed (one typed, one required) - result = await plugin.evaluate({"user": {"id": "123", "email": "test@example.com"}}) + result = await evaluator.evaluate({"user": {"id": "123", "email": "test@example.com"}}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_strict_mode_top_level_extra_field_still_detected(self): """Test that top-level extra fields are still detected in strict mode.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_types={"id": "string"}, allow_extra_fields=False, ) ) # Should fail: "extra" is a top-level extra field - result = await plugin.evaluate({"id": "123", "extra": "field"}) + result = await evaluator.evaluate({"id": "123", "extra": "field"}) assert result.matched is True # Failed assert "Extra fields not allowed" in result.message @@ -327,97 +318,97 @@ class TestConstraintsValidation: @pytest.mark.asyncio async def test_numeric_range_within_bounds(self): """Test that numeric value within range passes validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"score": {"min": 0.0, "max": 1.0}}) + evaluator = JSONEvaluator( + 
JSONEvaluatorConfig(field_constraints={"score": {"min": 0.0, "max": 1.0}}) ) - result = await plugin.evaluate({"score": 0.75}) + result = await evaluator.evaluate({"score": 0.75}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_numeric_range_below_min(self): """Test that value below minimum fails validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"score": {"min": 0.0, "max": 1.0}}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_constraints={"score": {"min": 0.0, "max": 1.0}}) ) - result = await plugin.evaluate({"score": -0.5}) + result = await evaluator.evaluate({"score": -0.5}) assert result.matched is True # Failed assert "below minimum" in result.message @pytest.mark.asyncio async def test_numeric_range_above_max(self): """Test that value above maximum fails validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"score": {"min": 0.0, "max": 1.0}}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_constraints={"score": {"min": 0.0, "max": 1.0}}) ) - result = await plugin.evaluate({"score": 1.5}) + result = await evaluator.evaluate({"score": 1.5}) assert result.matched is True # Failed assert "above maximum" in result.message @pytest.mark.asyncio async def test_integer_range(self): """Test integer range constraints.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"count": {"min": -10, "max": 5}}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_constraints={"count": {"min": -10, "max": 5}}) ) - result = await plugin.evaluate({"count": 3}) + result = await evaluator.evaluate({"count": 3}) assert result.matched is False # Validation passed - result = await plugin.evaluate({"count": 10}) + result = await evaluator.evaluate({"count": 10}) assert result.matched is True # Failed @pytest.mark.asyncio async def test_enum_valid_value(self): """Test that 
valid enum value passes validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"status": {"enum": ["pending", "approved", "rejected"]}}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_constraints={"status": {"enum": ["pending", "approved", "rejected"]}}) ) - result = await plugin.evaluate({"status": "approved"}) + result = await evaluator.evaluate({"status": "approved"}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_enum_invalid_value(self): """Test that invalid enum value fails validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"status": {"enum": ["pending", "approved", "rejected"]}}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_constraints={"status": {"enum": ["pending", "approved", "rejected"]}}) ) - result = await plugin.evaluate({"status": "invalid"}) + result = await evaluator.evaluate({"status": "invalid"}) assert result.matched is True # Failed assert "not in allowed values" in result.message @pytest.mark.asyncio async def test_string_length_within_range(self): """Test that string length within range passes validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"username": {"min_length": 3, "max_length": 20}}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_constraints={"username": {"min_length": 3, "max_length": 20}}) ) - result = await plugin.evaluate({"username": "test_user"}) + result = await evaluator.evaluate({"username": "test_user"}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_string_length_too_short(self): """Test that string shorter than minimum fails validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"username": {"min_length": 3, "max_length": 20}}) + evaluator = JSONEvaluator( + 
JSONEvaluatorConfig(field_constraints={"username": {"min_length": 3, "max_length": 20}}) ) - result = await plugin.evaluate({"username": "ab"}) + result = await evaluator.evaluate({"username": "ab"}) assert result.matched is True # Failed assert "below minimum" in result.message @pytest.mark.asyncio async def test_string_length_too_long(self): """Test that string longer than maximum fails validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"username": {"min_length": 3, "max_length": 20}}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_constraints={"username": {"min_length": 3, "max_length": 20}}) ) - result = await plugin.evaluate({"username": "a" * 25}) + result = await evaluator.evaluate({"username": "a" * 25}) assert result.matched is True # Failed assert "above maximum" in result.message @pytest.mark.asyncio async def test_mixed_constraints(self): """Test multiple constraint types on different fields.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_constraints={ "score": {"min": 0.0, "max": 1.0}, "status": {"enum": ["active", "inactive"]}, @@ -425,7 +416,7 @@ async def test_mixed_constraints(self): } ) ) - result = await plugin.evaluate({"score": 0.8, "status": "active", "name": "Test"}) + result = await evaluator.evaluate({"score": 0.8, "status": "active", "name": "Test"}) assert result.matched is False # Validation passed @@ -435,8 +426,8 @@ class TestPatternMatching: @pytest.mark.asyncio async def test_all_patterns_match(self): """Test that all patterns matching passes validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_patterns={ "email": r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]+$", "phone": r"^\+?[1-9]\d{1,14}$", @@ -444,14 +435,14 @@ async def test_all_patterns_match(self): pattern_match_logic="all", ) ) - result 
= await plugin.evaluate({"email": "test@example.com", "phone": "+1234567890"}) + result = await evaluator.evaluate({"email": "test@example.com", "phone": "+1234567890"}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_pattern_fails_all_mode(self): """Test that one pattern failing fails 'all' mode validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_patterns={ "email": r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]+$", "phone": r"^\+?[1-9]\d{1,14}$", @@ -459,15 +450,15 @@ async def test_pattern_fails_all_mode(self): pattern_match_logic="all", ) ) - result = await plugin.evaluate({"email": "invalid", "phone": "+1234567890"}) + result = await evaluator.evaluate({"email": "invalid", "phone": "+1234567890"}) assert result.matched is True # Failed assert "Pattern validation failed" in result.message @pytest.mark.asyncio async def test_any_pattern_match(self): """Test that any pattern matching passes 'any' mode validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_patterns={ "email": r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]+$", "phone": r"^\+?[1-9]\d{1,14}$", @@ -475,19 +466,19 @@ async def test_any_pattern_match(self): pattern_match_logic="any", ) ) - result = await plugin.evaluate({"email": "test@example.com", "phone": "invalid"}) + result = await evaluator.evaluate({"email": "test@example.com", "phone": "invalid"}) assert result.matched is False # Validation passed (email matched) @pytest.mark.asyncio async def test_no_patterns_match_any_mode(self): """Test that no patterns matching fails 'any' mode validation.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_patterns={"email": r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]+$"}, pattern_match_logic="any", ) ) - result = await 
plugin.evaluate({"email": "invalid"}) + result = await evaluator.evaluate({"email": "invalid"}) assert result.matched is True # Failed assert "No patterns matched" in result.message @@ -498,54 +489,54 @@ class TestCombinedValidation: @pytest.mark.asyncio async def test_all_checks_pass(self): """Test that all checks passing results in validation success.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( required_fields=["id", "email"], field_types={"id": "string", "email": "string", "age": "integer"}, field_constraints={"age": {"min": 0, "max": 120}}, field_patterns={"email": r"^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]+$"}, ) ) - result = await plugin.evaluate({"id": "123", "email": "test@example.com", "age": 30}) + result = await evaluator.evaluate({"id": "123", "email": "test@example.com", "age": 30}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_fails_at_required_check(self): """Test that validation fails at required fields check.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( required_fields=["id", "email"], field_types={"id": "string", "email": "string"}, ) ) - result = await plugin.evaluate({"id": "123"}) # Missing email + result = await evaluator.evaluate({"id": "123"}) # Missing email assert result.matched is True # Failed assert "Missing required fields" in result.message @pytest.mark.asyncio async def test_fails_at_type_check(self): """Test that validation fails at type check.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( required_fields=["id"], field_types={"id": "integer"}, ) ) - result = await plugin.evaluate({"id": "not-an-int"}) + result = await evaluator.evaluate({"id": "not-an-int"}) assert result.matched is True # Failed assert "Type validation failed" in result.message 
@pytest.mark.asyncio async def test_fails_at_constraint_check(self): """Test that validation fails at constraint check.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( required_fields=["score"], field_types={"score": "number"}, field_constraints={"score": {"min": 0.0, "max": 1.0}}, ) ) - result = await plugin.evaluate({"score": 1.5}) + result = await evaluator.evaluate({"score": 1.5}) assert result.matched is True # Failed assert "Constraint validation failed" in result.message @@ -558,32 +549,32 @@ def test_invalid_schema_rejected(self): from jsonschema.exceptions import SchemaError with pytest.raises(SchemaError): - JSONControlEvaluatorPluginConfig(json_schema={"type": "invalid-type"}) + JSONEvaluatorConfig(json_schema={"type": "invalid-type"}) def test_invalid_type_name_rejected(self): """Test that invalid type name is rejected at config time.""" with pytest.raises(ValueError, match="Invalid type"): - JSONControlEvaluatorPluginConfig(field_types={"id": "invalid-type"}) + JSONEvaluatorConfig(field_types={"id": "invalid-type"}) def test_invalid_regex_pattern_rejected(self): """Test that invalid regex pattern is rejected at config time.""" with pytest.raises(ValueError, match="Invalid regex"): - JSONControlEvaluatorPluginConfig(field_patterns={"email": "["}) # Invalid regex + JSONEvaluatorConfig(field_patterns={"email": "["}) # Invalid regex def test_empty_enum_rejected(self): """Test that empty enum list is rejected at config time.""" with pytest.raises(ValueError, match="non-empty list"): - JSONControlEvaluatorPluginConfig(field_constraints={"status": {"enum": []}}) + JSONEvaluatorConfig(field_constraints={"status": {"enum": []}}) def test_invalid_min_length_type_rejected(self): """Test that non-integer min_length is rejected at config time.""" with pytest.raises(ValueError, match="must be an integer"): - JSONControlEvaluatorPluginConfig(field_constraints={"name": {"min_length": 
"invalid"}}) + JSONEvaluatorConfig(field_constraints={"name": {"min_length": "invalid"}}) def test_at_least_one_check_required(self): """Test that at least one validation check must be configured.""" with pytest.raises(ValueError, match="At least one validation check"): - JSONControlEvaluatorPluginConfig() + JSONEvaluatorConfig() class TestNestedValues: @@ -592,30 +583,30 @@ class TestNestedValues: @pytest.mark.asyncio async def test_deep_nesting(self): """Test validation on deeply nested fields.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(required_fields=["a.b.c.d.e"], field_types={"a.b.c.d.e": "integer"}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(required_fields=["a.b.c.d.e"], field_types={"a.b.c.d.e": "integer"}) ) - result = await plugin.evaluate({"a": {"b": {"c": {"d": {"e": 42}}}}}) + result = await evaluator.evaluate({"a": {"b": {"c": {"d": {"e": 42}}}}}) assert result.matched is False # Validation passed @pytest.mark.asyncio async def test_missing_intermediate_key(self): """Test that missing intermediate key is handled gracefully.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(required_fields=["a.b.c"])) - result = await plugin.evaluate({"a": {"x": 1}}) # Missing 'b' + evaluator = JSONEvaluator(JSONEvaluatorConfig(required_fields=["a.b.c"])) + result = await evaluator.evaluate({"a": {"x": 1}}) # Missing 'b' assert result.matched is True # Failed assert "Missing required fields" in result.message @pytest.mark.asyncio async def test_constraints_on_nested_fields(self): """Test constraints on nested field paths.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"user.age": {"min": 0, "max": 120}}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_constraints={"user.age": {"min": 0, "max": 120}}) ) - result = await plugin.evaluate({"user": {"age": 30}}) + result = await evaluator.evaluate({"user": {"age": 30}}) assert result.matched is False 
# Validation passed - result = await plugin.evaluate({"user": {"age": 150}}) + result = await evaluator.evaluate({"user": {"age": 150}}) assert result.matched is True # Failed @@ -625,46 +616,46 @@ class TestEnumCaseSensitivity: @pytest.mark.asyncio async def test_enum_case_sensitive_default(self): """Test that enum matching is case-sensitive by default.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig(field_constraints={"status": {"enum": ["active", "inactive"]}}) + evaluator = JSONEvaluator( + JSONEvaluatorConfig(field_constraints={"status": {"enum": ["active", "inactive"]}}) ) # Should fail with "Active" (wrong case) - result = await plugin.evaluate({"status": "Active"}) + result = await evaluator.evaluate({"status": "Active"}) assert result.matched is True # Failed validation assert "not in allowed values" in result.message @pytest.mark.asyncio async def test_enum_case_insensitive_enabled(self): """Test case-insensitive enum matching when enabled.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_constraints={"status": {"enum": ["active", "inactive"]}}, case_sensitive_enums=False, ) ) # Should pass with any case - result = await plugin.evaluate({"status": "Active"}) + result = await evaluator.evaluate({"status": "Active"}) assert result.matched is False # Validation passed - result = await plugin.evaluate({"status": "INACTIVE"}) + result = await evaluator.evaluate({"status": "INACTIVE"}) assert result.matched is False # Validation passed - result = await plugin.evaluate({"status": "pending"}) + result = await evaluator.evaluate({"status": "pending"}) assert result.matched is True # Failed - not in enum @pytest.mark.asyncio async def test_enum_case_insensitive_non_strings(self): """Test that non-string enums still use exact matching.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + 
JSONEvaluatorConfig( field_constraints={"code": {"enum": [1, 2, 3]}}, case_sensitive_enums=False, ) ) - result = await plugin.evaluate({"code": 1}) + result = await evaluator.evaluate({"code": 1}) assert result.matched is False # Validation passed - result = await plugin.evaluate({"code": 4}) + result = await evaluator.evaluate({"code": 4}) assert result.matched is True # Failed validation @@ -674,35 +665,35 @@ class TestPatternFlags: @pytest.mark.asyncio async def test_pattern_case_sensitive_default(self): """Test that pattern matching is case-sensitive by default.""" - plugin = JSONControlEvaluatorPlugin(JSONControlEvaluatorPluginConfig(field_patterns={"code": "^[A-Z]{3}$"})) - result = await plugin.evaluate({"code": "ABC"}) + evaluator = JSONEvaluator(JSONEvaluatorConfig(field_patterns={"code": "^[A-Z]{3}$"})) + result = await evaluator.evaluate({"code": "ABC"}) assert result.matched is False # Validation passed - result = await plugin.evaluate({"code": "abc"}) + result = await evaluator.evaluate({"code": "abc"}) assert result.matched is True # Failed - lowercase @pytest.mark.asyncio async def test_pattern_ignorecase_flag(self): """Test case-insensitive pattern matching with IGNORECASE flag.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_patterns={"code": {"pattern": "^[A-Z]{3}$", "flags": ["IGNORECASE"]}} ) ) - result = await plugin.evaluate({"code": "ABC"}) + result = await evaluator.evaluate({"code": "ABC"}) assert result.matched is False # Validation passed - result = await plugin.evaluate({"code": "abc"}) + result = await evaluator.evaluate({"code": "abc"}) assert result.matched is False # Validation passed (case-insensitive) - result = await plugin.evaluate({"code": "AB"}) + result = await evaluator.evaluate({"code": "AB"}) assert result.matched is True # Failed - wrong length @pytest.mark.asyncio async def test_pattern_mixed_string_and_dict(self): """Test mixed 
string/dict patterns work together.""" - plugin = JSONControlEvaluatorPlugin( - JSONControlEvaluatorPluginConfig( + evaluator = JSONEvaluator( + JSONEvaluatorConfig( field_patterns={ "email": { "pattern": "^[a-z0-9._%+-]+@[a-z0-9.-]+\\.[a-z]{2,}$", @@ -713,5 +704,5 @@ async def test_pattern_mixed_string_and_dict(self): ) ) # Both should work - result = await plugin.evaluate({"email": "Test@Example.COM", "code": "1234"}) + result = await evaluator.evaluate({"email": "Test@Example.COM", "code": "1234"}) assert result.matched is False # Validation passed diff --git a/plugins/tests/test_sql.py b/evaluators/tests/test_sql.py similarity index 70% rename from plugins/tests/test_sql.py rename to evaluators/tests/test_sql.py index 0e6a7d9c..9667ff89 100644 --- a/plugins/tests/test_sql.py +++ b/evaluators/tests/test_sql.py @@ -1,12 +1,12 @@ -"""Tests for SQL plugin.""" +"""Tests for SQL evaluator.""" from unittest.mock import patch import pytest from pydantic import ValidationError -from agent_control_models import EvaluatorResult, SQLControlEvaluatorPluginConfig -from agent_control_plugins.builtin.sql import SQLControlEvaluatorPlugin +from agent_control_models import EvaluatorResult, SQLEvaluatorConfig +from agent_control_evaluators.builtin.sql import SQLEvaluator class TestEvaluatorResultValidator: @@ -45,20 +45,20 @@ def test_no_error_with_matched_true_is_valid(self): assert result.matched is True -class TestPluginErrorHandling: - """Tests for plugin error handling (unexpected exceptions).""" +class TestEvaluatorErrorHandling: + """Tests for evaluator error handling (unexpected exceptions).""" @pytest.mark.asyncio async def test_unexpected_exception_returns_error(self): - """Should return error field when plugin encounters unexpected exception.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + """Should return error field when evaluator encounters unexpected exception.""" + config = 
SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) # Simulate an unexpected exception in the internal method with patch.object( - plugin, "_evaluate_sync", side_effect=RuntimeError("Unexpected failure") + evaluator, "_evaluate_sync", side_effect=RuntimeError("Unexpected failure") ): - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") # Check error first (convention: error field takes precedence) assert result.error is not None @@ -72,13 +72,13 @@ async def test_unexpected_exception_returns_error(self): @pytest.mark.asyncio async def test_memory_error_returns_error(self): """Should handle MemoryError gracefully.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) with patch.object( - plugin, "_evaluate_sync", side_effect=MemoryError("Out of memory") + evaluator, "_evaluate_sync", side_effect=MemoryError("Out of memory") ): - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") # Check error first assert result.error is not None @@ -88,31 +88,31 @@ async def test_memory_error_returns_error(self): @pytest.mark.asyncio async def test_keyboard_interrupt_propagates(self): """KeyboardInterrupt should propagate (not be caught).""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) with patch.object( - plugin, "_evaluate_sync", side_effect=KeyboardInterrupt() + evaluator, "_evaluate_sync", side_effect=KeyboardInterrupt() ): with pytest.raises(KeyboardInterrupt): - await plugin.evaluate("SELECT * FROM users") + await evaluator.evaluate("SELECT * FROM users") @pytest.mark.asyncio async def 
test_normal_validation_still_works_after_error(self): - """Plugin should continue working after an error.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + """Evaluator should continue working after an error.""" + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) # First call fails with patch.object( - plugin, "_evaluate_sync", side_effect=RuntimeError("Temporary failure") + evaluator, "_evaluate_sync", side_effect=RuntimeError("Temporary failure") ): - error_result = await plugin.evaluate("SELECT * FROM users") + error_result = await evaluator.evaluate("SELECT * FROM users") assert error_result.error is not None # Second call should work normally (no patch) - normal_result = await plugin.evaluate("SELECT * FROM users") + normal_result = await evaluator.evaluate("SELECT * FROM users") assert normal_result.error is None assert normal_result.matched is False @@ -123,10 +123,10 @@ class TestSQLMultiStatement: @pytest.mark.asyncio async def test_allow_multi_statements_by_default(self): """Should allow multiple statements by default.""" - config = SQLControlEvaluatorPluginConfig() - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig() + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users; SELECT * FROM orders" ) assert result.error is None @@ -135,16 +135,16 @@ async def test_allow_multi_statements_by_default(self): @pytest.mark.asyncio async def test_block_multi_statements_when_disabled(self): """Should block multiple statements when allow_multi_statements=False.""" - config = SQLControlEvaluatorPluginConfig(allow_multi_statements=False) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allow_multi_statements=False) + evaluator = SQLEvaluator(config) # Single statement should pass - result = await plugin.evaluate("SELECT * FROM users") + 
result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False # Multiple statements should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users; DELETE FROM logs" ) assert result.error is None @@ -155,18 +155,18 @@ async def test_block_multi_statements_when_disabled(self): @pytest.mark.asyncio async def test_max_statements_limit(self): """Should enforce max_statements limit.""" - config = SQLControlEvaluatorPluginConfig(max_statements=2) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_statements=2) + evaluator = SQLEvaluator(config) # 2 statements should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users; SELECT * FROM orders" ) assert result.error is None assert result.matched is False # 3 statements should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users; SELECT * FROM orders; SELECT 1" ) assert result.error is None @@ -179,7 +179,7 @@ async def test_max_statements_with_allow_false(self): """Should validate that max_statements requires allow_multi_statements.""" # This should raise a validation error during config creation with pytest.raises(ValueError, match="max_statements is only applicable"): - SQLControlEvaluatorPluginConfig(allow_multi_statements=False, max_statements=3) + SQLEvaluatorConfig(allow_multi_statements=False, max_statements=3) class TestSQLOperations: @@ -188,10 +188,10 @@ class TestSQLOperations: @pytest.mark.asyncio async def test_block_drop_operation(self): """Should block DROP operations.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("DROP TABLE users") + result = await evaluator.evaluate("DROP TABLE users") 
assert result.error is None assert result.matched is True @@ -201,122 +201,122 @@ async def test_block_drop_operation(self): @pytest.mark.asyncio async def test_block_multiple_dangerous_operations(self): """Should block multiple dangerous operations.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( blocked_operations=["DROP", "DELETE", "TRUNCATE"] ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Test DROP - result = await plugin.evaluate("DROP TABLE users") + result = await evaluator.evaluate("DROP TABLE users") assert result.error is None assert result.matched is True # Test DELETE - result = await plugin.evaluate("DELETE FROM users WHERE id = 1") + result = await evaluator.evaluate("DELETE FROM users WHERE id = 1") assert result.error is None assert result.matched is True # Test TRUNCATE - result = await plugin.evaluate("TRUNCATE TABLE users") + result = await evaluator.evaluate("TRUNCATE TABLE users") assert result.error is None assert result.matched is True @pytest.mark.asyncio async def test_allow_safe_operations_when_blocking_dangerous(self): """Should allow safe operations when blocking dangerous ones.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( blocked_operations=["DROP", "DELETE", "TRUNCATE"] ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False - result = await plugin.evaluate("INSERT INTO users (name) VALUES ('test')") + result = await evaluator.evaluate("INSERT INTO users (name) VALUES ('test')") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_allowlist_mode_select_only(self): """Should allow only SELECT when in allowlist mode.""" - config = SQLControlEvaluatorPluginConfig(allowed_operations=["SELECT"]) - plugin 
= SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_operations=["SELECT"]) + evaluator = SQLEvaluator(config) # SELECT should pass - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False # Other operations should be blocked - result = await plugin.evaluate("INSERT INTO users (name) VALUES ('test')") + result = await evaluator.evaluate("INSERT INTO users (name) VALUES ('test')") assert result.error is None assert result.matched is True assert "INSERT" in result.metadata["blocked"] - result = await plugin.evaluate("UPDATE users SET name = 'new' WHERE id = 1") + result = await evaluator.evaluate("UPDATE users SET name = 'new' WHERE id = 1") assert result.error is None assert result.matched is True - result = await plugin.evaluate("DELETE FROM users WHERE id = 1") + result = await evaluator.evaluate("DELETE FROM users WHERE id = 1") assert result.error is None assert result.matched is True @pytest.mark.asyncio async def test_block_ddl_flag(self): """Should block all DDL operations when block_ddl=True.""" - config = SQLControlEvaluatorPluginConfig(block_ddl=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(block_ddl=True) + evaluator = SQLEvaluator(config) # DDL operations should be blocked - result = await plugin.evaluate("CREATE TABLE test (id INT)") + result = await evaluator.evaluate("CREATE TABLE test (id INT)") assert result.error is None assert result.matched is True - result = await plugin.evaluate("ALTER TABLE users ADD COLUMN age INT") + result = await evaluator.evaluate("ALTER TABLE users ADD COLUMN age INT") assert result.error is None assert result.matched is True - result = await plugin.evaluate("DROP TABLE users") + result = await evaluator.evaluate("DROP TABLE users") assert result.error is None assert result.matched is True - result = await plugin.evaluate("TRUNCATE TABLE users") + 
result = await evaluator.evaluate("TRUNCATE TABLE users") assert result.error is None assert result.matched is True # DML operations should pass - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_block_dcl_flag(self): """Should block all DCL operations when block_dcl=True.""" - config = SQLControlEvaluatorPluginConfig(block_dcl=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(block_dcl=True) + evaluator = SQLEvaluator(config) # DCL operations should be blocked - result = await plugin.evaluate("GRANT SELECT ON users TO user1") + result = await evaluator.evaluate("GRANT SELECT ON users TO user1") assert result.error is None assert result.matched is True - result = await plugin.evaluate("REVOKE SELECT ON users FROM user1") + result = await evaluator.evaluate("REVOKE SELECT ON users FROM user1") assert result.error is None assert result.matched is True # Other operations should pass - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_multiple_statements(self): """Should detect blocked operations in multiple statements.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users; DROP TABLE users; SELECT 1" ) assert result.error is None @@ -330,20 +330,20 @@ class TestSQLTableAccess: @pytest.mark.asyncio async def test_allow_specific_tables(self): """Should allow only specific tables.""" - config = SQLControlEvaluatorPluginConfig(allowed_tables=["users", "orders"]) - plugin = 
SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_tables=["users", "orders"]) + evaluator = SQLEvaluator(config) # Allowed tables should pass - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False - result = await plugin.evaluate("SELECT * FROM orders") + result = await evaluator.evaluate("SELECT * FROM orders") assert result.error is None assert result.matched is False # Other tables should be blocked - result = await plugin.evaluate("SELECT * FROM admin") + result = await evaluator.evaluate("SELECT * FROM admin") assert result.error is None assert result.matched is True assert "admin" in result.message @@ -351,86 +351,86 @@ async def test_allow_specific_tables(self): @pytest.mark.asyncio async def test_block_specific_tables(self): """Should block specific tables.""" - config = SQLControlEvaluatorPluginConfig(blocked_tables=["admin", "secrets"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_tables=["admin", "secrets"]) + evaluator = SQLEvaluator(config) # Blocked tables should be blocked - result = await plugin.evaluate("SELECT * FROM admin") + result = await evaluator.evaluate("SELECT * FROM admin") assert result.error is None assert result.matched is True - result = await plugin.evaluate("SELECT * FROM secrets") + result = await evaluator.evaluate("SELECT * FROM secrets") assert result.error is None assert result.matched is True # Other tables should pass - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_block_system_schemas(self): """Should block system schemas.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( blocked_schemas=["pg_catalog", "information_schema"] ) - plugin = 
SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # System schemas should be blocked - result = await plugin.evaluate("SELECT * FROM pg_catalog.pg_tables") + result = await evaluator.evaluate("SELECT * FROM pg_catalog.pg_tables") assert result.error is None assert result.matched is True assert "pg_catalog" in result.message - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM information_schema.tables" ) assert result.error is None assert result.matched is True # Regular queries should pass - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_qualified_table_names(self): """Should handle qualified table names (schema.table).""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( allowed_schemas=["public"], blocked_tables=["admin"] ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Public schema should pass - result = await plugin.evaluate("SELECT * FROM public.users") + result = await evaluator.evaluate("SELECT * FROM public.users") assert result.error is None assert result.matched is False # Non-public schema should be blocked - result = await plugin.evaluate("SELECT * FROM private.users") + result = await evaluator.evaluate("SELECT * FROM private.users") assert result.error is None assert result.matched is True # Blocked table even in allowed schema should be blocked - result = await plugin.evaluate("SELECT * FROM public.admin") + result = await evaluator.evaluate("SELECT * FROM public.admin") assert result.error is None assert result.matched is True @pytest.mark.asyncio async def test_multiple_tables_in_query(self): """Should check all tables in a query.""" - config = SQLControlEvaluatorPluginConfig(allowed_tables=["users", "orders"]) - plugin = SQLControlEvaluatorPlugin(config) + config = 
SQLEvaluatorConfig(allowed_tables=["users", "orders"]) + evaluator = SQLEvaluator(config) # All allowed tables - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users JOIN orders ON users.id = orders.user_id" ) assert result.error is None assert result.matched is False # One disallowed table - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users JOIN admin ON users.id = admin.user_id" ) assert result.error is None @@ -441,36 +441,36 @@ async def test_multiple_tables_in_query(self): async def test_case_sensitivity_tables(self): """Should respect case sensitivity setting for tables.""" # Case insensitive (default) - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( blocked_tables=["admin"], case_sensitive=False ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("SELECT * FROM Admin") + result = await evaluator.evaluate("SELECT * FROM Admin") assert result.error is None assert result.matched is True - result = await plugin.evaluate("SELECT * FROM ADMIN") + result = await evaluator.evaluate("SELECT * FROM ADMIN") assert result.error is None assert result.matched is True # Case sensitive - config = SQLControlEvaluatorPluginConfig(blocked_tables=["admin"], case_sensitive=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_tables=["admin"], case_sensitive=True) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("SELECT * FROM admin") + result = await evaluator.evaluate("SELECT * FROM admin") assert result.error is None assert result.matched is True - result = await plugin.evaluate("SELECT * FROM Admin") + result = await evaluator.evaluate("SELECT * FROM Admin") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_cte_not_treated_as_table_violation(self): """Should not treat CTEs as unauthorized table access.""" - config 
= SQLControlEvaluatorPluginConfig(allowed_tables=["users"], case_sensitive=False) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_tables=["users"], case_sensitive=False) + evaluator = SQLEvaluator(config) # CTE 'temp_users' is defined locally, not in allowed_tables # This should pass because CTEs are not external tables @@ -480,7 +480,7 @@ async def test_cte_not_treated_as_table_violation(self): ) SELECT * FROM temp_users """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is False @@ -491,7 +491,7 @@ async def test_cte_not_treated_as_table_violation(self): premium_users AS (SELECT * FROM active_users WHERE premium = true) SELECT * FROM premium_users """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is False @@ -502,7 +502,7 @@ async def test_cte_not_treated_as_table_violation(self): ) SELECT * FROM temp_data """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is True assert "admin" in result.message @@ -514,26 +514,26 @@ class TestSQLColumnPresence: @pytest.mark.asyncio async def test_require_column_in_where_clause(self): """Should require specific column in WHERE clause.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="where" ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Query with tenant_id in WHERE - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE tenant_id = 123" ) assert result.error is None assert result.matched is False # Query without tenant_id in WHERE - should be blocked - result = await plugin.evaluate("SELECT * FROM users WHERE id = 1") + result = await evaluator.evaluate("SELECT * FROM users 
WHERE id = 1") assert result.error is None assert result.matched is True assert "tenant_id" in result.message # Query with tenant_id in SELECT but not WHERE - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT tenant_id FROM users WHERE id = 1" ) assert result.error is None @@ -542,22 +542,22 @@ async def test_require_column_in_where_clause(self): @pytest.mark.asyncio async def test_require_column_in_select_clause(self): """Should require specific column in SELECT clause.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["id", "created_at"], column_presence_logic="all", column_context="select" ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Query with both columns in SELECT - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT id, name, created_at FROM users" ) assert result.error is None assert result.matched is False # Query missing one column - should be blocked - result = await plugin.evaluate("SELECT id, name FROM users") + result = await evaluator.evaluate("SELECT id, name FROM users") assert result.error is None assert result.matched is True assert "created_at" in result.message @@ -565,72 +565,72 @@ async def test_require_column_in_select_clause(self): @pytest.mark.asyncio async def test_require_column_anywhere(self): """Should require column anywhere in query.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["user_id"], column_context=None ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Column in SELECT - should pass - result = await plugin.evaluate("SELECT user_id FROM logs") + result = await evaluator.evaluate("SELECT user_id FROM logs") assert result.error is None assert result.matched is False # Column in WHERE - should pass - result = await plugin.evaluate("SELECT * FROM logs WHERE user_id = 1") + result = 
await evaluator.evaluate("SELECT * FROM logs WHERE user_id = 1") assert result.error is None assert result.matched is False # Column not present - should be blocked - result = await plugin.evaluate("SELECT * FROM logs WHERE id = 1") + result = await evaluator.evaluate("SELECT * FROM logs WHERE id = 1") assert result.error is None assert result.matched is True @pytest.mark.asyncio async def test_column_presence_any_logic(self): """Should require at least one column with 'any' logic.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["user_id", "admin_id"], column_presence_logic="any", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Has user_id - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM logs WHERE user_id = 1" ) assert result.error is None assert result.matched is False # Has admin_id - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM logs WHERE admin_id = 1" ) assert result.error is None assert result.matched is False # Has neither - should be blocked - result = await plugin.evaluate("SELECT * FROM logs WHERE id = 1") + result = await evaluator.evaluate("SELECT * FROM logs WHERE id = 1") assert result.error is None assert result.matched is True @pytest.mark.asyncio async def test_column_presence_all_logic(self): """Should require all columns with 'all' logic.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["user_id", "timestamp"], column_presence_logic="all", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Has both columns - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM logs WHERE user_id = 1 AND timestamp > '2024-01-01'" ) assert result.error is None assert result.matched is False # Has only one column - should be blocked - result = await 
plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM logs WHERE user_id = 1" ) assert result.error is None @@ -641,34 +641,34 @@ async def test_column_presence_all_logic(self): async def test_case_sensitivity_columns(self): """Should respect case sensitivity for columns.""" # Case insensitive (default) - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="where", case_sensitive=False, ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE Tenant_ID = 123" ) assert result.error is None assert result.matched is False # Case sensitive - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="where", case_sensitive=True, ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE tenant_id = 123" ) assert result.error is None assert result.matched is False - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE Tenant_ID = 123" ) assert result.error is None @@ -678,14 +678,14 @@ async def test_case_sensitivity_columns(self): async def test_column_extraction_with_join_queries(self): """Should extract columns from JOIN queries correctly.""" # Test WHERE context with JOIN - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="where", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # JOIN with tenant_id in WHERE - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT users.id, orders.total FROM users " "JOIN orders ON users.id = orders.user_id " "WHERE users.tenant_id = 123" @@ -694,7 +694,7 @@ async def 
test_column_extraction_with_join_queries(self): assert result.matched is False # JOIN without tenant_id in WHERE - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT users.id, orders.total FROM users " "JOIN orders ON users.id = orders.user_id " "WHERE orders.id = 1" @@ -704,15 +704,15 @@ async def test_column_extraction_with_join_queries(self): assert "tenant_id" in result.message # Test SELECT context with JOIN - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["user_id", "tenant_id"], column_context="select", column_presence_logic="all", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # JOIN with both required columns in SELECT - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT users.user_id, users.tenant_id, orders.total " "FROM users JOIN orders ON users.id = orders.user_id" ) @@ -720,7 +720,7 @@ async def test_column_extraction_with_join_queries(self): assert result.matched is False # JOIN missing one required column in SELECT - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT users.user_id, orders.total " "FROM users JOIN orders ON users.id = orders.user_id" ) @@ -729,14 +729,14 @@ async def test_column_extraction_with_join_queries(self): assert "tenant_id" in result.message # Test columns anywhere (None context) with JOIN - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context=None, ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # tenant_id in SELECT - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT users.tenant_id, orders.total FROM users " "JOIN orders ON users.id = orders.user_id" ) @@ -744,7 +744,7 @@ async def test_column_extraction_with_join_queries(self): assert result.matched is False # tenant_id in 
WHERE - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT users.id, orders.total FROM users " "JOIN orders ON users.id = orders.user_id " "WHERE users.tenant_id = 123" @@ -753,7 +753,7 @@ async def test_column_extraction_with_join_queries(self): assert result.matched is False # tenant_id in JOIN condition - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT users.id, orders.total FROM users " "JOIN orders ON users.tenant_id = orders.tenant_id" ) @@ -761,7 +761,7 @@ async def test_column_extraction_with_join_queries(self): assert result.matched is False # tenant_id not present anywhere - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT users.id, orders.total FROM users " "JOIN orders ON users.id = orders.user_id " "WHERE orders.status = 'active'" @@ -777,16 +777,16 @@ class TestSQLLimits: @pytest.mark.asyncio async def test_require_limit_on_select(self): """Should require LIMIT clause on SELECT queries.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) # SELECT with LIMIT should pass - result = await plugin.evaluate("SELECT * FROM users LIMIT 100") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 100") assert result.error is None assert result.matched is False # SELECT without LIMIT should be blocked - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is True assert "LIMIT" in result.message @@ -795,38 +795,38 @@ async def test_require_limit_on_select(self): @pytest.mark.asyncio async def test_require_limit_only_affects_select(self): """Should only check LIMIT on SELECT statements.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = 
SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) # INSERT without LIMIT should pass (LIMIT not applicable) - result = await plugin.evaluate( + result = await evaluator.evaluate( "INSERT INTO users (name) VALUES ('test')" ) assert result.error is None assert result.matched is False # DELETE without LIMIT should pass - result = await plugin.evaluate("DELETE FROM users WHERE id = 1") + result = await evaluator.evaluate("DELETE FROM users WHERE id = 1") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_max_limit_enforcement(self): """Should enforce maximum LIMIT value.""" - config = SQLControlEvaluatorPluginConfig(max_limit=1000) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_limit=1000) + evaluator = SQLEvaluator(config) # LIMIT within bounds should pass - result = await plugin.evaluate("SELECT * FROM users LIMIT 100") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 100") assert result.error is None assert result.matched is False - result = await plugin.evaluate("SELECT * FROM users LIMIT 1000") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 1000") assert result.error is None assert result.matched is False # LIMIT exceeding max should be blocked - result = await plugin.evaluate("SELECT * FROM users LIMIT 10000") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 10000") assert result.error is None assert result.matched is True assert "10000" in result.message @@ -837,18 +837,18 @@ async def test_max_limit_enforcement(self): @pytest.mark.asyncio async def test_limit_with_offset(self): """Should handle LIMIT with OFFSET correctly.""" - config = SQLControlEvaluatorPluginConfig(max_limit=1000) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_limit=1000) + evaluator = SQLEvaluator(config) # LIMIT + OFFSET within bounds should pass - result = await 
plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users LIMIT 100 OFFSET 50" ) assert result.error is None assert result.matched is False # LIMIT exceeding max with OFFSET should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users LIMIT 5000 OFFSET 10" ) assert result.error is None @@ -857,33 +857,33 @@ async def test_limit_with_offset(self): @pytest.mark.asyncio async def test_limit_all_allowed(self): """Should allow LIMIT ALL (indeterminate limits are allowed).""" - config = SQLControlEvaluatorPluginConfig(max_limit=1000) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_limit=1000) + evaluator = SQLEvaluator(config) # LIMIT ALL should be allowed (indeterminate limits are skipped) - result = await plugin.evaluate("SELECT * FROM users LIMIT ALL") + result = await evaluator.evaluate("SELECT * FROM users LIMIT ALL") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_require_and_max_limit_combined(self): """Should enforce both require_limit and max_limit.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True, max_limit=500) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True, max_limit=500) + evaluator = SQLEvaluator(config) # Valid query with LIMIT - result = await plugin.evaluate("SELECT * FROM users LIMIT 100") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 100") assert result.error is None assert result.matched is False # Missing LIMIT - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is True assert "must have a LIMIT" in result.message # LIMIT too high - result = await plugin.evaluate("SELECT * FROM users LIMIT 1000") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 1000") assert result.error is None assert result.matched is 
True assert "exceeds maximum" in result.message @@ -891,18 +891,18 @@ async def test_require_and_max_limit_combined(self): @pytest.mark.asyncio async def test_multi_select_statements_limit_check(self): """Should check LIMIT on all SELECT statements.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) # All SELECTs have LIMIT - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users LIMIT 10; SELECT * FROM orders LIMIT 20" ) assert result.error is None assert result.matched is False # One SELECT missing LIMIT - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users LIMIT 10; SELECT * FROM orders" ) assert result.error is None @@ -915,24 +915,24 @@ class TestCombinedControls: @pytest.mark.asyncio async def test_operation_and_table_restrictions(self): """Should enforce both operation and table restrictions.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( allowed_operations=["SELECT"], allowed_tables=["users", "orders"], ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Both constraints satisfied - should pass - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False # Invalid operation - should be blocked - result = await plugin.evaluate("DELETE FROM users WHERE id = 1") + result = await evaluator.evaluate("DELETE FROM users WHERE id = 1") assert result.error is None assert result.matched is True # Invalid table - should be blocked - result = await plugin.evaluate("SELECT * FROM admin") + result = await evaluator.evaluate("SELECT * FROM admin") assert result.error is None assert result.matched is True @@ -947,40 +947,40 @@ async def 
test_allowlist_with_block_ddl_enforces_both(self): This tests the fix for the critical security bug where the allowlist was ignored when combined with block_ddl/block_dcl. """ - config = SQLControlEvaluatorPluginConfig(allowed_operations=["SELECT"], block_ddl=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_operations=["SELECT"], block_ddl=True) + evaluator = SQLEvaluator(config) # SELECT should pass (in allowlist, not DDL) - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False # DROP should be blocked (DDL) - result = await plugin.evaluate("DROP TABLE users") + result = await evaluator.evaluate("DROP TABLE users") assert result.error is None assert result.matched is True assert "DROP" in result.metadata["blocked"] # INSERT should be blocked (not in allowlist) - result = await plugin.evaluate("INSERT INTO users (name) VALUES ('test')") + result = await evaluator.evaluate("INSERT INTO users (name) VALUES ('test')") assert result.error is None assert result.matched is True assert "INSERT" in result.metadata["blocked"] # UPDATE should be blocked (not in allowlist) - result = await plugin.evaluate("UPDATE users SET name = 'new'") + result = await evaluator.evaluate("UPDATE users SET name = 'new'") assert result.error is None assert result.matched is True assert "UPDATE" in result.metadata["blocked"] # DELETE should be blocked (not in allowlist) - result = await plugin.evaluate("DELETE FROM users WHERE id = 1") + result = await evaluator.evaluate("DELETE FROM users WHERE id = 1") assert result.error is None assert result.matched is True assert "DELETE" in result.metadata["blocked"] # TRUNCATE should be blocked (both DDL and not in allowlist) - result = await plugin.evaluate("TRUNCATE TABLE users") + result = await evaluator.evaluate("TRUNCATE TABLE users") assert result.error is None assert result.matched is True 
assert "TRUNCATE" in result.metadata["blocked"] @@ -988,22 +988,22 @@ async def test_allowlist_with_block_ddl_enforces_both(self): @pytest.mark.asyncio async def test_allowlist_with_block_dcl_enforces_both(self): """Test allowed_operations + block_dcl combination.""" - config = SQLControlEvaluatorPluginConfig(allowed_operations=["SELECT"], block_dcl=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_operations=["SELECT"], block_dcl=True) + evaluator = SQLEvaluator(config) # SELECT should pass - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False # GRANT should be blocked (DCL) - result = await plugin.evaluate("GRANT SELECT ON users TO user1") + result = await evaluator.evaluate("GRANT SELECT ON users TO user1") assert result.error is None assert result.matched is True assert "GRANT" in result.metadata["blocked"] # INSERT should be blocked (not in allowlist) - result = await plugin.evaluate("INSERT INTO users (name) VALUES ('test')") + result = await evaluator.evaluate("INSERT INTO users (name) VALUES ('test')") assert result.error is None assert result.matched is True assert "INSERT" in result.metadata["blocked"] @@ -1011,27 +1011,27 @@ async def test_allowlist_with_block_dcl_enforces_both(self): @pytest.mark.asyncio async def test_operation_and_column_presence(self): """Should enforce operation restrictions and column presence.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( allowed_operations=["SELECT"], required_columns=["tenant_id"], column_context="where", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Both constraints satisfied - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE tenant_id = 123" ) assert result.error is None assert result.matched is False # Missing column - 
should be blocked - result = await plugin.evaluate("SELECT * FROM users WHERE id = 1") + result = await evaluator.evaluate("SELECT * FROM users WHERE id = 1") assert result.error is None assert result.matched is True # Invalid operation - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "DELETE FROM users WHERE tenant_id = 123" ) assert result.error is None @@ -1040,7 +1040,7 @@ async def test_operation_and_column_presence(self): @pytest.mark.asyncio async def test_all_features_combined(self): """Should enforce all validation types together.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( allowed_operations=["SELECT", "INSERT"], allowed_tables=["users", "orders"], required_columns=["tenant_id"], @@ -1048,45 +1048,45 @@ async def test_all_features_combined(self): require_limit=True, max_limit=1000, ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # All constraints satisfied - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE tenant_id = 123 LIMIT 100" ) assert result.error is None assert result.matched is False # Missing LIMIT - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE tenant_id = 123" ) assert result.error is None assert result.matched is True # LIMIT too high - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE tenant_id = 123 LIMIT 5000" ) assert result.error is None assert result.matched is True # Operation violation - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "DELETE FROM users WHERE tenant_id = 123" ) assert result.error is None assert result.matched is True # Table violation - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM admin WHERE tenant_id = 123 LIMIT 
100" ) assert result.error is None assert result.matched is True # Column violation - should be blocked - result = await plugin.evaluate("SELECT * FROM users WHERE id = 1 LIMIT 100") + result = await evaluator.evaluate("SELECT * FROM users WHERE id = 1 LIMIT 100") assert result.error is None assert result.matched is True @@ -1097,10 +1097,10 @@ class TestEdgeCases: @pytest.mark.asyncio async def test_none_input(self): """Should handle None input.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate(None) + result = await evaluator.evaluate(None) assert result.error is None assert result.matched is False assert "No SQL query" in result.message @@ -1108,10 +1108,10 @@ async def test_none_input(self): @pytest.mark.asyncio async def test_empty_string(self): """Should handle empty string input.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("") + result = await evaluator.evaluate("") assert result.error is None assert result.matched is False assert "Empty" in result.message @@ -1119,10 +1119,10 @@ async def test_empty_string(self): @pytest.mark.asyncio async def test_whitespace_only(self): """Should handle whitespace-only input.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate(" ") + result = await evaluator.evaluate(" ") assert result.error is None assert result.matched is False assert "Empty" in result.message @@ -1130,53 +1130,53 @@ async def test_whitespace_only(self): 
@pytest.mark.asyncio async def test_malformed_sql_blocked(self): """Should block malformed SQL (invalid SQL fails validation).""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("This is not valid SQL at all!!!") + result = await evaluator.evaluate("This is not valid SQL at all!!!") assert result.error is None assert result.matched is True # Invalid SQL is blocked assert result.confidence == 1.0 - assert result.error is None # Not a plugin error, just bad input + assert result.error is None # Not an evaluator error, just bad input assert "pars" in result.message.lower() @pytest.mark.asyncio async def test_empty_config(self): """Should pass all queries with empty config.""" - config = SQLControlEvaluatorPluginConfig() - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig() + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("DROP TABLE users") + result = await evaluator.evaluate("DROP TABLE users") assert result.error is None assert result.matched is False - result = await plugin.evaluate("SELECT * FROM admin") + result = await evaluator.evaluate("SELECT * FROM admin") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_dict_input_with_query_key(self): """Should extract query from dict with 'query' key.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate({"query": "DROP TABLE users"}) + result = await evaluator.evaluate({"query": "DROP TABLE users"}) assert result.error is None assert result.matched is True @pytest.mark.asyncio async def test_non_table_query_with_table_restrictions(self): """Should allow 
non-table queries even with table restrictions.""" - config = SQLControlEvaluatorPluginConfig(allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # SELECT without FROM clause - result = await plugin.evaluate("SELECT 1") + result = await evaluator.evaluate("SELECT 1") assert result.error is None assert result.matched is False # SELECT with expression - result = await plugin.evaluate("SELECT 1 + 1 AS result") + result = await evaluator.evaluate("SELECT 1 + 1 AS result") assert result.error is None assert result.matched is False @@ -1187,11 +1187,11 @@ class TestSQLSubqueries: @pytest.mark.asyncio async def test_subquery_in_where_clause(self): """Should extract tables from subqueries in WHERE clause.""" - config = SQLControlEvaluatorPluginConfig(allowed_tables=["users", "orders"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_tables=["users", "orders"]) + evaluator = SQLEvaluator(config) # Subquery with allowed tables - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE id IN " "(SELECT user_id FROM orders WHERE total > 100)" ) @@ -1199,7 +1199,7 @@ async def test_subquery_in_where_clause(self): assert result.matched is False # Subquery with blocked table - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE id IN " "(SELECT user_id FROM admin WHERE active = true)" ) @@ -1210,11 +1210,11 @@ async def test_subquery_in_where_clause(self): @pytest.mark.asyncio async def test_subquery_in_from_clause(self): """Should extract tables from subqueries in FROM clause.""" - config = SQLControlEvaluatorPluginConfig(allowed_tables=["users", "orders"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_tables=["users", "orders"]) + evaluator = SQLEvaluator(config) # Derived table with 
allowed table - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM " "(SELECT * FROM users WHERE active = true) AS active_users" ) @@ -1222,7 +1222,7 @@ async def test_subquery_in_from_clause(self): assert result.matched is False # Derived table with blocked table - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM " "(SELECT * FROM admin WHERE role = 'super') AS admins" ) @@ -1233,11 +1233,11 @@ async def test_subquery_in_from_clause(self): @pytest.mark.asyncio async def test_correlated_subquery(self): """Should handle correlated subqueries correctly.""" - config = SQLControlEvaluatorPluginConfig(allowed_tables=["users", "orders"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_tables=["users", "orders"]) + evaluator = SQLEvaluator(config) # Correlated subquery with allowed tables - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users u WHERE EXISTS " "(SELECT 1 FROM orders o WHERE o.user_id = u.id)" ) @@ -1245,7 +1245,7 @@ async def test_correlated_subquery(self): assert result.matched is False # Correlated subquery with blocked table - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users u WHERE EXISTS " "(SELECT 1 FROM secrets s WHERE s.user_id = u.id)" ) @@ -1256,11 +1256,11 @@ async def test_correlated_subquery(self): @pytest.mark.asyncio async def test_nested_subqueries(self): """Should handle deeply nested subqueries.""" - config = SQLControlEvaluatorPluginConfig(blocked_tables=["admin", "secrets"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_tables=["admin", "secrets"]) + evaluator = SQLEvaluator(config) # Nested subqueries without blocked tables - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE id IN " "(SELECT 
user_id FROM orders WHERE id IN " "(SELECT order_id FROM payments WHERE status = 'completed'))" @@ -1269,7 +1269,7 @@ async def test_nested_subqueries(self): assert result.matched is False # Nested subquery with blocked table in innermost - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE id IN " "(SELECT user_id FROM orders WHERE id IN " "(SELECT order_id FROM admin WHERE verified = true))" @@ -1279,7 +1279,7 @@ async def test_nested_subqueries(self): assert "admin" in result.message # Nested subquery with blocked table in middle - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE id IN " "(SELECT user_id FROM secrets WHERE id IN " "(SELECT secret_id FROM logs))" @@ -1291,11 +1291,11 @@ async def test_nested_subqueries(self): @pytest.mark.asyncio async def test_subquery_with_blocked_operations(self): """Should detect blocked operations in subqueries.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DELETE", "DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DELETE", "DROP"]) + evaluator = SQLEvaluator(config) # SELECT with subquery - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE id IN " "(SELECT user_id FROM orders)" ) @@ -1303,7 +1303,7 @@ async def test_subquery_with_blocked_operations(self): assert result.matched is False # DELETE in main query - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "DELETE FROM users WHERE id IN " "(SELECT user_id FROM orders WHERE total < 10)" ) @@ -1316,7 +1316,7 @@ async def test_subquery_with_blocked_operations(self): # Blocked operations in subqueries are not currently detected # Leaving this test commented out until Issue #1 is fixed # - # result = await plugin.evaluate( + # result = await evaluator.evaluate( # "SELECT * 
FROM users WHERE id NOT IN " # "(DELETE FROM orders WHERE total = 0 RETURNING user_id)" # ) @@ -1326,14 +1326,14 @@ async def test_subquery_with_blocked_operations(self): @pytest.mark.asyncio async def test_subquery_with_column_requirements(self): """Should check column requirements in subqueries.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="where", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Column in outer query WHERE - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE tenant_id = 123 AND id IN " "(SELECT user_id FROM orders WHERE total > 100)" ) @@ -1350,15 +1350,15 @@ async def test_subquery_with_column_requirements(self): @pytest.mark.asyncio async def test_subquery_with_column_in_select(self): """Should extract columns from subquery SELECT clauses with scope=all.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["user_id"], column_context="select", column_context_scope="top_level", # Old behavior: only check outer SELECT ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Column in outer SELECT - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT user_id, name FROM users WHERE id IN " "(SELECT id FROM orders)" ) @@ -1366,7 +1366,7 @@ async def test_subquery_with_column_in_select(self): assert result.matched is False # Column only in subquery SELECT - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT id, name FROM users WHERE id IN " "(SELECT user_id FROM orders)" ) @@ -1377,11 +1377,11 @@ async def test_subquery_with_column_in_select(self): @pytest.mark.asyncio async def test_multiple_subqueries(self): """Should handle multiple subqueries in same query.""" - config = 
SQLControlEvaluatorPluginConfig(allowed_tables=["users", "orders", "payments"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_tables=["users", "orders", "payments"]) + evaluator = SQLEvaluator(config) # Multiple subqueries with allowed tables - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE id IN " "(SELECT user_id FROM orders) AND id IN " "(SELECT user_id FROM payments)" @@ -1390,7 +1390,7 @@ async def test_multiple_subqueries(self): assert result.matched is False # One subquery with disallowed table - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE id IN " "(SELECT user_id FROM orders) AND id IN " "(SELECT user_id FROM admin)" @@ -1402,11 +1402,11 @@ async def test_multiple_subqueries(self): @pytest.mark.asyncio async def test_subquery_in_join(self): """Should handle subqueries used in JOIN clauses.""" - config = SQLControlEvaluatorPluginConfig(allowed_tables=["users", "orders"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_tables=["users", "orders"]) + evaluator = SQLEvaluator(config) # Subquery in JOIN with allowed table - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT u.* FROM users u " "JOIN (SELECT user_id, COUNT(*) as order_count FROM orders " "GROUP BY user_id) o ON u.id = o.user_id" @@ -1415,7 +1415,7 @@ async def test_subquery_in_join(self): assert result.matched is False # Subquery in JOIN with blocked table - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT u.* FROM users u " "JOIN (SELECT user_id, role FROM admin) a ON u.id = a.user_id" ) @@ -1426,11 +1426,11 @@ async def test_subquery_in_join(self): @pytest.mark.asyncio async def test_union_with_subqueries(self): """Should handle UNION with subqueries.""" - config = 
SQLControlEvaluatorPluginConfig(allowed_tables=["users", "customers"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(allowed_tables=["users", "customers"]) + evaluator = SQLEvaluator(config) # UNION with allowed tables - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT id, name FROM users " "UNION " "SELECT id, name FROM customers" @@ -1439,7 +1439,7 @@ async def test_union_with_subqueries(self): assert result.matched is False # UNION with blocked table - should be blocked - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT id, name FROM users " "UNION " "SELECT id, name FROM admin" @@ -1454,33 +1454,33 @@ class TestSQLDialectConfiguration: def test_dialect_defaults_to_postgres(self): """Should default to postgres dialect.""" - config = SQLControlEvaluatorPluginConfig() + config = SQLEvaluatorConfig() assert config.dialect == "postgres" def test_dialect_can_be_set_to_mysql(self): """Should accept mysql dialect.""" - config = SQLControlEvaluatorPluginConfig(dialect="mysql") + config = SQLEvaluatorConfig(dialect="mysql") assert config.dialect == "mysql" def test_dialect_can_be_set_to_tsql(self): """Should accept tsql dialect.""" - config = SQLControlEvaluatorPluginConfig(dialect="tsql") + config = SQLEvaluatorConfig(dialect="tsql") assert config.dialect == "tsql" def test_dialect_can_be_set_to_oracle(self): """Should accept oracle dialect.""" - config = SQLControlEvaluatorPluginConfig(dialect="oracle") + config = SQLEvaluatorConfig(dialect="oracle") assert config.dialect == "oracle" def test_dialect_can_be_set_to_sqlite(self): """Should accept sqlite dialect.""" - config = SQLControlEvaluatorPluginConfig(dialect="sqlite") + config = SQLEvaluatorConfig(dialect="sqlite") assert config.dialect == "sqlite" def test_invalid_dialect_raises_error(self): """Should reject invalid dialect.""" with pytest.raises(Exception): - 
SQLControlEvaluatorPluginConfig(dialect="invalid_dialect") + SQLEvaluatorConfig(dialect="invalid_dialect") class TestSQLDialectParsing: @@ -1490,22 +1490,22 @@ class TestSQLDialectParsing: @pytest.mark.asyncio async def test_postgres_double_quoted_identifiers(self): """PostgreSQL should parse double-quoted identifiers.""" - config = SQLControlEvaluatorPluginConfig(dialect="postgres", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="postgres", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # Double quotes in PostgreSQL - result = await plugin.evaluate('SELECT * FROM "users"') + result = await evaluator.evaluate('SELECT * FROM "users"') assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_postgres_case_sensitive_identifiers_quoted(self): """PostgreSQL preserves case in quoted identifiers.""" - config = SQLControlEvaluatorPluginConfig(dialect="postgres", allowed_tables=["Users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="postgres", allowed_tables=["Users"]) + evaluator = SQLEvaluator(config) # Quoted identifier in PostgreSQL preserves case - result = await plugin.evaluate('SELECT * FROM "Users"') + result = await evaluator.evaluate('SELECT * FROM "Users"') assert result.error is None assert result.matched is False @@ -1513,22 +1513,22 @@ async def test_postgres_case_sensitive_identifiers_quoted(self): @pytest.mark.asyncio async def test_mysql_backtick_identifiers(self): """MySQL should parse backtick-quoted identifiers.""" - config = SQLControlEvaluatorPluginConfig(dialect="mysql", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="mysql", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # Backticks in MySQL - result = await plugin.evaluate("SELECT * FROM `users`") + result = await evaluator.evaluate("SELECT * FROM `users`") assert 
result.error is None assert result.matched is False @pytest.mark.asyncio async def test_mysql_column_alias_syntax(self): """MySQL should parse column aliases correctly.""" - config = SQLControlEvaluatorPluginConfig(dialect="mysql", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="mysql", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # MySQL-specific alias syntax - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT id as `user_id`, name as `user_name` FROM users" ) assert result.error is None @@ -1538,26 +1538,26 @@ async def test_mysql_column_alias_syntax(self): @pytest.mark.asyncio async def test_tsql_bracket_quoted_identifiers(self): """T-SQL should parse bracket-quoted identifiers.""" - config = SQLControlEvaluatorPluginConfig(dialect="tsql", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="tsql", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # Brackets in T-SQL - result = await plugin.evaluate("SELECT * FROM [users]") + result = await evaluator.evaluate("SELECT * FROM [users]") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_tsql_column_with_spaces(self): """T-SQL should parse column names with spaces in brackets.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( dialect="tsql", allowed_tables=["users"], required_columns=["user id"], ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # T-SQL with spaces in column name using brackets - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT [user id], name FROM [users] WHERE [user id] = 1" ) assert result.error is None @@ -1567,22 +1567,22 @@ async def test_tsql_column_with_spaces(self): @pytest.mark.asyncio async def test_oracle_double_quoted_identifiers(self): """Oracle should parse double-quoted 
identifiers.""" - config = SQLControlEvaluatorPluginConfig(dialect="oracle", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="oracle", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # Double quotes in Oracle - result = await plugin.evaluate('SELECT * FROM "users"') + result = await evaluator.evaluate('SELECT * FROM "users"') assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_oracle_line_comment_syntax(self): """Oracle should parse -- line comments.""" - config = SQLControlEvaluatorPluginConfig(dialect="oracle", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="oracle", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # Oracle -- comment syntax - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users -- get all users\n WHERE id > 0" ) assert result.error is None @@ -1592,22 +1592,22 @@ async def test_oracle_line_comment_syntax(self): @pytest.mark.asyncio async def test_sqlite_double_quoted_identifiers(self): """SQLite should parse double-quoted identifiers.""" - config = SQLControlEvaluatorPluginConfig(dialect="sqlite", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="sqlite", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # Double quotes in SQLite - result = await plugin.evaluate('SELECT * FROM "users"') + result = await evaluator.evaluate('SELECT * FROM "users"') assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_sqlite_autoincrement_syntax(self): """SQLite should parse AUTOINCREMENT syntax.""" - config = SQLControlEvaluatorPluginConfig(dialect="sqlite", block_ddl=False) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="sqlite", block_ddl=False) + evaluator = SQLEvaluator(config) # SQLite 
AUTOINCREMENT syntax - result = await plugin.evaluate( + result = await evaluator.evaluate( "CREATE TABLE users (id INTEGER PRIMARY KEY AUTOINCREMENT, " "name TEXT)" ) @@ -1622,10 +1622,10 @@ class TestSQLDialectIntegration: async def test_dialect_with_blocked_operations(self): """Should enforce blocked_operations across dialects.""" for dialect in ["postgres", "mysql", "tsql", "oracle", "sqlite"]: - config = SQLControlEvaluatorPluginConfig(dialect=dialect, blocked_operations=["DROP"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect=dialect, blocked_operations=["DROP"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("DROP TABLE users") + result = await evaluator.evaluate("DROP TABLE users") assert result.error is None assert result.matched is True assert "DROP" in result.metadata["blocked"] @@ -1634,18 +1634,18 @@ async def test_dialect_with_blocked_operations(self): async def test_dialect_with_table_restrictions(self): """Should enforce table restrictions across dialects.""" for dialect in ["postgres", "mysql", "tsql", "oracle", "sqlite"]: - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( dialect=dialect, allowed_tables=["users", "orders"] ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Allowed table - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False # Blocked table - result = await plugin.evaluate("SELECT * FROM admin") + result = await evaluator.evaluate("SELECT * FROM admin") assert result.error is None assert result.matched is True @@ -1653,18 +1653,18 @@ async def test_dialect_with_table_restrictions(self): async def test_dialect_with_limit_enforcement(self): """Should enforce LIMIT constraints across dialects.""" for dialect in ["postgres", "mysql", "tsql", "oracle", "sqlite"]: - config = SQLControlEvaluatorPluginConfig( + 
config = SQLEvaluatorConfig( dialect=dialect, require_limit=True, max_limit=100 ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # No LIMIT - should fail - result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is True # With LIMIT - should pass - result = await plugin.evaluate("SELECT * FROM users LIMIT 50") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 50") assert result.error is None assert result.matched is False @@ -1672,22 +1672,22 @@ async def test_dialect_with_limit_enforcement(self): async def test_dialect_with_column_requirements(self): """Should enforce column requirements across dialects.""" for dialect in ["postgres", "mysql", "tsql", "oracle", "sqlite"]: - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( dialect=dialect, required_columns=["tenant_id"], column_context="where", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # With required column - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users WHERE tenant_id = 1" ) assert result.error is None assert result.matched is False # Without required column - should fail - result = await plugin.evaluate("SELECT * FROM users WHERE id = 1") + result = await evaluator.evaluate("SELECT * FROM users WHERE id = 1") assert result.error is None assert result.matched is True @@ -1698,27 +1698,27 @@ class TestSQLDialectEdgeCases: @pytest.mark.asyncio async def test_mysql_case_insensitive_table_names(self): """MySQL table names are case-insensitive on most systems.""" - config = SQLControlEvaluatorPluginConfig(dialect="mysql", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="mysql", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # Both should work (MySQL normalizes to lowercase) 
- result = await plugin.evaluate("SELECT * FROM users") + result = await evaluator.evaluate("SELECT * FROM users") assert result.error is None assert result.matched is False - result = await plugin.evaluate("SELECT * FROM USERS") + result = await evaluator.evaluate("SELECT * FROM USERS") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_tsql_function_syntax(self): """T-SQL has different function syntax than standard SQL.""" - config = SQLControlEvaluatorPluginConfig(dialect="tsql", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="tsql", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) # T-SQL datetime function - using DATEADD instead of GETDATE # since GETDATE is already parsed correctly - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT TOP 10 * FROM users " "WHERE created > DATEADD(day, -7, GETDATE())" ) @@ -1728,10 +1728,10 @@ async def test_tsql_function_syntax(self): @pytest.mark.asyncio async def test_oracle_schema_prefix(self): """Oracle uses schema.table.column notation.""" - config = SQLControlEvaluatorPluginConfig(dialect="oracle", allowed_tables=["users"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="oracle", allowed_tables=["users"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT u.id, u.name FROM schema.users u" ) assert result.error is None @@ -1741,24 +1741,24 @@ async def test_oracle_schema_prefix(self): async def test_dialect_with_unicode_identifiers(self): """All dialects should handle unicode in identifiers.""" for dialect in ["postgres", "mysql", "tsql", "oracle", "sqlite"]: - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( dialect=dialect, allowed_tables=["usuarios"], # Spanish for "users" ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) - 
result = await plugin.evaluate("SELECT * FROM usuarios") + result = await evaluator.evaluate("SELECT * FROM usuarios") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_sqlite_with_complex_query(self): """SQLite should handle complex queries correctly.""" - config = SQLControlEvaluatorPluginConfig(dialect="sqlite", allowed_tables=["users", "orders"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(dialect="sqlite", allowed_tables=["users", "orders"]) + evaluator = SQLEvaluator(config) # Complex SQLite query with JOIN and WHERE - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT u.name, o.total FROM users u " "JOIN orders o ON u.id = o.user_id " "WHERE o.total > 100" @@ -1773,11 +1773,11 @@ class TestOperationSecurityBypass: @pytest.mark.asyncio async def test_delete_in_cte_is_detected(self): """DELETE in CTE should be detected and blocked.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DELETE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DELETE"]) + evaluator = SQLEvaluator(config) # DELETE hidden in CTE - result = await plugin.evaluate( + result = await evaluator.evaluate( "WITH deleted AS (" "DELETE FROM users WHERE id = 1 RETURNING *" ") SELECT * FROM deleted" @@ -1789,10 +1789,10 @@ async def test_delete_in_cte_is_detected(self): @pytest.mark.asyncio async def test_update_in_cte_is_detected(self): """UPDATE in CTE should be detected.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["UPDATE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["UPDATE"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "WITH updated AS (" "UPDATE users SET name = 'test' WHERE id = 1 RETURNING *" ") SELECT * FROM updated" @@ -1804,10 +1804,10 @@ async def test_update_in_cte_is_detected(self): 
@pytest.mark.asyncio async def test_insert_in_nested_cte_is_detected(self): """INSERT in nested CTE should be detected.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["INSERT"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["INSERT"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "WITH outer_cte AS (" " WITH inner_cte AS (" " INSERT INTO users (name) VALUES ('test') RETURNING *" @@ -1821,11 +1821,11 @@ async def test_insert_in_nested_cte_is_detected(self): @pytest.mark.asyncio async def test_select_with_delete_subquery_in_from(self): """DELETE in SELECT's FROM subquery should be detected.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DELETE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DELETE"]) + evaluator = SQLEvaluator(config) # Use nested SELECT with CTE pattern - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM (" " WITH deleted AS (DELETE FROM users WHERE id = 1 RETURNING *) " " SELECT * FROM deleted" @@ -1838,10 +1838,10 @@ async def test_select_with_delete_subquery_in_from(self): @pytest.mark.asyncio async def test_multiple_operations_in_ctes(self): """Multiple different operations in CTEs should all be detected.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DELETE", "UPDATE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DELETE", "UPDATE"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "WITH deleted AS (DELETE FROM users WHERE id = 1 RETURNING *), " "updated AS (UPDATE orders SET status = 'done' WHERE id = 2 RETURNING *) " "SELECT * FROM deleted UNION ALL SELECT * FROM updated" @@ -1858,15 +1858,15 @@ class TestMultiTenantRLSSecurityBypass: @pytest.mark.asyncio async def 
test_top_level_scope_blocks_subquery_tenant_filter(self): """top_level scope requires tenant_id in outer WHERE, not subquery.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="where", column_context_scope="top_level", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # tenant_id only in subquery - should FAIL with top_level scope - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users " "WHERE id IN (SELECT user_id FROM orders WHERE tenant_id = 123)" ) @@ -1877,15 +1877,15 @@ async def test_top_level_scope_blocks_subquery_tenant_filter(self): @pytest.mark.asyncio async def test_top_level_scope_passes_with_outer_tenant_filter(self): """top_level scope should pass when tenant_id in outer WHERE.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="where", column_context_scope="top_level", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # tenant_id in outer WHERE - should PASS - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users " "WHERE tenant_id = 123 AND id IN (SELECT user_id FROM orders)" ) @@ -1895,15 +1895,15 @@ async def test_top_level_scope_passes_with_outer_tenant_filter(self): @pytest.mark.asyncio async def test_all_scope_backward_compatible(self): """'all' scope should find tenant_id in any WHERE (backward compatible).""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="where", column_context_scope="all", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # tenant_id in subquery - should PASS with 'all' scope - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users " "WHERE id IN (SELECT user_id FROM orders WHERE tenant_id = 123)" ) @@ 
-1913,15 +1913,15 @@ async def test_all_scope_backward_compatible(self): @pytest.mark.asyncio async def test_default_scope_is_all(self): """Default column_context_scope should be 'all' for backward compatibility.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="where", # column_context_scope not specified, should default to "all" ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # Should behave like scope="all" - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users " "WHERE id IN (SELECT user_id FROM orders WHERE tenant_id = 123)" ) @@ -1931,15 +1931,15 @@ async def test_default_scope_is_all(self): @pytest.mark.asyncio async def test_select_context_with_top_level_scope(self): """top_level scope with select context only checks outer SELECT.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["tenant_id"], column_context="select", column_context_scope="top_level", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # tenant_id only in subquery SELECT - should FAIL - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT id, name FROM users " "WHERE id IN (SELECT tenant_id FROM orders)" ) @@ -1953,11 +1953,11 @@ class TestLimitBypassSubqueries: @pytest.mark.asyncio async def test_subquery_without_limit_is_blocked(self): """Subquery without LIMIT should be blocked when require_limit=True.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) # Outer has LIMIT, inner doesn't - should FAIL - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM (SELECT * FROM huge_table) AS t LIMIT 10" ) assert result.error is None @@ -1967,10 +1967,10 @@ async def 
test_subquery_without_limit_is_blocked(self): @pytest.mark.asyncio async def test_all_subqueries_with_limit_passes(self): """All SELECTs with LIMIT should pass.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM (SELECT * FROM users LIMIT 100) AS t LIMIT 10" ) assert result.error is None @@ -1979,11 +1979,11 @@ async def test_all_subqueries_with_limit_passes(self): @pytest.mark.asyncio async def test_nested_subqueries_all_need_limit(self): """All nested subqueries need LIMIT.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) # Deepest subquery missing LIMIT - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM (" " SELECT * FROM (" " SELECT * FROM users" @@ -1997,11 +1997,11 @@ async def test_nested_subqueries_all_need_limit(self): @pytest.mark.asyncio async def test_max_limit_enforced_on_subqueries(self): """max_limit should be enforced on all subqueries.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True, max_limit=100) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True, max_limit=100) + evaluator = SQLEvaluator(config) # Subquery exceeds max_limit - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM (SELECT * FROM users LIMIT 500) AS t LIMIT 10" ) assert result.error is None @@ -2011,10 +2011,10 @@ async def test_max_limit_enforced_on_subqueries(self): @pytest.mark.asyncio async def test_cte_without_limit_is_blocked(self): """CTE SELECT without LIMIT should be blocked.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = 
SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "WITH user_data AS (SELECT * FROM users) " "SELECT * FROM user_data LIMIT 10" ) @@ -2025,16 +2025,16 @@ async def test_cte_without_limit_is_blocked(self): @pytest.mark.asyncio async def test_max_result_window_enforced(self): """max_result_window should prevent deep pagination.""" - config = SQLControlEvaluatorPluginConfig(max_result_window=10000) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_result_window=10000) + evaluator = SQLEvaluator(config) # Within limit: 100 + 9900 = 10000 - should PASS - result = await plugin.evaluate("SELECT * FROM users LIMIT 100 OFFSET 9900") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 100 OFFSET 9900") assert result.error is None assert result.matched is False # Exceeds limit: 10 + 10000 = 10010 > 10000 - should FAIL - result = await plugin.evaluate("SELECT * FROM users LIMIT 10 OFFSET 10000") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 10 OFFSET 10000") assert result.error is None assert result.matched is True assert ( @@ -2044,22 +2044,22 @@ async def test_max_result_window_enforced(self): @pytest.mark.asyncio async def test_large_offset_without_max_result_window(self): """Without max_result_window, large OFFSET should be allowed.""" - config = SQLControlEvaluatorPluginConfig() # No max_result_window - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig() # No max_result_window + evaluator = SQLEvaluator(config) # Large OFFSET but no restriction - should PASS - result = await plugin.evaluate("SELECT * FROM users LIMIT 10 OFFSET 1000000") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 10 OFFSET 1000000") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_max_result_window_on_subqueries(self): 
"""max_result_window should be enforced on subqueries.""" - config = SQLControlEvaluatorPluginConfig(max_result_window=1000) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_result_window=1000) + evaluator = SQLEvaluator(config) # Subquery exceeds max_result_window - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM (" "SELECT * FROM users LIMIT 10 OFFSET 1000" ") AS t LIMIT 10" @@ -2073,17 +2073,17 @@ async def test_max_result_window_on_subqueries(self): @pytest.mark.asyncio async def test_max_limit_and_max_result_window_together(self): """Both max_limit and max_result_window should be enforced.""" - config = SQLControlEvaluatorPluginConfig(max_limit=100, max_result_window=10000) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_limit=100, max_result_window=10000) + evaluator = SQLEvaluator(config) # Exceeds max_limit - should FAIL - result = await plugin.evaluate("SELECT * FROM users LIMIT 500") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 500") assert result.error is None assert result.matched is True assert "500" in result.message # Within max_limit but exceeds max_result_window - should FAIL - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT * FROM users LIMIT 100 OFFSET 10000" ) assert result.error is None @@ -2093,7 +2093,7 @@ async def test_max_limit_and_max_result_window_together(self): ) # Within both limits - should PASS - result = await plugin.evaluate("SELECT * FROM users LIMIT 100 OFFSET 9000") + result = await evaluator.evaluate("SELECT * FROM users LIMIT 100 OFFSET 9000") assert result.error is None assert result.matched is False @@ -2104,44 +2104,44 @@ class TestSelectColumnExtractionFixed: @pytest.mark.asyncio async def test_column_in_function_is_extracted(self): """Columns in functions should be extracted.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( 
required_columns=["user_id"], column_context="select", column_context_scope="top_level", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # user_id in COUNT() function - should be extracted - result = await plugin.evaluate("SELECT COUNT(user_id), name FROM users") + result = await evaluator.evaluate("SELECT COUNT(user_id), name FROM users") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_column_in_expression_is_extracted(self): """Columns in expressions should be extracted.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["price"], column_context="select", column_context_scope="top_level", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) # price in arithmetic expression - result = await plugin.evaluate("SELECT price * 1.1, name FROM products") + result = await evaluator.evaluate("SELECT price * 1.1, name FROM products") assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_column_in_case_is_extracted(self): """Columns in CASE expressions should be extracted.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["status"], column_context="select", column_context_scope="top_level", ) - plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT CASE WHEN status = 'active' THEN 1 ELSE 0 END FROM users" ) assert result.error is None @@ -2150,15 +2150,15 @@ async def test_column_in_case_is_extracted(self): @pytest.mark.asyncio async def test_multiple_columns_in_coalesce(self): """Multiple columns in COALESCE should be extracted.""" - config = SQLControlEvaluatorPluginConfig( + config = SQLEvaluatorConfig( required_columns=["user_id", "guest_id"], column_presence_logic="any", column_context="select", column_context_scope="top_level", ) - 
plugin = SQLControlEvaluatorPlugin(config) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate( + result = await evaluator.evaluate( "SELECT COALESCE(user_id, guest_id) FROM sessions" ) assert result.error is None @@ -2171,10 +2171,10 @@ class TestNewOperationDetection: @pytest.mark.asyncio async def test_commit_operation_detected(self): """COMMIT should be detected and blockable.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["COMMIT"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["COMMIT"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("COMMIT") + result = await evaluator.evaluate("COMMIT") assert result.error is None assert result.matched is True assert "COMMIT" in result.metadata["blocked"] @@ -2182,10 +2182,10 @@ async def test_commit_operation_detected(self): @pytest.mark.asyncio async def test_rollback_operation_detected(self): """ROLLBACK should be detected and blockable.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["ROLLBACK"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["ROLLBACK"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("ROLLBACK") + result = await evaluator.evaluate("ROLLBACK") assert result.error is None assert result.matched is True assert "ROLLBACK" in result.metadata["blocked"] @@ -2193,10 +2193,10 @@ async def test_rollback_operation_detected(self): @pytest.mark.asyncio async def test_show_operation_detected(self): """SHOW parses to COMMAND (sqlglot fallback for unsupported syntax).""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["COMMAND"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["COMMAND"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("SHOW TABLES") + result = await evaluator.evaluate("SHOW TABLES") assert result.error is None assert result.matched 
is True assert "COMMAND" in result.metadata["blocked"] @@ -2204,10 +2204,10 @@ async def test_show_operation_detected(self): @pytest.mark.asyncio async def test_describe_operation_detected(self): """DESCRIBE should be detected and blockable.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DESCRIBE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DESCRIBE"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("DESCRIBE users") + result = await evaluator.evaluate("DESCRIBE users") assert result.error is None assert result.matched is True assert "DESCRIBE" in result.metadata["blocked"] @@ -2215,10 +2215,10 @@ async def test_describe_operation_detected(self): @pytest.mark.asyncio async def test_set_operation_detected(self): """SET should be detected and blockable.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["SET"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["SET"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("SET search_path = public") + result = await evaluator.evaluate("SET search_path = public") assert result.error is None assert result.matched is True assert "SET" in result.metadata["blocked"] @@ -2226,10 +2226,10 @@ async def test_set_operation_detected(self): @pytest.mark.asyncio async def test_use_operation_detected(self): """USE should be detected and blockable.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["USE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["USE"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("USE database_name") + result = await evaluator.evaluate("USE database_name") assert result.error is None assert result.matched is True assert "USE" in result.metadata["blocked"] @@ -2237,10 +2237,10 @@ async def test_use_operation_detected(self): @pytest.mark.asyncio async def 
test_copy_operation_detected(self): """COPY should be detected and blockable.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["COPY"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["COPY"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("COPY users TO '/tmp/users.csv'") + result = await evaluator.evaluate("COPY users TO '/tmp/users.csv'") assert result.error is None assert result.matched is True assert "COPY" in result.metadata["blocked"] @@ -2249,23 +2249,23 @@ async def test_copy_operation_detected(self): async def test_lock_operation_fails_to_parse(self): """LOCK TABLE fails to parse in sqlglot - blocked as invalid SQL.""" # Need a control configured for parsing to be attempted - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DELETE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DELETE"]) + evaluator = SQLEvaluator(config) # LOCK TABLE doesn't parse, so it's blocked as invalid SQL - result = await plugin.evaluate("LOCK TABLE users IN ACCESS EXCLUSIVE MODE") + result = await evaluator.evaluate("LOCK TABLE users IN ACCESS EXCLUSIVE MODE") assert result.error is None assert result.matched is True # Invalid SQL is blocked - assert result.error is None # Not a plugin error + assert result.error is None # Not an evaluator error assert "pars" in result.message.lower() @pytest.mark.asyncio async def test_analyze_operation_detected(self): """ANALYZE should be detected and blockable.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["ANALYZE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["ANALYZE"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("ANALYZE users") + result = await evaluator.evaluate("ANALYZE users") assert result.error is None assert result.matched is True assert "ANALYZE" in result.metadata["blocked"] @@ -2273,10
+2273,10 @@ async def test_analyze_operation_detected(self): @pytest.mark.asyncio async def test_comment_operation_detected(self): """COMMENT should be detected and blockable.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["COMMENT"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["COMMENT"]) + evaluator = SQLEvaluator(config) - result = await plugin.evaluate("COMMENT ON TABLE users IS 'User data'") + result = await evaluator.evaluate("COMMENT ON TABLE users IS 'User data'") assert result.error is None assert result.matched is True assert "COMMENT" in result.metadata["blocked"] @@ -2288,8 +2288,8 @@ class TestQueryComplexityLimits: @pytest.mark.asyncio async def test_subquery_depth_limit_enforced(self): """Deeply nested subqueries should be blocked.""" - config = SQLControlEvaluatorPluginConfig(max_subquery_depth=2) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_subquery_depth=2) + evaluator = SQLEvaluator(config) # Depth 3: exceeds limit query = """ @@ -2301,7 +2301,7 @@ async def test_subquery_depth_limit_enforced(self): ) AS level2 ) AS level1 """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is True assert "subquery depth" in result.message.lower() @@ -2311,8 +2311,8 @@ async def test_subquery_depth_limit_enforced(self): @pytest.mark.asyncio async def test_subquery_depth_within_limit(self): """Shallow subqueries should pass.""" - config = SQLControlEvaluatorPluginConfig(max_subquery_depth=2) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_subquery_depth=2) + evaluator = SQLEvaluator(config) # Depth 2: at limit query = """ @@ -2322,15 +2322,15 @@ async def test_subquery_depth_within_limit(self): ) AS level2 ) AS level1 """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert 
result.matched is False @pytest.mark.asyncio async def test_max_joins_enforced(self): """Too many joins should be blocked.""" - config = SQLControlEvaluatorPluginConfig(max_joins=3) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_joins=3) + evaluator = SQLEvaluator(config) # 4 joins: exceeds limit query = """ @@ -2340,7 +2340,7 @@ async def test_max_joins_enforced(self): JOIN categories ON products.category_id = categories.id JOIN brands ON products.brand_id = brands.id """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is True assert "JOIN" in result.message @@ -2350,8 +2350,8 @@ async def test_max_joins_enforced(self): @pytest.mark.asyncio async def test_max_joins_within_limit(self): """Reasonable number of joins should pass.""" - config = SQLControlEvaluatorPluginConfig(max_joins=3) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_joins=3) + evaluator = SQLEvaluator(config) # 3 joins: at limit query = """ @@ -2360,15 +2360,15 @@ async def test_max_joins_within_limit(self): JOIN products ON orders.product_id = products.id JOIN categories ON products.category_id = categories.id """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is False @pytest.mark.asyncio async def test_max_union_count_enforced(self): """Too many UNION operations should be blocked.""" - config = SQLControlEvaluatorPluginConfig(max_union_count=2) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_union_count=2) + evaluator = SQLEvaluator(config) # 3 UNIONs: exceeds limit query = """ @@ -2380,7 +2380,7 @@ async def test_max_union_count_enforced(self): UNION ALL SELECT * FROM partners """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is True assert "set 
operations" in result.message.lower() @@ -2390,8 +2390,8 @@ async def test_max_union_count_enforced(self): @pytest.mark.asyncio async def test_max_union_count_within_limit(self): """Reasonable UNION chains should pass.""" - config = SQLControlEvaluatorPluginConfig(max_union_count=2) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_union_count=2) + evaluator = SQLEvaluator(config) # 2 UNIONs: at limit query = """ @@ -2401,7 +2401,7 @@ async def test_max_union_count_within_limit(self): UNION ALL SELECT * FROM vendors """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is False @@ -2412,8 +2412,8 @@ class TestEdgeCasesAlreadyFixed: @pytest.mark.asyncio async def test_union_all_parts_checked_for_limit(self): """Issue #19: All parts of UNION should be checked for LIMIT.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) # One part missing LIMIT - should fail query = """ @@ -2421,7 +2421,7 @@ async def test_union_all_parts_checked_for_limit(self): UNION ALL SELECT * FROM customers """ - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is True assert "LIMIT" in result.message @@ -2429,17 +2429,17 @@ async def test_union_all_parts_checked_for_limit(self): @pytest.mark.asyncio async def test_insert_select_validated(self): """Issue #20: SELECT in INSERT...SELECT should be validated.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) # INSERT...SELECT without LIMIT - should fail - result = await plugin.evaluate("INSERT INTO backup SELECT * FROM users") + result = await evaluator.evaluate("INSERT 
INTO backup SELECT * FROM users") assert result.error is None assert result.matched is True assert "LIMIT" in result.message # INSERT...SELECT with LIMIT - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "INSERT INTO backup SELECT * FROM users LIMIT 100" ) assert result.error is None @@ -2448,11 +2448,11 @@ async def test_insert_select_validated(self): @pytest.mark.asyncio async def test_create_view_validated(self): """Issue #21: SELECT in CREATE VIEW should be validated.""" - config = SQLControlEvaluatorPluginConfig(require_limit=True) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(require_limit=True) + evaluator = SQLEvaluator(config) # CREATE VIEW without LIMIT - should fail - result = await plugin.evaluate( + result = await evaluator.evaluate( "CREATE VIEW active_users AS SELECT * FROM users WHERE active = true" ) assert result.error is None @@ -2460,7 +2460,7 @@ async def test_create_view_validated(self): assert "LIMIT" in result.message # CREATE VIEW with LIMIT - should pass - result = await plugin.evaluate( + result = await evaluator.evaluate( "CREATE VIEW active_users AS SELECT * FROM users WHERE active = true LIMIT 1000" ) assert result.error is None @@ -2473,12 +2473,12 @@ class TestEnhancedMetadata: @pytest.mark.asyncio async def test_short_query_metadata(self): """Short queries should have full snippet.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DELETE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DELETE"]) + evaluator = SQLEvaluator(config) # Blocked operation to trigger metadata query = "DELETE FROM users" - result = await plugin.evaluate(query) + result = await evaluator.evaluate(query) assert result.error is None assert result.matched is True assert "query_snippet" in result.metadata or "query" in result.metadata @@ -2491,13 +2491,13 @@ async def test_short_query_metadata(self): @pytest.mark.asyncio async def 
test_long_query_smart_truncation(self): """Long queries should have beginning and end with ellipsis.""" - config = SQLControlEvaluatorPluginConfig(max_limit=10) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(max_limit=10) + evaluator = SQLEvaluator(config) # Create a very long query that violates max_limit long_query = "SELECT " + ", ".join(f"col{i}" for i in range(100)) + " FROM users WHERE " + " AND ".join(f"field{i} = {i}" for i in range(50)) + " LIMIT 1000" - result = await plugin.evaluate(long_query) + result = await evaluator.evaluate(long_query) assert result.error is None assert result.matched is True assert result.metadata is not None @@ -2510,13 +2510,13 @@ async def test_long_query_smart_truncation(self): @pytest.mark.asyncio async def test_query_hash_consistent(self): """Same query should produce same hash.""" - config = SQLControlEvaluatorPluginConfig(blocked_operations=["DELETE"]) - plugin = SQLControlEvaluatorPlugin(config) + config = SQLEvaluatorConfig(blocked_operations=["DELETE"]) + evaluator = SQLEvaluator(config) query = "DELETE FROM users WHERE id = 1" - result1 = await plugin.evaluate(query) - result2 = await plugin.evaluate(query) + result1 = await evaluator.evaluate(query) + result2 = await evaluator.evaluate(query) assert result1.matched is True assert result2.matched is True diff --git a/examples/README.md b/examples/README.md index 064637a8..a48ff630 100644 --- a/examples/README.md +++ b/examples/README.md @@ -39,7 +39,7 @@ uv run python examples/agent_control_demo/update_controls.py --block-ssn - `setup_controls.py` - Create and configure controls via SDK - `demo_agent.py` - Agent that uses `@control` decorator with server-side policies - `update_controls.py` - Dynamically update controls without code changes -- `agent_luna_demo.py` - Luna-2 plugin integration for AI safety checks +- `agent_luna_demo.py` - Luna-2 evaluator integration for AI safety checks ### 💬 Simple Chatbot (`demo_chatbot.py`) @@ -144,7 
+144,7 @@ async with AgentControlClient() as client: scope=ControlScope(step_types=["llm_inference"], stages=["post"]), selector=ControlSelector(path="output"), evaluator=EvaluatorConfig( - plugin="regex", + name="regex", config={"pattern": r"\b\d{3}-\d{2}-\d{4}\b"} ), action=ControlAction(decision="deny") diff --git a/examples/agent_control_demo/setup_controls.py b/examples/agent_control_demo/setup_controls.py index 489926a9..9a4bd653 100644 --- a/examples/agent_control_demo/setup_controls.py +++ b/examples/agent_control_demo/setup_controls.py @@ -129,7 +129,7 @@ async def create_regex_control(client: AgentControlClient) -> int: "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, # Check AFTER "selector": {"path": "output"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { "pattern": r"\b\d{3}-\d{2}-\d{4}\b", # SSN pattern "flags": [] @@ -161,7 +161,7 @@ async def create_list_control(client: AgentControlClient) -> int: "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, # Check BEFORE "selector": {"path": "input"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["DROP", "DELETE", "TRUNCATE", "ALTER", "GRANT"], "logic": "any", # Block if ANY keyword is found @@ -307,7 +307,7 @@ async def update_control(client: AgentControlClient, control_id: int) -> None: "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": [ "DROP", "DELETE", "TRUNCATE", "ALTER", "GRANT", diff --git a/examples/agent_control_demo/update_controls.py b/examples/agent_control_demo/update_controls.py index 3c737895..cf031640 100644 --- a/examples/agent_control_demo/update_controls.py +++ b/examples/agent_control_demo/update_controls.py @@ -62,7 +62,7 @@ async def allow_ssn(client: AgentControlClient, control_id: int) -> None: "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, 
"evaluator": { - "plugin": "regex", + "name": "regex", "config": { "pattern": r"\b\d{3}-\d{2}-\d{4}\b", "flags": [] @@ -102,7 +102,7 @@ async def block_ssn(client: AgentControlClient, control_id: int) -> None: "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { "pattern": r"\b\d{3}-\d{2}-\d{4}\b", "flags": [] diff --git a/examples/customer_support_agent/README.md b/examples/customer_support_agent/README.md index 51bdd1ac..7b86c803 100644 --- a/examples/customer_support_agent/README.md +++ b/examples/customer_support_agent/README.md @@ -12,8 +12,8 @@ This example demonstrates how to integrate the `agent-control` SDK into an exist ## Quick Start ```bash -# 1. Install SDK and plugins (first time only) -pip install -e sdks/python -e plugins +# 1. Install SDK and evaluators (first time only) +pip install -e sdks/python -e evaluators # 2. Start all services (database, server, UI, demo controls) ./examples/customer_support_agent/demo.sh start @@ -48,10 +48,10 @@ When you open the UI, you'll see the agent with controls already configured. ### First-Time Setup -1. **Install the SDK and plugins**: +1. **Install the SDK and evaluators**: ```bash pip install -e sdks/python - pip install -e plugins + pip install -e evaluators ``` 2. 
**Install UI dependencies**: @@ -267,7 +267,7 @@ check_stage: post selector: path: output evaluator: - plugin: regex + name: regex config: pattern: '\d{3}-\d{2}-\d{4}' # SSN pattern action: @@ -283,7 +283,7 @@ check_stage: pre selector: path: input evaluator: - plugin: regex + name: regex config: pattern: '(?i)(ignore.*instructions|system:|you are now)' action: @@ -299,7 +299,7 @@ check_stage: pre selector: path: input evaluator: - plugin: luna2 + name: luna2 config: threshold: 0.8 action: diff --git a/examples/customer_support_agent/setup_demo_controls.py b/examples/customer_support_agent/setup_demo_controls.py index 8e6daf32..dc959972 100644 --- a/examples/customer_support_agent/setup_demo_controls.py +++ b/examples/customer_support_agent/setup_demo_controls.py @@ -39,7 +39,7 @@ "enabled": True, "selector": {"path": "output"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": r"\d{3}-\d{2}-\d{4}"}, }, "action": { @@ -57,7 +57,7 @@ "enabled": True, "selector": {"path": "input"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { "pattern": r"(?i)(ignore.{0,20}(previous|prior|above).{0,20}instructions|you are now|system:|forget everything|disregard)" }, @@ -77,7 +77,7 @@ "enabled": True, "selector": {"path": "input"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b"}, }, "action": { @@ -101,7 +101,7 @@ "tool_names": ["lookup_customer"], # Only applies to this exact tool }, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { "pattern": r"(?i)(select|insert|update|delete|drop|union|--|;)" }, @@ -124,7 +124,7 @@ "tool_names": ["create_ticket"], }, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": r".*"}, # Always matches }, "action": { @@ -149,7 +149,7 @@ "tool_name_regex": r"(search|lookup)", }, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { # Simple profanity pattern for demo "pattern": 
r"(?i)\b(badword|offensive|inappropriate)\b" @@ -173,7 +173,7 @@ "tool_name_regex": r".*ticket.*", # Any tool with 'ticket' in name }, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["high", "critical", "urgent"], "logic": "any", @@ -203,7 +203,7 @@ "tool_names": ["create_ticket"], }, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { # Email pattern "pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b" diff --git a/examples/galileo/README.md b/examples/galileo/README.md index 187c0334..8ef38a90 100644 --- a/examples/galileo/README.md +++ b/examples/galileo/README.md @@ -4,7 +4,7 @@ This directory contains examples demonstrating Agent Control integration with Ga ## Luna-2 Demo (`luna2_demo.py`) -Demonstrates using the Luna-2 plugin with a **CENTRAL stage** for toxicity detection. +Demonstrates using the Luna-2 evaluator with a **CENTRAL stage** for toxicity detection. ### Prerequisites @@ -37,7 +37,7 @@ The demo tests various inputs against a pre-configured toxicity detection stage: ### Central vs Local Stages - **Central Stage** (used in this demo): Rulesets and policies are pre-configured on the Galileo server. Simply reference the stage by name. -- **Local Stage**: Define rulesets at runtime in your code (see plugin documentation). +- **Local Stage**: Define rulesets at runtime in your code (see evaluator documentation). ### Expected Output @@ -73,11 +73,11 @@ Testing toxicity detection with Central Stage... 
- **"GALILEO_API_KEY environment variable is required"**: Export your API key - **"Project not found"**: Update `PROJECT_NAME` in the script to match your Galileo project - **"Stage not found"**: Update `STAGE_NAME` to match a stage in your project -- **Import errors**: Ensure you installed with `[luna2]` extra: `pip install agent-control-plugins[luna2]` +- **Import errors**: Ensure you installed with `[luna2]` extra: `pip install agent-control-evaluators[luna2]` ### Documentation - [Galileo Protect Overview](https://v2docs.galileo.ai/concepts/protect/overview) - [Luna-2 Python API Reference](https://v2docs.galileo.ai/sdk-api/python/reference/protect) -- [Agent Control Luna-2 Plugin](../../plugins/src/agent_control_plugins/luna2/) +- [Agent Control Luna-2 Evaluator](../../evaluators/src/agent_control_evaluators/luna2/) diff --git a/examples/galileo/luna2_demo.py b/examples/galileo/luna2_demo.py index e9668074..9be689f3 100644 --- a/examples/galileo/luna2_demo.py +++ b/examples/galileo/luna2_demo.py @@ -1,4 +1,4 @@ -"""Demo: Luna-2 Plugin for Toxicity Detection. +"""Demo: Luna-2 Evaluator for Toxicity Detection. This example demonstrates using Galileo Protect with a CENTRAL stage to detect toxic content in user inputs. 
@@ -37,15 +37,15 @@ # Import our direct API client (no SDK required) try: - from agent_control_plugins.luna2.client import ( + from agent_control_evaluators.luna2.client import ( GalileoProtectClient, Payload, ) GALILEO_AVAILABLE = True except ImportError as e: - print(f"❌ agent-control-plugins not available: {e}") - print(" Install with: pip install agent-control-plugins[luna2]") + print(f"❌ agent-control-evaluators not available: {e}") + print(" Install with: pip install agent-control-evaluators[luna2]") sys.exit(1) diff --git a/examples/galileo/pyproject.toml b/examples/galileo/pyproject.toml index 0cfe44ec..a5e6510e 100644 --- a/examples/galileo/pyproject.toml +++ b/examples/galileo/pyproject.toml @@ -5,7 +5,7 @@ description = "Agent Control Luna-2 Galileo Protect Integration Example" readme = "README.md" requires-python = ">=3.12" dependencies = [ - "agent-control-plugins[luna2]", + "agent-control-evaluators[luna2]", "httpx>=0.24.0", ] diff --git a/examples/langchain/README.md b/examples/langchain/README.md index 584eac75..66e9ef33 100644 --- a/examples/langchain/README.md +++ b/examples/langchain/README.md @@ -6,7 +6,7 @@ This example demonstrates integrating Agent Control with a LangChain SQL agent t ### 1. Start the Agent Control Server -**IMPORTANT: You must start/restart the server to load the SQL plugin!** +**IMPORTANT: You must start/restart the server to load the SQL evaluator!** ```bash # From the repo root @@ -21,9 +21,9 @@ make run # OR: uv run --package agent-control-server uvicorn agent_control_server.main:app --port 8000 ``` -**Verify plugins are loaded:** +**Verify evaluators are loaded:** ```bash -curl http://localhost:8000/api/v1/plugins | python -m json.tool +curl http://localhost:8000/api/v1/evaluators | python -m json.tool # Should show: {"sql": {"name": "sql", "version": "1.0.0", ...}, ...} ``` @@ -96,7 +96,7 @@ The `@control()` decorator: ### 3. 
Server-Side SQL Control -The server evaluates the SQL using the `sql` plugin: +The server evaluates the SQL using the `sql` evaluator: - Parses the query - Checks for blocked operations (DROP, DELETE, etc.) - Validates LIMIT clauses @@ -111,9 +111,9 @@ If the control check fails with an error: ## Troubleshooting -### "Plugin 'sql' not found" +### "Evaluator 'sql' not found" -**Cause:** Server was started before plugins were installed, or using old code. +**Cause:** Server was started before evaluators were installed, or using old code. **Fix:** ```bash @@ -133,7 +133,7 @@ cd server && make run ### DROP TABLE still executes **Causes:** -1. Server not running or plugins not loaded +1. Server not running or evaluators not loaded 2. Control not assigned to agent's policy 3. Using old decorator code @@ -158,7 +158,7 @@ Step Payload: { ↓ Agent Control Server ↓ -SQL Plugin Evaluation +SQL Evaluator Evaluation ↓ DENY (blocks DROP) or ALLOW (safe query) ↓ diff --git a/examples/langchain/pyproject.toml b/examples/langchain/pyproject.toml index 3ee58f92..19ade2a3 100644 --- a/examples/langchain/pyproject.toml +++ b/examples/langchain/pyproject.toml @@ -6,7 +6,7 @@ requires-python = ">=3.12" dependencies = [ "agent-control-engine", "agent-control-models", - "agent-control-plugins", + "agent-control-evaluators", "agent-control-sdk", "langchain>=0.3.0", "langchain-community>=0.3.0", @@ -42,5 +42,5 @@ include = [ agent-control-sdk = { path = "../../sdks/python", editable = true } agent-control-models = { path = "../../models", editable = true } agent-control-engine = { path = "../../engine", editable = true } -agent-control-plugins = { path = "../../plugins", editable = true } +agent-control-evaluators = { path = "../../evaluators", editable = true } diff --git a/examples/langchain/setup_sql_controls.py b/examples/langchain/setup_sql_controls.py index 7168f41f..6f92f551 100644 --- a/examples/langchain/setup_sql_controls.py +++ b/examples/langchain/setup_sql_controls.py @@ -80,7 
+80,7 @@ async def setup_sql_controls(): "path": "input.query" }, "evaluator": { - "plugin": "sql", + "name": "sql", "config": { "blocked_operations": ["DROP", "DELETE", "TRUNCATE", "ALTER", "GRANT"], "allow_multi_statements": False, diff --git a/examples/langchain/sql_agent_protection.py b/examples/langchain/sql_agent_protection.py index 581c2639..2d301d59 100644 --- a/examples/langchain/sql_agent_protection.py +++ b/examples/langchain/sql_agent_protection.py @@ -124,7 +124,7 @@ async def safe_query_tool(query: str): print(error_msg) return error_msg except RuntimeError as e: - # Server-side error (e.g., plugin not loaded) + # Server-side error (e.g., evaluator not loaded) error_msg = f"⚠️ Safety check unavailable: {str(e)}" print(error_msg) return error_msg diff --git a/models/src/agent_control_models/__init__.py b/models/src/agent_control_models/__init__.py index 0ad31e93..a8ce893a 100644 --- a/models/src/agent_control_models/__init__.py +++ b/models/src/agent_control_models/__init__.py @@ -20,10 +20,10 @@ ControlSelector, EvaluatorConfig, EvaluatorResult, - JSONControlEvaluatorPluginConfig, - ListConfig, - RegexConfig, - SQLControlEvaluatorPluginConfig, + JSONEvaluatorConfig, + ListEvaluatorConfig, + RegexEvaluatorConfig, + SQLEvaluatorConfig, ) from .errors import ( ERROR_TITLES, @@ -41,6 +41,14 @@ EvaluationResponse, EvaluationResult, ) +from .evaluator import ( + Evaluator, + EvaluatorMetadata, + clear_evaluators, + get_all_evaluators, + get_evaluator, + register_evaluator, +) from .health import HealthResponse from .observability import ( BatchEventsRequest, @@ -52,14 +60,6 @@ StatsRequest, StatsResponse, ) -from .plugin import ( - PluginEvaluator, - PluginMetadata, - clear_plugins, - get_all_plugins, - get_plugin, - register_plugin, -) from .policy import Policy from .server import ( AgentSummary, @@ -106,18 +106,18 @@ "ControlSelector", "EvaluatorConfig", "EvaluatorResult", - # Plugin configs - "JSONControlEvaluatorPluginConfig", - "ListConfig", - 
"RegexConfig", - "SQLControlEvaluatorPluginConfig", - # Plugin system - "PluginEvaluator", - "PluginMetadata", - "register_plugin", - "get_plugin", - "get_all_plugins", - "clear_plugins", + # Evaluator configs + "JSONEvaluatorConfig", + "ListEvaluatorConfig", + "RegexEvaluatorConfig", + "SQLEvaluatorConfig", + # Evaluator system + "Evaluator", + "EvaluatorMetadata", + "register_evaluator", + "get_evaluator", + "get_all_evaluators", + "clear_evaluators", # Error models (RFC 7807 / Kubernetes / GitHub-style) "ProblemDetail", "ErrorCode", diff --git a/models/src/agent_control_models/controls.py b/models/src/agent_control_models/controls.py index 873c1f12..1d153958 100644 --- a/models/src/agent_control_models/controls.py +++ b/models/src/agent_control_models/controls.py @@ -149,12 +149,12 @@ def validate_stages( # ============================================================================= -# Plugin Config Models (used by plugin implementations) +# Evaluator Config Models (used by evaluator implementations) # ============================================================================= -class RegexConfig(BaseModel): - """Configuration for regex plugin.""" +class RegexEvaluatorConfig(BaseModel): + """Configuration for regex evaluator.""" pattern: str = Field(..., description="Regular expression pattern") flags: list[str] | None = Field(default=None, description="Regex flags") @@ -170,8 +170,8 @@ def validate_pattern(cls, v: str) -> str: return v -class ListConfig(BaseModel): - """Configuration for list plugin.""" +class ListEvaluatorConfig(BaseModel): + """Configuration for list evaluator.""" values: list[str | int | float] = Field( ..., description="List of values to match against" @@ -189,8 +189,8 @@ class ListConfig(BaseModel): case_sensitive: bool = Field(False, description="Whether matching is case sensitive") -class JSONControlEvaluatorPluginConfig(BaseModel): - """Configuration for JSON validation plugin. 
+class JSONEvaluatorConfig(BaseModel): + """Configuration for JSON validation evaluator. Multiple validation checks can be combined. Checks are evaluated in this order (fail-fast): 1. JSON syntax/validity (always - ensures data is valid JSON) @@ -426,8 +426,8 @@ def validate_has_checks(self) -> Self: return self -class SQLControlEvaluatorPluginConfig(BaseModel): - """Configuration for comprehensive SQL control plugin. +class SQLEvaluatorConfig(BaseModel): + """Configuration for comprehensive SQL control evaluator. Validates SQL query strings using AST-based analysis via sqlglot. Controls are evaluated in order: @@ -725,21 +725,21 @@ def validate_config(self) -> Self: class EvaluatorConfig(BaseModel): - """Evaluator configuration. See GET /plugins for available plugins and schemas. + """Evaluator configuration. See GET /evaluators for available evaluators and schemas. - Plugin reference formats: + Evaluator reference formats: - Built-in: "regex", "list" - Agent-scoped: "my-agent:my-evaluator" (validated in endpoint, not here) """ - plugin: str = Field( + name: str = Field( ..., - description="Plugin name or agent-scoped reference (agent:evaluator)", + description="Evaluator name or agent-scoped reference (agent:evaluator)", examples=["regex", "list", "my-agent:pii-detector"], ) config: dict[str, Any] = Field( ..., - description="Plugin-specific configuration", + description="Evaluator-specific configuration", examples=[ {"pattern": r"\d{3}-\d{2}-\d{4}"}, {"values": ["admin"], "logic": "any"}, @@ -747,23 +747,23 @@ class EvaluatorConfig(BaseModel): ) @model_validator(mode="after") - def validate_plugin_config(self) -> Self: - """Validate config against plugin's schema if plugin is registered. + def validate_evaluator_config(self) -> Self: + """Validate config against evaluator's schema if evaluator is registered. Agent-scoped evaluators (format: agent:evaluator) are validated in the endpoint where we have database access to look up the agent's schema. 
""" # Agent-scoped evaluators: defer validation to endpoint (needs DB access) - if ":" in self.plugin: + if ":" in self.name: return self - # Built-in plugins: validate config against plugin's config_model - from .plugin import get_plugin + # Built-in evaluators: validate config against evaluator's config_model + from .evaluator import get_evaluator - plugin_cls = get_plugin(self.plugin) - if plugin_cls: - plugin_cls.config_model(**self.config) - # If plugin not found, allow it (might be a server-side registered plugin) + evaluator_cls = get_evaluator(self.name) + if evaluator_cls: + evaluator_cls.config_model(**self.config) + # If evaluator not found, allow it (might be a server-side registered evaluator) return self @@ -797,7 +797,7 @@ class ControlDefinition(BaseModel): # What to check selector: ControlSelector = Field(..., description="What data to select from the payload") - # How to check (unified plugin-based evaluator) + # How to check (unified evaluator-based system) evaluator: EvaluatorConfig = Field(..., description="How to evaluate the selected data") # What to do @@ -816,7 +816,7 @@ class ControlDefinition(BaseModel): "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": { "pattern": r"\b\d{3}-\d{2}-\d{4}\b", }, @@ -834,16 +834,16 @@ class ControlDefinition(BaseModel): class EvaluatorResult(BaseModel): """Result from a control evaluator. - The `error` field indicates plugin failures, NOT validation failures: - - Set `error` for: plugin crashes, timeouts, missing dependencies, external service errors + The `error` field indicates evaluator failures, NOT validation failures: + - Set `error` for: evaluator crashes, timeouts, missing dependencies, external service errors - Do NOT set `error` for: invalid input, syntax errors, schema violations, constraint failures - When `error` is set, `matched` must be False (fail-open on plugin errors). 
+ When `error` is set, `matched` must be False (fail-open on evaluator errors). When `error` is None, `matched` reflects the actual validation result. This distinction allows: - - Clients to distinguish "data violated rules" from "plugin is broken" - - Observability systems to monitor plugin health separately from validation outcomes + - Clients to distinguish "data violated rules" from "evaluator is broken" + - Observability systems to monitor evaluator health separately from validation outcomes """ matched: bool = Field(..., description="Whether the pattern matched") diff --git a/models/src/agent_control_models/errors.py b/models/src/agent_control_models/errors.py index 33133c0c..ba1d0641 100644 --- a/models/src/agent_control_models/errors.py +++ b/models/src/agent_control_models/errors.py @@ -317,9 +317,9 @@ class ProblemDetail(BaseModel): }, { "resource": "Control", - "field": "data.evaluator.plugin", + "field": "data.evaluator.name", "code": "not_found", - "message": "Plugin 'nonexistent' not registered", + "message": "Evaluator 'nonexistent' not registered", }, ], "hint": "Check the evaluator configuration against the schema.", diff --git a/models/src/agent_control_models/plugin.py b/models/src/agent_control_models/evaluator.py similarity index 60% rename from models/src/agent_control_models/plugin.py rename to models/src/agent_control_models/evaluator.py index 4147a030..f1234c60 100644 --- a/models/src/agent_control_models/plugin.py +++ b/models/src/agent_control_models/evaluator.py @@ -1,4 +1,4 @@ -"""Plugin system base classes and registry.""" +"""Evaluator system base classes and registry.""" from __future__ import annotations @@ -20,14 +20,14 @@ @dataclass -class PluginMetadata: - """Metadata about a plugin. +class EvaluatorMetadata: + """Metadata about an evaluator. 
Attributes: - name: Unique plugin name (e.g., "regex", "galileo-luna2") - version: Plugin version string + name: Unique evaluator name (e.g., "regex", "galileo-luna2") + version: Evaluator version string description: Human-readable description - requires_api_key: Whether the plugin requires an API key + requires_api_key: Whether the evaluator requires an API key timeout_ms: Default timeout in milliseconds """ @@ -38,16 +38,16 @@ class PluginMetadata: timeout_ms: int = 10000 -class PluginEvaluator(ABC, Generic[ConfigT]): # noqa: UP046 +class Evaluator(ABC, Generic[ConfigT]): # noqa: UP046 """Base class for all evaluators (built-in, external, or custom). All evaluators follow the same pattern: 1. Define metadata and config_model as class variables 2. Implement evaluate() method - 3. Register with @register_plugin decorator + 3. Register with @register_evaluator decorator IMPORTANT - Instance Caching & Thread Safety: - Plugin instances are cached and reused across multiple evaluate() calls + Evaluator instances are cached and reused across multiple evaluate() calls when they have the same configuration. 
This means: - DO NOT store mutable request-scoped state on `self` @@ -74,15 +74,15 @@ async def evaluate(self, data): Example: ```python - from agent_control_models import PluginEvaluator, PluginMetadata, register_plugin + from agent_control_models import Evaluator, EvaluatorMetadata, register_evaluator class MyConfig(BaseModel): threshold: float = 0.5 - @register_plugin - class MyPlugin(PluginEvaluator[MyConfig]): - metadata = PluginMetadata( - name="my-plugin", + @register_evaluator + class MyEvaluator(Evaluator[MyConfig]): + metadata = EvaluatorMetadata( + name="my-evaluator", version="1.0.0", description="My custom evaluator", ) @@ -97,11 +97,11 @@ async def evaluate(self, data: Any) -> EvaluatorResult: ``` """ - metadata: ClassVar[PluginMetadata] + metadata: ClassVar[EvaluatorMetadata] config_model: ClassVar[type[BaseModel]] def __init__(self, config: ConfigT) -> None: - """Initialize plugin with validated config. + """Initialize evaluator with validated config. Args: config: Validated configuration (instance of config_model) @@ -110,7 +110,7 @@ def __init__(self, config: ConfigT) -> None: @classmethod def from_dict(cls, config_dict: dict[str, Any]) -> Self: - """Create plugin instance from raw config dict. + """Create evaluator instance from raw config dict. Validates config against config_model before creating instance. @@ -118,7 +118,7 @@ def from_dict(cls, config_dict: dict[str, Any]) -> Self: config_dict: Raw configuration dictionary Returns: - Plugin instance with validated config + Evaluator instance with validated config """ validated = cls.config_model(**config_dict) return cls(validated) # type: ignore[arg-type] @@ -142,81 +142,81 @@ def get_timeout_seconds(self) -> float: @classmethod def is_available(cls) -> bool: - """Check if plugin dependencies are satisfied. + """Check if evaluator dependencies are satisfied. - Override this method for plugins with optional dependencies. + Override this method for evaluators with optional dependencies. 
Return False to skip registration during discovery. Returns: - True if plugin can be used, False otherwise + True if evaluator can be used, False otherwise """ return True # ============================================================================= -# Plugin Registry +# Evaluator Registry # ============================================================================= -_PLUGIN_REGISTRY: dict[str, type[PluginEvaluator[Any]]] = {} +_EVALUATOR_REGISTRY: dict[str, type[Evaluator[Any]]] = {} -def register_plugin( - plugin_class: type[PluginEvaluator[Any]], -) -> type[PluginEvaluator[Any]]: - """Register a plugin class by its metadata name. +def register_evaluator( + evaluator_class: type[Evaluator[Any]], +) -> type[Evaluator[Any]]: + """Register an evaluator class by its metadata name. - Can be used as a decorator or called directly. Respects the plugin's - is_available() method - plugins with unavailable dependencies are + Can be used as a decorator or called directly. Respects the evaluator's + is_available() method - evaluators with unavailable dependencies are silently skipped. 
Args: - plugin_class: Plugin class to register + evaluator_class: Evaluator class to register Returns: - The same plugin class (for decorator usage) + The same evaluator class (for decorator usage) Raises: - ValueError: If plugin name already registered + ValueError: If evaluator name already registered """ - name = plugin_class.metadata.name + name = evaluator_class.metadata.name - # Check if plugin dependencies are satisfied - if not plugin_class.is_available(): - logger.debug(f"Plugin '{name}' not available (is_available=False), skipping") - return plugin_class + # Check if evaluator dependencies are satisfied + if not evaluator_class.is_available(): + logger.debug(f"Evaluator '{name}' not available (is_available=False), skipping") + return evaluator_class - if name in _PLUGIN_REGISTRY: + if name in _EVALUATOR_REGISTRY: # Allow re-registration of same class (e.g., during hot reload) - if _PLUGIN_REGISTRY[name] is plugin_class: - return plugin_class - raise ValueError(f"Plugin '{name}' is already registered") + if _EVALUATOR_REGISTRY[name] is evaluator_class: + return evaluator_class + raise ValueError(f"Evaluator '{name}' is already registered") - _PLUGIN_REGISTRY[name] = plugin_class - logger.debug(f"Registered plugin: {name} v{plugin_class.metadata.version}") - return plugin_class + _EVALUATOR_REGISTRY[name] = evaluator_class + logger.debug(f"Registered evaluator: {name} v{evaluator_class.metadata.version}") + return evaluator_class -def get_plugin(name: str) -> type[PluginEvaluator[Any]] | None: - """Get a registered plugin by name. +def get_evaluator(name: str) -> type[Evaluator[Any]] | None: + """Get a registered evaluator by name. 
Args: - name: Plugin name to look up + name: Evaluator name to look up Returns: - Plugin class if found, None otherwise + Evaluator class if found, None otherwise """ - return _PLUGIN_REGISTRY.get(name) + return _EVALUATOR_REGISTRY.get(name) -def get_all_plugins() -> dict[str, type[PluginEvaluator[Any]]]: - """Get all registered plugins. +def get_all_evaluators() -> dict[str, type[Evaluator[Any]]]: + """Get all registered evaluators. Returns: - Dictionary mapping plugin names to plugin classes + Dictionary mapping evaluator names to evaluator classes """ - return dict(_PLUGIN_REGISTRY) + return dict(_EVALUATOR_REGISTRY) -def clear_plugins() -> None: - """Clear all registered plugins. Useful for testing.""" - _PLUGIN_REGISTRY.clear() +def clear_evaluators() -> None: + """Clear all registered evaluators. Useful for testing.""" + _EVALUATOR_REGISTRY.clear() diff --git a/models/src/agent_control_models/observability.py b/models/src/agent_control_models/observability.py index 667b4b97..6366d0c1 100644 --- a/models/src/agent_control_models/observability.py +++ b/models/src/agent_control_models/observability.py @@ -30,7 +30,7 @@ class ControlExecutionEvent(BaseModel): - Context: agent, control, check stage, applies to - Result: action taken, whether matched, confidence score - Timing: when it happened, how long it took - - Optional details: evaluator plugin, selector path, errors, metadata + - Optional details: evaluator name, selector path, errors, metadata Attributes: control_execution_id: Unique ID for this specific control execution @@ -49,7 +49,7 @@ class ControlExecutionEvent(BaseModel): confidence: Confidence score from the evaluator (0.0-1.0) timestamp: When the control was executed (UTC) execution_duration_ms: How long the control evaluation took - evaluator_plugin: Name of the evaluator plugin used + evaluator_name: Name of the evaluator used selector_path: The selector path used to extract data error_message: Error message if evaluation failed metadata: 
Additional metadata for extensibility @@ -108,8 +108,8 @@ class ControlExecutionEvent(BaseModel): ) # Optional details - evaluator_plugin: str | None = Field( - default=None, description="Name of the evaluator plugin used" + evaluator_name: str | None = Field( + default=None, description="Name of the evaluator used" ) selector_path: str | None = Field( default=None, description="Selector path used to extract data" @@ -171,7 +171,7 @@ def validate_span_id(cls, v: str) -> str: "confidence": 0.95, "timestamp": "2025-01-09T10:30:00Z", "execution_duration_ms": 15.3, - "evaluator_plugin": "regex", + "evaluator_name": "regex", "selector_path": "input", } ] diff --git a/models/src/agent_control_models/server.py b/models/src/agent_control_models/server.py index e29742d3..1341134c 100644 --- a/models/src/agent_control_models/server.py +++ b/models/src/agent_control_models/server.py @@ -11,7 +11,7 @@ class EvaluatorSchema(BaseModel): """Schema for a custom evaluator registered with an agent. - Custom evaluators are PluginEvaluator classes deployed with the engine. + Custom evaluators are Evaluator classes deployed with the engine. This schema is registered via initAgent for validation and UI purposes. 
""" @@ -314,8 +314,8 @@ class EvaluatorConfigItem(BaseModel): description: str | None = Field( None, max_length=1000, description="Optional description" ) - plugin: str = Field(..., min_length=1, description="Plugin name (built-in or custom)") - config: dict[str, Any] = Field(..., description="Plugin-specific configuration") + evaluator: str = Field(..., min_length=1, description="Evaluator name (built-in or custom)") + config: dict[str, Any] = Field(..., description="Evaluator-specific configuration") created_at: str | None = Field(None, description="ISO 8601 created timestamp") updated_at: str | None = Field(None, description="ISO 8601 updated timestamp") @@ -333,8 +333,8 @@ class CreateEvaluatorConfigRequest(BaseModel): description: str | None = Field( None, max_length=1000, description="Optional description" ) - plugin: str = Field(..., min_length=1, description="Plugin name (built-in or custom)") - config: dict[str, Any] = Field(..., description="Plugin-specific configuration") + evaluator: str = Field(..., min_length=1, description="Evaluator name (built-in or custom)") + config: dict[str, Any] = Field(..., description="Evaluator-specific configuration") class UpdateEvaluatorConfigRequest(BaseModel): @@ -350,8 +350,8 @@ class UpdateEvaluatorConfigRequest(BaseModel): description: str | None = Field( None, max_length=1000, description="Optional description" ) - plugin: str = Field(..., min_length=1, description="Plugin name (built-in or custom)") - config: dict[str, Any] = Field(..., description="Plugin-specific configuration") + evaluator: str = Field(..., min_length=1, description="Evaluator name (built-in or custom)") + config: dict[str, Any] = Field(..., description="Evaluator-specific configuration") class ListEvaluatorConfigsResponse(BaseModel): diff --git a/plugins/README.md b/plugins/README.md deleted file mode 100644 index 355d7e45..00000000 --- a/plugins/README.md +++ /dev/null @@ -1,144 +0,0 @@ -# Agent Control Plugins - -Plugin implementations for 
agent-control. - -## Installation - -```bash -# Base package (no plugins) -pip install agent-control-plugins - -# With Luna-2 plugin (uses direct HTTP API, no Galileo SDK required) -pip install agent-control-plugins[luna2] -``` - -## Available Plugins - -### Luna-2 Plugin - -Galileo Luna-2 enterprise runtime protection plugin for real-time safety and quality checks. -This plugin calls the Galileo Protect API directly via HTTP - no Galileo SDK required. - -**Environment Variables:** -- `GALILEO_API_KEY`: Your Galileo API key (required) -- `GALILEO_CONSOLE_URL`: Galileo Console URL (optional, defaults to production) - -```python -import asyncio -from agent_control_plugins.luna2 import Luna2Plugin, Luna2Config - -# Configure -config = Luna2Config( - stage_type="local", - metric="input_toxicity", - operator="gt", - target_value=0.5, # Use numeric values for thresholds - galileo_project="my-project", -) - -# Create plugin instance with config -plugin = Luna2Plugin(config) - -# Evaluate (async) -async def check_content(): - result = await plugin.evaluate(data="Some text to check") - if result.matched: - print("Content flagged!") - return result - -asyncio.run(check_content()) -``` - -### Using the HTTP Client Directly - -You can also use the `GalileoProtectClient` directly for more control: - -```python -import asyncio -from agent_control_plugins.luna2 import GalileoProtectClient, Payload - -async def main(): - async with GalileoProtectClient() as client: - response = await client.invoke_protect( - payload=Payload(input="Hello world", output=""), - project_name="my-project", - stage_name="my-stage", - ) - print(f"Status: {response.status}") - -asyncio.run(main()) -``` - -## Creating Custom Plugins - -Extend `PluginEvaluator` to create your own plugins: - -```python -from typing import Any -from pydantic import BaseModel -from agent_control_models import ( - EvaluatorResult, - PluginEvaluator, - PluginMetadata, - register_plugin, -) - -class MyConfig(BaseModel): - 
threshold: float = 0.5 - -@register_plugin -class MyPlugin(PluginEvaluator[MyConfig]): - metadata = PluginMetadata( - name="my-plugin", - version="1.0.0", - description="My custom plugin", - ) - config_model = MyConfig - - async def evaluate(self, data: Any) -> EvaluatorResult: - # Your evaluation logic - return EvaluatorResult( - matched=True, - confidence=0.9, - message="Evaluation complete" - ) -``` - -### Entry Point Registration (Third-Party Plugins) - -For distributable plugins, register via Python entry points in your `pyproject.toml`: - -```toml -[project.entry-points."agent_control.plugins"] -my-plugin = "my_package.plugin:MyPlugin" -``` - -Plugins registered this way are automatically discovered when Agent Control starts. - -### Optional Dependencies with `is_available()` - -If your plugin has optional dependencies, override `is_available()` to check them: - -```python -# Check if optional dependency is installed -try: - import some_optional_lib - DEPS_AVAILABLE = True -except ImportError: - DEPS_AVAILABLE = False - -@register_plugin -class MyPlugin(PluginEvaluator[MyConfig]): - metadata = PluginMetadata(name="my-plugin", ...) - config_model = MyConfig - - @classmethod - def is_available(cls) -> bool: - """Only register if dependencies are installed.""" - return DEPS_AVAILABLE - - async def evaluate(self, data: Any) -> EvaluatorResult: - ... -``` - -When `is_available()` returns `False`, the plugin is silently skipped during registration. This allows plugins with optional dependencies (like Luna-2 which requires `httpx`) to be included in the package without causing import errors. diff --git a/plugins/src/agent_control_plugins/__init__.py b/plugins/src/agent_control_plugins/__init__.py deleted file mode 100644 index 3ab38173..00000000 --- a/plugins/src/agent_control_plugins/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Agent Control Plugins. - -This package contains plugin implementations for agent-control. 
-Built-in plugins (regex, list) are registered automatically on import. - -Available plugins: - - regex: Regular expression matching (built-in) - - list: List-based value matching (built-in) - - galileo-luna2: Galileo Luna-2 runtime protection (pip install agent-control-plugins[luna2]) - -Custom evaluators are PluginEvaluator classes deployed with the engine. -Their schemas are registered via initAgent for validation purposes. -""" - -from agent_control_models import PluginEvaluator, PluginMetadata, register_plugin - -# Import built-in plugins to auto-register them -from .builtin import ListPlugin, RegexPlugin, SQLControlEvaluatorPlugin - -__version__ = "0.1.0" - -__all__ = [ - "PluginEvaluator", - "PluginMetadata", - "register_plugin", - "RegexPlugin", - "ListPlugin", - "SQLControlEvaluatorPlugin", -] - diff --git a/plugins/src/agent_control_plugins/base.py b/plugins/src/agent_control_plugins/base.py deleted file mode 100644 index 5c27c400..00000000 --- a/plugins/src/agent_control_plugins/base.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Base classes for agent-control plugins. - -Re-exports from agent_control_models for backward compatibility. -""" - -from agent_control_models import PluginEvaluator, PluginMetadata, register_plugin - -__all__ = ["PluginEvaluator", "PluginMetadata", "register_plugin"] - diff --git a/plugins/src/agent_control_plugins/builtin/__init__.py b/plugins/src/agent_control_plugins/builtin/__init__.py deleted file mode 100644 index f01d0ff9..00000000 --- a/plugins/src/agent_control_plugins/builtin/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -"""Built-in plugins for agent-control. - -These plugins are automatically registered when this module is imported. 
-""" - -from .json import JSONControlEvaluatorPlugin -from .list import ListPlugin -from .regex import RegexPlugin -from .sql import SQLControlEvaluatorPlugin - -__all__ = ["JSONControlEvaluatorPlugin", "ListPlugin", "RegexPlugin", "SQLControlEvaluatorPlugin"] diff --git a/plugins/tests/__init__.py b/plugins/tests/__init__.py deleted file mode 100644 index fa9cfc8c..00000000 --- a/plugins/tests/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Tests for agent_control_plugins package.""" - diff --git a/plugins/tests/test_base.py b/plugins/tests/test_base.py deleted file mode 100644 index a228b8bb..00000000 --- a/plugins/tests/test_base.py +++ /dev/null @@ -1,145 +0,0 @@ -"""Tests for plugin base classes. - -New architecture: Plugins take config at __init__, evaluate() only takes data. -""" - -import pytest -from typing import Any - -from pydantic import BaseModel - -from agent_control_models import EvaluatorResult, PluginEvaluator, PluginMetadata - - -class MockConfig(BaseModel): - """Config model for mock plugin.""" - - should_match: bool = False - timeout_ms: int = 5000 - - -class MockPlugin(PluginEvaluator[MockConfig]): - """A mock plugin for testing.""" - - metadata = PluginMetadata( - name="mock-plugin", - version="1.0.0", - description="A mock plugin for testing", - requires_api_key=False, - timeout_ms=5000, - config_schema={"type": "object"}, - ) - config_model = MockConfig - - async def evaluate(self, data: Any) -> EvaluatorResult: - """Simple mock evaluation.""" - return EvaluatorResult( - matched=self.config.should_match, - confidence=1.0, - message="Mock evaluation", - metadata={"data": str(data)}, - ) - - -class TestPluginMetadata: - """Tests for PluginMetadata dataclass.""" - - def test_metadata_with_defaults(self): - """Test metadata with default values.""" - metadata = PluginMetadata( - name="test-plugin", - version="1.0.0", - description="Test plugin", - ) - - assert metadata.name == "test-plugin" - assert metadata.version == "1.0.0" - assert 
metadata.description == "Test plugin" - assert metadata.requires_api_key is False - assert metadata.timeout_ms == 10000 - assert metadata.config_schema is None - - def test_metadata_with_all_fields(self): - """Test metadata with all fields specified.""" - metadata = PluginMetadata( - name="full-plugin", - version="2.0.0", - description="Full plugin", - requires_api_key=True, - timeout_ms=15000, - config_schema={"type": "object", "properties": {}}, - ) - - assert metadata.name == "full-plugin" - assert metadata.version == "2.0.0" - assert metadata.requires_api_key is True - assert metadata.timeout_ms == 15000 - assert metadata.config_schema is not None - - -class TestPluginEvaluator: - """Tests for PluginEvaluator base class.""" - - def test_plugin_is_abstract(self): - """Test that PluginEvaluator is an ABC.""" - from abc import ABC - assert issubclass(PluginEvaluator, ABC) - - def test_mock_plugin_metadata(self): - """Test that mock plugin has correct metadata.""" - assert MockPlugin.metadata.name == "mock-plugin" - assert MockPlugin.metadata.version == "1.0.0" - assert MockPlugin.metadata.timeout_ms == 5000 - - @pytest.mark.asyncio - async def test_mock_plugin_evaluate(self): - """Test mock plugin evaluation.""" - plugin = MockPlugin.from_dict({"should_match": True}) - - result = await plugin.evaluate("test data") - - assert result.matched is True - assert result.confidence == 1.0 - assert result.metadata["data"] == "test data" - - @pytest.mark.asyncio - async def test_mock_plugin_evaluate_no_match(self): - """Test mock plugin evaluation without match.""" - plugin = MockPlugin.from_dict({"should_match": False}) - - result = await plugin.evaluate("test data") - - assert result.matched is False - - def test_plugin_config_stored(self): - """Test that plugin stores config.""" - plugin = MockPlugin.from_dict({"should_match": True}) - - assert isinstance(plugin.config, MockConfig) - assert plugin.config.should_match is True - - def 
test_get_timeout_seconds_from_config(self): - """Test timeout conversion from config.""" - plugin = MockPlugin.from_dict({"timeout_ms": 3000}) - - assert plugin.get_timeout_seconds() == 3.0 - - def test_get_timeout_seconds_different_values(self): - """Test timeout with different values.""" - plugin1 = MockPlugin.from_dict({"timeout_ms": 7500}) - plugin2 = MockPlugin.from_dict({"timeout_ms": 1000}) - - assert plugin1.get_timeout_seconds() == 7.5 - assert plugin2.get_timeout_seconds() == 1.0 - - def test_get_timeout_seconds_from_default(self): - """Test timeout uses metadata default when not in config.""" - plugin = MockPlugin.from_dict({}) # No timeout_ms in config - - # MockConfig has default timeout_ms=5000 - assert plugin.get_timeout_seconds() == 5.0 - - def test_cannot_instantiate_abstract_class(self): - """Test that PluginEvaluator cannot be instantiated directly.""" - with pytest.raises(TypeError, match="abstract"): - PluginEvaluator({}) # type: ignore diff --git a/pyproject.toml b/pyproject.toml index 21f14091..40ba0ae5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ # This monorepo contains: # - models/ - Shared data models (published as agent-control-models) # - engine/ - Evaluation engine (bundled into SDK and server) -# - plugins/ - Plugin implementations (bundled into server) +# - evaluators/ - Evaluator implementations (bundled into server) # - sdks/python/ - Python SDK (published as agent-control-sdk) # - server/ - FastAPI server (published as agent-control-server) @@ -14,7 +14,7 @@ description = "Agent Control - protect your AI agents with controls" requires-python = ">=3.12" [tool.uv.workspace] -members = ["models", "server", "sdks/python", "engine", "plugins"] +members = ["models", "server", "sdks/python", "engine", "evaluators"] [tool.uv] # Require resolution to be compatible with the following environments so that CI and local dev @@ -28,7 +28,7 @@ required-environments = [ [tool.uv.sources] agent-control-models = { workspace = 
true } agent-control-engine = { workspace = true } -agent-control-plugins = { workspace = true } +agent-control-evaluators = { workspace = true } [tool.ruff] line-length = 100 diff --git a/scripts/build.py b/scripts/build.py index 1423446e..caf4ae3e 100644 --- a/scripts/build.py +++ b/scripts/build.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Build SDK and server packages with vendored dependencies. -This script copies internal packages (models, engine, plugins) into the SDK and server +This script copies internal packages (models, engine, evaluators) into the SDK and server source directories before building, then cleans up afterward. This allows the published wheels to be self-contained without requiring separate PyPI dependencies. @@ -135,7 +135,7 @@ def build_server() -> None: print(f"Building agent-control-server v{version}") # Clean previous builds and vendored code - for pkg in ["agent_control_models", "agent_control_engine", "agent_control_plugins"]: + for pkg in ["agent_control_models", "agent_control_engine", "agent_control_evaluators"]: target = server_src / pkg if target.exists(): shutil.rmtree(target) @@ -154,8 +154,8 @@ def build_server() -> None: server_src / "agent_control_engine", ) shutil.copytree( - ROOT / "plugins" / "src" / "agent_control_plugins", - server_src / "agent_control_plugins", + ROOT / "evaluators" / "src" / "agent_control_evaluators", + server_src / "agent_control_evaluators", ) # Inject bundle metadata for conflict detection @@ -170,7 +170,7 @@ def build_server() -> None: version, ) inject_bundle_metadata( - server_src / "agent_control_plugins" / "__init__.py", + server_src / "agent_control_evaluators" / "__init__.py", "agent-control-server", version, ) @@ -183,7 +183,7 @@ def build_server() -> None: print(f" Built agent-control-server v{version}") finally: # Clean up vendored code (don't commit it) - for pkg in ["agent_control_models", "agent_control_engine", "agent_control_plugins"]: + for pkg in ["agent_control_models", 
"agent_control_engine", "agent_control_evaluators"]: target = server_src / pkg if target.exists(): shutil.rmtree(target) diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 7268ffab..a443b75e 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -32,7 +32,7 @@ Documentation = "https://github.com/yourusername/agent-control#readme" Repository = "https://github.com/yourusername/agent-control" [project.optional-dependencies] -# Optional: Luna-2 plugin requires additional dependencies +# Optional: Luna-2 evaluator requires additional dependencies luna2 = [ "httpx>=0.24.0", ] @@ -45,7 +45,7 @@ dev = [ "mypy>=1.8.0", "agent-control-models", "agent-control-engine", - "agent-control-plugins", + "agent-control-evaluators", ] [build-system] @@ -76,4 +76,4 @@ known-first-party = ["agent_control"] [tool.uv.sources] agent-control-models = { workspace = true } agent-control-engine = { workspace = true } -agent-control-plugins = { workspace = true } +agent-control-evaluators = { workspace = true } diff --git a/sdks/python/src/agent_control/__init__.py b/sdks/python/src/agent_control/__init__.py index 7a940689..93e336b7 100644 --- a/sdks/python/src/agent_control/__init__.py +++ b/sdks/python/src/agent_control/__init__.py @@ -40,7 +40,7 @@ async def process(input: str) -> str: from typing import TYPE_CHECKING, Any, Literal, TypeVar from uuid import UUID -from . import agents, controls, evaluation, plugins, policies +from . 
import agents, controls, evaluation, evaluators, policies # Import client and operations modules from .client import AgentControlClient @@ -615,7 +615,7 @@ async def main(): "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": r"\\d{3}-\\d{2}-\\d{4}"} }, "action": {"decision": "deny"} @@ -936,7 +936,7 @@ async def main(): "policies", "controls", "evaluation", - "plugins", + "evaluators", # Policy-Control management "add_control_to_policy", diff --git a/sdks/python/src/agent_control/agents.py b/sdks/python/src/agent_control/agents.py index d1b7bd8f..5eb61215 100644 --- a/sdks/python/src/agent_control/agents.py +++ b/sdks/python/src/agent_control/agents.py @@ -3,7 +3,7 @@ from typing import Any, cast from uuid import UUID -from agent_control_engine import ensure_plugins_discovered +from agent_control_engine import ensure_evaluators_discovered from .client import AgentControlClient @@ -40,8 +40,8 @@ async def register_agent( response = await register_agent(client, agent, steps=[...]) print(f"Created: {response['created']}") """ - # Ensure plugins are discovered for local evaluation support - ensure_plugins_discovered() + # Ensure evaluators are discovered for local evaluation support + ensure_evaluators_discovered() if steps is None: steps = [] diff --git a/sdks/python/src/agent_control/control_decorators.py b/sdks/python/src/agent_control/control_decorators.py index 5ac09c3f..b3823717 100644 --- a/sdks/python/src/agent_control/control_decorators.py +++ b/sdks/python/src/agent_control/control_decorators.py @@ -21,7 +21,7 @@ async def chat(message: str) -> str: # The server's policy contains controls that define: # - stage: "pre" or "post" # - selector.path: "input" or "output" - # - evaluator: regex, list, Luna2 plugin, etc. + # - evaluator: regex, list, Luna2 evaluator, etc. 
# - action: deny, warn, or log """ diff --git a/sdks/python/src/agent_control/controls.py b/sdks/python/src/agent_control/controls.py index c6f49783..bc473dcc 100644 --- a/sdks/python/src/agent_control/controls.py +++ b/sdks/python/src/agent_control/controls.py @@ -164,7 +164,7 @@ async def create_control( "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": r"\\d{3}-\\d{2}-\\d{4}"} }, "action": {"decision": "deny"} diff --git a/sdks/python/src/agent_control/evaluation.py b/sdks/python/src/agent_control/evaluation.py index ddab96cc..77e184d6 100644 --- a/sdks/python/src/agent_control/evaluation.py +++ b/sdks/python/src/agent_control/evaluation.py @@ -11,7 +11,7 @@ # Import models if available try: - from agent_control_engine import list_plugins + from agent_control_engine import list_evaluators from agent_control_engine.core import ControlEngine from agent_control_models import ( ControlDefinition, @@ -257,20 +257,20 @@ async def check_evaluation_with_local( try: control_def = ControlDefinition.model_validate(control_data) - # Validate plugin is available locally - plugin_name = control_def.evaluator.plugin - # Agent-scoped plugins (agent:evaluator) are server-only - if ":" in plugin_name: + # Validate evaluator is available locally + evaluator_name = control_def.evaluator.name + # Agent-scoped evaluators (agent:evaluator) are server-only + if ":" in evaluator_name: raise RuntimeError( f"Control '{c['name']}' is marked execution='sdk' but uses " - f"agent-scoped evaluator '{plugin_name}' which is server-only. " - "Set execution='server' or use a built-in plugin." + f"agent-scoped evaluator '{evaluator_name}' which is server-only. " + "Set execution='server' or use a built-in evaluator." 
) - if plugin_name not in list_plugins(): + if evaluator_name not in list_evaluators(): raise RuntimeError( - f"Control '{c['name']}' is marked execution='sdk' but plugin " - f"'{plugin_name}' is not available in the SDK. " - "Install the plugin or set execution='server'." + f"Control '{c['name']}' is marked execution='sdk' but evaluator " + f"'{evaluator_name}' is not available in the SDK. " + "Install the evaluator or set execution='server'." ) local_controls.append(_ControlAdapter( diff --git a/sdks/python/src/agent_control/evaluators/__init__.py b/sdks/python/src/agent_control/evaluators/__init__.py new file mode 100644 index 00000000..6a34537c --- /dev/null +++ b/sdks/python/src/agent_control/evaluators/__init__.py @@ -0,0 +1,56 @@ +"""Evaluator system for agent_control. + +This module provides an evaluator architecture for extending agent_control +with external evaluation systems like Galileo Luna-2, Guardrails AI, etc. + +Evaluator Discovery: + Call `discover_evaluators()` at startup to load evaluators. This loads: + - Built-in evaluators (regex, list, json, sql) from agent_control_evaluators + - Third-party evaluators via the 'agent_control.evaluators' entry point group + + Then use `list_evaluators()` to get available evaluators. 
+ +Luna-2 Evaluator: + When installed with luna2 extras, the Luna-2 types are available: + ```python + from agent_control.evaluators import Luna2Evaluator, Luna2EvaluatorConfig # if luna2 installed + ``` +""" + +from agent_control_engine import ( + discover_evaluators, + ensure_evaluators_discovered, + list_evaluators, +) +from agent_control_models import register_evaluator + +from .base import Evaluator, EvaluatorMetadata + +__all__ = [ + "Evaluator", + "EvaluatorMetadata", + "discover_evaluators", + "ensure_evaluators_discovered", + "list_evaluators", + "register_evaluator", +] + +# Optionally export Luna-2 types when available +try: + from agent_control_evaluators.luna2 import ( # noqa: F401 + LUNA2_AVAILABLE, + Luna2Evaluator, + Luna2EvaluatorConfig, + Luna2Metric, + Luna2Operator, + ) + + __all__.extend([ + "Luna2Evaluator", + "Luna2EvaluatorConfig", + "Luna2Metric", + "Luna2Operator", + "LUNA2_AVAILABLE", + ]) +except ImportError: + pass diff --git a/sdks/python/src/agent_control/evaluators/base.py b/sdks/python/src/agent_control/evaluators/base.py new file mode 100644 index 00000000..38236d81 --- /dev/null +++ b/sdks/python/src/agent_control/evaluators/base.py @@ -0,0 +1,9 @@ +"""Base classes for agent_control evaluators. + +Re-exports from agent_control_models for convenience. +""" + +# Re-export from the models package (where they're defined) +from agent_control_models import Evaluator, EvaluatorMetadata + +__all__ = ["Evaluator", "EvaluatorMetadata"] diff --git a/sdks/python/src/agent_control/plugins/__init__.py b/sdks/python/src/agent_control/plugins/__init__.py deleted file mode 100644 index 8ecbaf12..00000000 --- a/sdks/python/src/agent_control/plugins/__init__.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Plugin system for agent_control. - -This module provides a plugin architecture for extending agent_control -with external evaluation systems like Galileo Luna-2, Guardrails AI, etc. - -Plugin Discovery: - Call `discover_plugins()` at startup to load plugins. 
This loads: - - Built-in plugins (regex, list) from agent_control_plugins - - Third-party plugins via the 'agent_control.plugins' entry point group - - Then use `list_plugins()` to get available plugins. - -Luna-2 Plugin: - When installed with luna2 extras, the Luna-2 types are available: - ```python - from agent_control.plugins import Luna2Plugin, Luna2Config # if luna2 installed - ``` -""" - -from agent_control_engine import ( - discover_plugins, - ensure_plugins_discovered, - list_plugins, -) -from agent_control_models import register_plugin - -from .base import PluginEvaluator, PluginMetadata - -__all__ = [ - "PluginEvaluator", - "PluginMetadata", - "discover_plugins", - "ensure_plugins_discovered", - "list_plugins", - "register_plugin", -] - -# Optionally export Luna-2 types when available -try: - from agent_control_plugins.luna2 import ( # noqa: F401 - LUNA2_AVAILABLE, - Luna2Config, - Luna2Metric, - Luna2Operator, - Luna2Plugin, - ) - - __all__.extend([ - "Luna2Plugin", - "Luna2Config", - "Luna2Metric", - "Luna2Operator", - "LUNA2_AVAILABLE", - ]) -except ImportError: - pass - diff --git a/sdks/python/src/agent_control/plugins/base.py b/sdks/python/src/agent_control/plugins/base.py deleted file mode 100644 index 04c42a25..00000000 --- a/sdks/python/src/agent_control/plugins/base.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Base classes for agent_control plugins. - -Re-exports from agent_control_models for convenience. -""" - -# Re-export from the models package (where they're defined) -from agent_control_models import PluginEvaluator, PluginMetadata - -__all__ = ["PluginEvaluator", "PluginMetadata"] - diff --git a/sdks/python/tests/test_evaluators.py b/sdks/python/tests/test_evaluators.py new file mode 100644 index 00000000..7fcfb5b0 --- /dev/null +++ b/sdks/python/tests/test_evaluators.py @@ -0,0 +1,260 @@ +"""Unit tests for the evaluator system. 
+ +Tests evaluator registration, discovery, and base functionality without +requiring actual evaluator implementations or external services. + +Evaluators take config at __init__, evaluate() only takes data. +Registry is now in agent_control_models, discovery in agent_control_engine. +""" + +import pytest +from unittest.mock import MagicMock, patch + +from pydantic import BaseModel + +from agent_control.evaluators import ( + Evaluator, + EvaluatorMetadata, + discover_evaluators, + list_evaluators, + register_evaluator, +) +from agent_control_models import clear_evaluators +from agent_control_engine.discovery import reset_evaluator_discovery +from agent_control_models.controls import EvaluatorResult + + +class MockConfig(BaseModel): + """Config model for MockEvaluator.""" + threshold: float = 0.5 + + +class MockEvaluator(Evaluator): + """Mock evaluator for testing. + + Config is passed at __init__, not at evaluate(). + """ + + metadata = EvaluatorMetadata( + name="test-mock-evaluator", + version="1.0.0", + description="Mock evaluator for testing", + requires_api_key=False, + timeout_ms=10, + ) + config_model = MockConfig + + def __init__(self, config: dict): + super().__init__(config) + self.threshold = config.get("threshold", 0.5) + + def evaluate(self, data) -> EvaluatorResult: + """Mock evaluation (synchronous).""" + matched = float(data) > self.threshold if isinstance(data, (int, float)) else False + return EvaluatorResult( + matched=matched, + confidence=1.0, + message=f"Mock evaluation: {matched}", + metadata={"threshold": self.threshold}, + ) + + +class TestEvaluatorMetadata: + """Tests for EvaluatorMetadata dataclass.""" + + def test_metadata_creation(self): + """Test creating evaluator metadata.""" + metadata = EvaluatorMetadata( + name="test-evaluator", + version="1.0.0", + description="Test evaluator", + ) + + assert metadata.name == "test-evaluator" + assert metadata.version == "1.0.0" + assert metadata.description == "Test evaluator" + assert 
metadata.requires_api_key is False + assert metadata.timeout_ms == 10000 # Default + + def test_metadata_with_all_fields(self): + """Test metadata with all fields populated.""" + metadata = EvaluatorMetadata( + name="full-evaluator", + version="2.0.0", + description="Full test", + requires_api_key=True, + timeout_ms=5000, + ) + + assert metadata.requires_api_key is True + assert metadata.timeout_ms == 5000 + + +class TestEvaluatorRegistry: + """Tests for evaluator registry functionality.""" + + def setup_method(self): + """Clear registry before each test.""" + # Clear all evaluators and reset discovery + clear_evaluators() + reset_evaluator_discovery() + # Run discovery to load built-in evaluators + discover_evaluators() + + def test_register_evaluator(self): + """Test registering an evaluator.""" + # Register mock evaluator + register_evaluator(MockEvaluator) + + # Verify it's registered + evaluator_class = list_evaluators().get("test-mock-evaluator") + assert evaluator_class is MockEvaluator + + def test_get_nonexistent_evaluator(self): + """Test getting an evaluator that doesn't exist.""" + evaluator_class = list_evaluators().get("nonexistent-evaluator-xyz") + assert evaluator_class is None + + def test_list_evaluators_includes_registered(self): + """Test listing evaluators includes registered evaluators.""" + # Register mock evaluator + register_evaluator(MockEvaluator) + + # List evaluators - now returns dict of evaluator classes + evaluators = list_evaluators() + + assert "test-mock-evaluator" in evaluators + assert evaluators["test-mock-evaluator"] is MockEvaluator + + def test_builtin_evaluators_available(self): + """Test that built-in evaluators are available after discovery.""" + evaluators = list_evaluators() + + assert "regex" in evaluators + assert "list" in evaluators + + def test_register_duplicate_evaluator_raises_error(self): + """Test that registering a different evaluator with same name raises ValueError.""" + # Register evaluator first + 
register_evaluator(MockEvaluator) + + # Create a different class with the same evaluator name + class DuplicateEvaluator(Evaluator): + metadata = EvaluatorMetadata( + name="test-mock-evaluator", # Same name as MockEvaluator + version="2.0.0", + description="Duplicate evaluator", + ) + config_model = MockConfig + + def evaluate(self, data) -> EvaluatorResult: + return EvaluatorResult(matched=False, confidence=1.0, message="duplicate") + + # Second registration with different class should fail + with pytest.raises(ValueError, match="already registered"): + register_evaluator(DuplicateEvaluator) + + def test_re_register_same_evaluator_allowed(self): + """Test that re-registering the same class is allowed (hot reload support).""" + register_evaluator(MockEvaluator) + # Should not raise - same class can be re-registered + result = register_evaluator(MockEvaluator) + assert result is MockEvaluator + + +class TestEvaluatorBase: + """Tests for Evaluator base class.""" + + def test_evaluator_evaluate(self): + """Test synchronous evaluation.""" + # Config is now passed at init + evaluator = MockEvaluator({"threshold": 0.5}) + result = evaluator.evaluate(data=0.8) + + assert isinstance(result, EvaluatorResult) + assert result.matched is True + assert result.confidence == 1.0 + assert "Mock evaluation" in result.message + + def test_evaluator_evaluate_no_match(self): + """Test evaluation when rule doesn't match.""" + evaluator = MockEvaluator({"threshold": 0.5}) + result = evaluator.evaluate(data=0.3) + + assert isinstance(result, EvaluatorResult) + assert result.matched is False + assert result.confidence == 1.0 + + def test_evaluator_with_different_configs(self): + """Test evaluator uses config correctly (set at init).""" + # Create two evaluators with different configs + evaluator_low = MockEvaluator({"threshold": 0.5}) + evaluator_high = MockEvaluator({"threshold": 0.7}) + + # Same data, different thresholds + assert evaluator_low.evaluate(data=0.6).matched is True + 
assert evaluator_high.evaluate(data=0.6).matched is False + + def test_evaluator_metadata_accessible(self): + """Test that evaluator metadata is accessible.""" + evaluator = MockEvaluator({"threshold": 0.5}) + + assert evaluator.metadata.name == "test-mock-evaluator" + assert evaluator.metadata.version == "1.0.0" + assert evaluator.metadata.timeout_ms == 10 + + def test_evaluator_config_stored(self): + """Test that evaluator stores config.""" + config = {"threshold": 0.75, "extra": "value"} + evaluator = MockEvaluator(config) + + assert evaluator.config == config + assert evaluator.threshold == 0.75 + + +class TestEvaluatorDiscovery: + """Tests for evaluator discovery mechanism.""" + + def setup_method(self): + """Reset discovery state before each test.""" + clear_evaluators() + reset_evaluator_discovery() + + def test_discover_evaluators_loads_builtins(self): + """Test that discover_evaluators loads built-in evaluators.""" + discover_evaluators() + + evaluators = list_evaluators() + assert "regex" in evaluators + assert "list" in evaluators + + def test_discover_evaluators_only_runs_once(self): + """Test that discovery only runs once.""" + count1 = discover_evaluators() + count2 = discover_evaluators() + + # Second call should return 0 (already discovered) + assert count2 == 0 + + @patch("agent_control_engine.discovery.entry_points") + def test_discover_evaluators_loads_entry_points(self, mock_entry_points): + """Test loading evaluators via entry points.""" + mock_ep = MagicMock() + mock_ep.name = "custom-evaluator" + mock_ep.load.return_value = MockEvaluator + + mock_entry_points.return_value = [mock_ep] + + discover_evaluators() + + mock_entry_points.assert_called_with(group="agent_control.evaluators") + + def test_ensure_evaluators_discovered_triggers_discovery(self): + """Test that ensure_evaluators_discovered triggers discovery.""" + from agent_control.evaluators import ensure_evaluators_discovered + + ensure_evaluators_discovered() + + evaluators = 
list_evaluators() + assert "regex" in evaluators + assert "list" in evaluators diff --git a/sdks/python/tests/test_local_evaluation.py b/sdks/python/tests/test_local_evaluation.py index cafd0dc6..d39ce141 100644 --- a/sdks/python/tests/test_local_evaluation.py +++ b/sdks/python/tests/test_local_evaluation.py @@ -57,7 +57,7 @@ def make_control_dict( name: str, *, execution: str = "server", - plugin: str = "regex", + evaluator: str = "regex", pattern: str = r"test", action: str = "deny", step_type: str = "llm_inference", @@ -78,7 +78,7 @@ def make_control_dict( "scope": {"step_types": [step_type], "stages": [stage]}, "selector": {"path": path}, "evaluator": { - "plugin": plugin, + "name": evaluator, "config": {"pattern": pattern}, }, "action": {"decision": action}, @@ -564,19 +564,19 @@ async def test_tool_step_local_deny_skips_server(self, agent_uuid, tool_payload) assert result.matches[0].control_name == "local_deny_ctrl" @pytest.mark.asyncio - async def test_local_control_with_missing_plugin_raises(self, agent_uuid, llm_payload): - """Test that local control with unavailable plugin raises RuntimeError. + async def test_local_control_with_missing_evaluator_raises(self, agent_uuid, llm_payload): + """Test that local control with unavailable evaluator raises RuntimeError. 
- Given: A local control referencing a plugin that doesn't exist + Given: A local control referencing an evaluator that doesn't exist When: check_evaluation_with_local is called Then: RuntimeError is raised with helpful message """ controls = [ make_control_dict( 1, - "local_missing_plugin", + "local_missing_evaluator", execution="sdk", - plugin="nonexistent-plugin-xyz", + evaluator="nonexistent-evaluator-xyz", pattern=r"test", ), ] @@ -593,24 +593,24 @@ async def test_local_control_with_missing_plugin_raises(self, agent_uuid, llm_pa controls=controls, ) - assert "local_missing_plugin" in str(exc_info.value) - assert "nonexistent-plugin-xyz" in str(exc_info.value) + assert "local_missing_evaluator" in str(exc_info.value) + assert "nonexistent-evaluator-xyz" in str(exc_info.value) assert "not available" in str(exc_info.value) @pytest.mark.asyncio - async def test_local_control_with_agent_scoped_plugin_raises(self, agent_uuid, llm_payload): - """Test that local control with agent-scoped plugin raises RuntimeError. + async def test_local_control_with_agent_scoped_evaluator_raises(self, agent_uuid, llm_payload): + """Test that local control with agent-scoped evaluator raises RuntimeError. 
- Given: A local control referencing an agent-scoped plugin (agent:evaluator) + Given: A local control referencing an agent-scoped evaluator (agent:evaluator) When: check_evaluation_with_local is called - Then: RuntimeError is raised explaining agent-scoped plugins are server-only + Then: RuntimeError is raised explaining agent-scoped evaluators are server-only """ controls = [ make_control_dict( 1, "local_agent_scoped", execution="sdk", - plugin="my-agent:custom-evaluator", + evaluator="my-agent:custom-evaluator", pattern=r"test", ), ] @@ -632,19 +632,19 @@ async def test_local_control_with_agent_scoped_plugin_raises(self, agent_uuid, l assert "server-only" in str(exc_info.value) @pytest.mark.asyncio - async def test_server_control_with_missing_plugin_allowed(self, agent_uuid, llm_payload): - """Test that server control with unavailable plugin is allowed (server handles it). + async def test_server_control_with_missing_evaluator_allowed(self, agent_uuid, llm_payload): + """Test that server control with unavailable evaluator is allowed (server handles it). 
- Given: A server control (execution="server") referencing a plugin that doesn't exist locally + Given: A server control (execution="server") referencing an evaluator that doesn't exist locally When: check_evaluation_with_local is called Then: No error, server is called to handle it """ controls = [ make_control_dict( 1, - "server_custom_plugin", + "server_custom_evaluator", execution="server", - plugin="server-only-plugin", + evaluator="server-only-evaluator", pattern=r"test", ), ] @@ -657,7 +657,7 @@ async def test_server_control_with_missing_plugin_allowed(self, agent_uuid, llm_ client.http_client = AsyncMock() client.http_client.post = AsyncMock(return_value=mock_response) - # Should not raise - server handles unavailable plugins + # Should not raise - server handles unavailable evaluators result = await check_evaluation_with_local( client=client, agent_uuid=agent_uuid, diff --git a/sdks/python/tests/test_luna2_plugin.py b/sdks/python/tests/test_luna2_evaluator.py similarity index 68% rename from sdks/python/tests/test_luna2_plugin.py rename to sdks/python/tests/test_luna2_evaluator.py index 2ee3f7c8..6c4f4742 100644 --- a/sdks/python/tests/test_luna2_plugin.py +++ b/sdks/python/tests/test_luna2_evaluator.py @@ -1,10 +1,10 @@ -"""Unit tests for the Luna-2 plugin. +"""Unit tests for the Luna-2 evaluator. -These tests mock the HTTP client to test the plugin logic without +These tests mock the HTTP client to test the evaluator logic without requiring actual Galileo API access. -New architecture: Plugins take config at __init__, evaluate() only takes data. -The plugin now uses direct HTTP API calls instead of the galileo SDK. +Evaluators take config at __init__, evaluate() only takes data. +The evaluator uses direct HTTP API calls instead of the galileo SDK. 
""" import os @@ -13,7 +13,7 @@ import pytest from pydantic import ValidationError -from agent_control_models import EvaluatorResult, PluginEvaluator +from agent_control_models import EvaluatorResult, Evaluator def create_mock_protect_response( @@ -23,7 +23,7 @@ def create_mock_protect_response( execution_time: float = 100.0, ) -> MagicMock: """Create a mock ProtectResponse object for testing.""" - from agent_control_plugins.luna2.client import ProtectResponse, TraceMetadata + from agent_control_evaluators.luna2.client import ProtectResponse, TraceMetadata return ProtectResponse( status=status, @@ -39,14 +39,14 @@ def create_mock_protect_response( ) -class TestLuna2Config: - """Tests for Luna2Config Pydantic model.""" +class TestLuna2EvaluatorConfig: + """Tests for Luna2EvaluatorConfig Pydantic model.""" def test_local_stage_config_valid(self): """Test valid local stage configuration.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="local", metric="input_toxicity", operator="gt", @@ -62,9 +62,9 @@ def test_local_stage_config_valid(self): def test_local_stage_config_with_numeric_target(self): """Test local stage configuration with numeric target_value.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="local", metric="input_toxicity", operator="gt", @@ -76,9 +76,9 @@ def test_local_stage_config_with_numeric_target(self): def test_central_stage_config_valid(self): """Test valid central stage configuration.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="central", stage_name="production-guard", galileo_project="my-project", @@ -90,10 +90,10 @@ def 
test_central_stage_config_valid(self): def test_local_stage_requires_metric(self): """Test local stage requires metric field.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="metric.*required"): - Luna2Config( + Luna2EvaluatorConfig( stage_type="local", operator="gt", target_value="0.5", @@ -101,10 +101,10 @@ def test_local_stage_requires_metric(self): def test_local_stage_requires_operator(self): """Test local stage requires operator field.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="operator.*required"): - Luna2Config( + Luna2EvaluatorConfig( stage_type="local", metric="input_toxicity", target_value="0.5", @@ -112,10 +112,10 @@ def test_local_stage_requires_operator(self): def test_local_stage_requires_target_value(self): """Test local stage requires target_value field.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="target_value.*required"): - Luna2Config( + Luna2EvaluatorConfig( stage_type="local", metric="input_toxicity", operator="gt", @@ -123,21 +123,21 @@ def test_local_stage_requires_target_value(self): def test_central_stage_requires_stage_name(self): """Test central stage requires stage_name field.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig with pytest.raises(ValidationError, match="stage_name.*required"): - Luna2Config( + Luna2EvaluatorConfig( stage_type="central", galileo_project="my-project", ) def test_timeout_ms_validation(self): """Test timeout_ms must be within valid range.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig # Too low with 
pytest.raises(ValidationError): - Luna2Config( + Luna2EvaluatorConfig( stage_type="central", stage_name="test", timeout_ms=500, # Below 1000 @@ -145,14 +145,14 @@ def test_timeout_ms_validation(self): # Too high with pytest.raises(ValidationError): - Luna2Config( + Luna2EvaluatorConfig( stage_type="central", stage_name="test", timeout_ms=100000, # Above 60000 ) # Valid - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="central", stage_name="test", timeout_ms=30000, @@ -161,16 +161,16 @@ def test_timeout_ms_validation(self): def test_on_error_validation(self): """Test on_error must be 'allow' or 'deny'.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig - config_allow = Luna2Config( + config_allow = Luna2EvaluatorConfig( stage_type="central", stage_name="test", on_error="allow", ) assert config_allow.on_error == "allow" - config_deny = Luna2Config( + config_deny = Luna2EvaluatorConfig( stage_type="central", stage_name="test", on_error="deny", @@ -178,7 +178,7 @@ def test_on_error_validation(self): assert config_deny.on_error == "deny" with pytest.raises(ValidationError): - Luna2Config( + Luna2EvaluatorConfig( stage_type="central", stage_name="test", on_error="invalid", @@ -186,7 +186,7 @@ def test_on_error_validation(self): def test_metric_validation(self): """Test metric must be a valid Luna2 metric.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig # Valid metrics valid_metrics = [ @@ -198,7 +198,7 @@ def test_metric_validation(self): "tone", ] for metric in valid_metrics: - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="local", metric=metric, operator="gt", @@ -208,7 +208,7 @@ def test_metric_validation(self): # Invalid metric with pytest.raises(ValidationError): - Luna2Config( + Luna2EvaluatorConfig( stage_type="local", metric="invalid_metric", operator="gt", @@ -217,11 +217,11 @@ 
def test_metric_validation(self): def test_operator_validation(self): """Test operator must be a valid Luna2 operator.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig valid_operators = ["gt", "lt", "gte", "lte", "eq", "contains", "any"] for op in valid_operators: - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="local", metric="input_toxicity", operator=op, @@ -230,7 +230,7 @@ def test_operator_validation(self): assert config.operator == op with pytest.raises(ValidationError): - Luna2Config( + Luna2EvaluatorConfig( stage_type="local", metric="input_toxicity", operator="invalid_op", @@ -239,9 +239,9 @@ def test_operator_validation(self): def test_model_dump(self): """Test config can be dumped to dict.""" - from agent_control_plugins.luna2 import Luna2Config + from agent_control_evaluators.luna2 import Luna2EvaluatorConfig - config = Luna2Config( + config = Luna2EvaluatorConfig( stage_type="local", metric="input_toxicity", operator="gt", @@ -259,52 +259,52 @@ def test_model_dump(self): assert "stage_name" not in data # None excluded -class TestLuna2PluginInheritance: - """Tests for Luna-2 plugin inheritance.""" +class TestLuna2EvaluatorInheritance: + """Tests for Luna-2 evaluator inheritance.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) - def test_plugin_extends_evaluator(self): - """Test Luna2Plugin extends PluginEvaluator.""" - from agent_control_plugins.luna2 import Luna2Plugin + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + def test_evaluator_extends_base(self): + """Test Luna2Evaluator extends Evaluator.""" + from agent_control_evaluators.luna2 import Luna2Evaluator - assert issubclass(Luna2Plugin, PluginEvaluator) + assert issubclass(Luna2Evaluator, Evaluator) -class TestLuna2PluginImport: - """Tests for Luna-2 plugin import and initialization.""" +class 
TestLuna2EvaluatorImport: + """Tests for Luna-2 evaluator import and initialization.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) - def test_luna2_plugin_import_success(self): - """Test importing Luna-2 plugin with dependencies available.""" - from agent_control_plugins.luna2 import Luna2Plugin + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + def test_luna2_evaluator_import_success(self): + """Test importing Luna-2 evaluator with dependencies available.""" + from agent_control_evaluators.luna2 import Luna2Evaluator - assert Luna2Plugin is not None - assert Luna2Plugin.metadata.name == "galileo-luna2" - assert Luna2Plugin.metadata.version == "2.0.0" + assert Luna2Evaluator is not None + assert Luna2Evaluator.metadata.name == "galileo-luna2" + assert Luna2Evaluator.metadata.version == "2.0.0" - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", False) - def test_luna2_plugin_is_available_false_without_httpx(self): + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", False) + def test_luna2_evaluator_is_available_false_without_httpx(self): """Test that is_available() returns False when httpx is not installed.""" - from agent_control_plugins.luna2 import Luna2Plugin + from agent_control_evaluators.luna2 import Luna2Evaluator # When httpx is not available, is_available() should return False - assert Luna2Plugin.is_available() is False + assert Luna2Evaluator.is_available() is False - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) - def test_luna2_plugin_is_available_true_with_httpx(self): + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) + def test_luna2_evaluator_is_available_true_with_httpx(self): """Test that is_available() returns True when httpx is installed.""" - from agent_control_plugins.luna2 import Luna2Plugin + from agent_control_evaluators.luna2 import Luna2Evaluator # When 
httpx is available, is_available() should return True - assert Luna2Plugin.is_available() is True + assert Luna2Evaluator.is_available() is True - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) @patch.dict(os.environ, {}, clear=True) - def test_luna2_plugin_init_without_api_key_raises_error(self): + def test_luna2_evaluator_init_without_api_key_raises_error(self): """Test that initializing without API key raises ValueError.""" - from agent_control_plugins.luna2 import Luna2Plugin + from agent_control_evaluators.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -314,33 +314,33 @@ def test_luna2_plugin_init_without_api_key_raises_error(self): } with pytest.raises(ValueError, match="GALILEO_API_KEY"): - Luna2Plugin.from_dict(config) + Luna2Evaluator.from_dict(config) -class TestLuna2PluginMetadata: - """Tests for Luna-2 plugin metadata.""" +class TestLuna2EvaluatorMetadata: + """Tests for Luna-2 evaluator metadata.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) def test_metadata_fields(self): - """Test Luna-2 plugin metadata fields.""" - from agent_control_plugins.luna2 import Luna2Plugin + """Test Luna-2 evaluator metadata fields.""" + from agent_control_evaluators.luna2 import Luna2Evaluator - metadata = Luna2Plugin.metadata + metadata = Luna2Evaluator.metadata assert metadata.name == "galileo-luna2" assert metadata.requires_api_key is True assert metadata.timeout_ms == 10000 # Config schema is now from config_model - assert Luna2Plugin.config_model is not None + assert Luna2Evaluator.config_model is not None @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", 
True) def test_config_schema_supported_metrics(self): """Test config schema includes all supported metrics.""" - from agent_control_plugins.luna2 import Luna2Plugin + from agent_control_evaluators.luna2 import Luna2Evaluator - schema = Luna2Plugin.config_model.model_json_schema() + schema = Luna2Evaluator.config_model.model_json_schema() # Pydantic uses anyOf with const for Literal types metric_def = schema.get("$defs", {}).get("Luna2Metric", {}) if "enum" in metric_def: @@ -360,16 +360,16 @@ def test_config_schema_supported_metrics(self): assert "metric" in schema["properties"] -class TestLuna2PluginLocalStage: - """Tests for Luna-2 plugin with local stages.""" +class TestLuna2EvaluatorLocalStage: + """Tests for Luna-2 evaluator with local stages.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_local_stage_triggered(self): """Test local stage evaluation when rule is triggered.""" - from agent_control_plugins.luna2 import Luna2Plugin - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.luna2.client import GalileoProtectClient # Create mock response with triggered status mock_response = create_mock_protect_response( @@ -386,7 +386,7 @@ async def test_local_stage_triggered(self): "galileo_project": "test-project", } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) # Mock the client's invoke_protect method with patch.object( @@ -394,7 +394,7 @@ async def test_local_stage_triggered(self): ) as mock_invoke: mock_invoke.return_value = mock_response - result = await plugin.evaluate(data="toxic content here") + result = await evaluator.evaluate(data="toxic content here") assert isinstance(result, EvaluatorResult) assert result.matched is 
True @@ -404,12 +404,12 @@ async def test_local_stage_triggered(self): assert result.metadata["status"] == "triggered" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_local_stage_not_triggered(self): """Test local stage evaluation when rule is not triggered.""" - from agent_control_plugins.luna2 import Luna2Plugin - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.luna2.client import GalileoProtectClient mock_response = create_mock_protect_response( status="not_triggered", @@ -425,26 +425,26 @@ async def test_local_stage_not_triggered(self): "galileo_project": "test-project", } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) with patch.object( GalileoProtectClient, "invoke_protect", new_callable=AsyncMock ) as mock_invoke: mock_invoke.return_value = mock_response - result = await plugin.evaluate(data="hello world") + result = await evaluator.evaluate(data="hello world") assert result.matched is False assert result.confidence == 0.0 assert result.metadata["status"] == "not_triggered" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_local_stage_with_timeout_ms(self): """Test local stage respects timeout_ms configuration.""" - from agent_control_plugins.luna2 import Luna2Plugin - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.luna2.client import GalileoProtectClient mock_response = create_mock_protect_response() @@ -457,14 +457,14 @@ async 
def test_local_stage_with_timeout_ms(self): "timeout_ms": 5000, } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) with patch.object( GalileoProtectClient, "invoke_protect", new_callable=AsyncMock ) as mock_invoke: mock_invoke.return_value = mock_response - await plugin.evaluate(data="test") + await evaluator.evaluate(data="test") # Check that invoke_protect was called with correct timeout mock_invoke.assert_called_once() @@ -472,16 +472,16 @@ async def test_local_stage_with_timeout_ms(self): assert call_kwargs["timeout"] == 5.0 -class TestLuna2PluginCentralStage: - """Tests for Luna-2 plugin with central stages.""" +class TestLuna2EvaluatorCentralStage: + """Tests for Luna-2 evaluator with central stages.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_central_stage_evaluation(self): """Test central stage evaluation.""" - from agent_control_plugins.luna2 import Luna2Plugin - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.luna2.client import GalileoProtectClient mock_response = create_mock_protect_response( status="triggered", @@ -496,25 +496,25 @@ async def test_central_stage_evaluation(self): "galileo_project": "prod-project", } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) with patch.object( GalileoProtectClient, "invoke_protect", new_callable=AsyncMock ) as mock_invoke: mock_invoke.return_value = mock_response - result = await plugin.evaluate(data="test input") + result = await evaluator.evaluate(data="test input") assert result.matched is True assert result.metadata["status"] == "triggered" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - 
@patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_central_stage_without_version(self): """Test central stage without pinned version.""" - from agent_control_plugins.luna2 import Luna2Plugin - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.luna2.client import GalileoProtectClient mock_response = create_mock_protect_response(trace_id="trace-latest") @@ -524,28 +524,28 @@ async def test_central_stage_without_version(self): "galileo_project": "prod-project", } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) with patch.object( GalileoProtectClient, "invoke_protect", new_callable=AsyncMock ) as mock_invoke: mock_invoke.return_value = mock_response - await plugin.evaluate(data="test") + await evaluator.evaluate(data="test") mock_invoke.assert_called_once() call_kwargs = mock_invoke.call_args.kwargs assert call_kwargs["stage_name"] == "latest-protection" -class TestLuna2PluginPayloadPreparation: +class TestLuna2EvaluatorPayloadPreparation: """Tests for payload preparation logic.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) def test_input_metric_payload(self): """Test payload for input metrics uses _prepare_payload correctly.""" - from agent_control_plugins.luna2 import Luna2Plugin + from agent_control_evaluators.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -554,18 +554,18 @@ def test_input_metric_payload(self): "target_value": 0.8, } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) # Test the _prepare_payload method directly - payload = plugin._prepare_payload("user input text") + 
payload = evaluator._prepare_payload("user input text") assert payload.input == "user input text" assert payload.output == "" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) def test_output_metric_payload(self): """Test payload for output metrics uses _prepare_payload correctly.""" - from agent_control_plugins.luna2 import Luna2Plugin + from agent_control_evaluators.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -574,18 +574,18 @@ def test_output_metric_payload(self): "target_value": 0.7, } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) # Test the _prepare_payload method directly - payload = plugin._prepare_payload("llm output text") + payload = evaluator._prepare_payload("llm output text") assert payload.input == "" assert payload.output == "llm output text" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) def test_payload_field_override(self): """Test explicit payload_field configuration.""" - from agent_control_plugins.luna2 import Luna2Plugin + from agent_control_evaluators.luna2 import Luna2Evaluator config = { "stage_type": "central", @@ -593,24 +593,24 @@ def test_payload_field_override(self): "payload_field": "output", } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) # Test the _prepare_payload method directly - payload = plugin._prepare_payload("some data") + payload = evaluator._prepare_payload("some data") assert payload.input == "" assert payload.output == "some data" -class TestLuna2PluginErrorHandling: - """Tests for error handling in Luna-2 plugin.""" +class TestLuna2EvaluatorErrorHandling: + """Tests for error handling in Luna-2 evaluator.""" 
@patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_error_with_fail_open(self): """Test error handling with fail open (default).""" - from agent_control_plugins.luna2 import Luna2Plugin - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.luna2.client import GalileoProtectClient config = { "stage_type": "local", @@ -620,14 +620,14 @@ async def test_error_with_fail_open(self): "on_error": "allow", } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) with patch.object( GalileoProtectClient, "invoke_protect", new_callable=AsyncMock ) as mock_invoke: mock_invoke.side_effect = Exception("Luna-2 API unavailable") - result = await plugin.evaluate(data="test") + result = await evaluator.evaluate(data="test") assert result.matched is False assert result.confidence == 0.0 @@ -635,12 +635,12 @@ async def test_error_with_fail_open(self): assert result.metadata["fallback_action"] == "allow" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_error_with_fail_closed(self): """Test error handling with fail closed.""" - from agent_control_plugins.luna2 import Luna2Plugin - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.luna2.client import GalileoProtectClient config = { "stage_type": "local", @@ -650,14 +650,14 @@ async def test_error_with_fail_closed(self): "on_error": "deny", } - plugin = Luna2Plugin.from_dict(config) + evaluator = 
Luna2Evaluator.from_dict(config) with patch.object( GalileoProtectClient, "invoke_protect", new_callable=AsyncMock ) as mock_invoke: mock_invoke.side_effect = Exception("Luna-2 API error") - result = await plugin.evaluate(data="test") + result = await evaluator.evaluate(data="test") assert result.matched is True assert result.confidence == 0.0 @@ -665,12 +665,12 @@ async def test_error_with_fail_closed(self): assert result.metadata["fallback_action"] == "deny" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) @pytest.mark.asyncio async def test_empty_response_handling(self): """Test handling of empty/None response.""" - from agent_control_plugins.luna2 import Luna2Plugin - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2 import Luna2Evaluator + from agent_control_evaluators.luna2.client import GalileoProtectClient config = { "stage_type": "local", @@ -679,28 +679,28 @@ async def test_empty_response_handling(self): "target_value": 0.8, } - plugin = Luna2Plugin.from_dict(config) + evaluator = Luna2Evaluator.from_dict(config) with patch.object( GalileoProtectClient, "invoke_protect", new_callable=AsyncMock ) as mock_invoke: mock_invoke.return_value = None - result = await plugin.evaluate(data="test") + result = await evaluator.evaluate(data="test") assert result.matched is False assert "No response from Luna-2" in result.message assert result.metadata["error"] == "empty_response" -class TestLuna2PluginTimeoutHelper: +class TestLuna2EvaluatorTimeoutHelper: """Tests for timeout helper method.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) def test_get_timeout_from_config(self): """Test timeout conversion from 
config.""" - from agent_control_plugins.luna2 import Luna2Plugin + from agent_control_evaluators.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -710,14 +710,14 @@ def test_get_timeout_from_config(self): "timeout_ms": 5000, } - plugin = Luna2Plugin.from_dict(config) - assert plugin.get_timeout_seconds() == 5.0 + evaluator = Luna2Evaluator.from_dict(config) + assert evaluator.get_timeout_seconds() == 5.0 @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) def test_get_timeout_from_default(self): """Test timeout uses metadata default.""" - from agent_control_plugins.luna2 import Luna2Plugin + from agent_control_evaluators.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -727,18 +727,18 @@ def test_get_timeout_from_default(self): # No timeout_ms - should use default } - plugin = Luna2Plugin.from_dict(config) - assert plugin.get_timeout_seconds() == 10.0 # Default from metadata + evaluator = Luna2Evaluator.from_dict(config) + assert evaluator.get_timeout_seconds() == 10.0 # Default from metadata -class TestLuna2PluginNumericTargetValue: +class TestLuna2EvaluatorNumericTargetValue: """Tests for numeric target_value handling.""" @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) def test_numeric_target_value_float(self): - """Test plugin accepts float target_value.""" - from agent_control_plugins.luna2 import Luna2Plugin + """Test evaluator accepts float target_value.""" + from agent_control_evaluators.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -747,14 +747,14 @@ def test_numeric_target_value_float(self): "target_value": 0.5, } - plugin = Luna2Plugin.from_dict(config) - assert plugin._get_numeric_target_value() == 0.5 + 
evaluator = Luna2Evaluator.from_dict(config) + assert evaluator._get_numeric_target_value() == 0.5 @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) def test_numeric_target_value_int(self): - """Test plugin accepts int target_value.""" - from agent_control_plugins.luna2 import Luna2Plugin + """Test evaluator accepts int target_value.""" + from agent_control_evaluators.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -763,14 +763,14 @@ def test_numeric_target_value_int(self): "target_value": 1, } - plugin = Luna2Plugin.from_dict(config) - assert plugin._get_numeric_target_value() == 1 + evaluator = Luna2Evaluator.from_dict(config) + assert evaluator._get_numeric_target_value() == 1 @patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}) - @patch("agent_control_plugins.luna2.plugin.LUNA2_AVAILABLE", True) + @patch("agent_control_evaluators.luna2.evaluator.LUNA2_AVAILABLE", True) def test_string_target_value_converts_to_float(self): - """Test plugin converts string target_value to float.""" - from agent_control_plugins.luna2 import Luna2Plugin + """Test evaluator converts string target_value to float.""" + from agent_control_evaluators.luna2 import Luna2Evaluator config = { "stage_type": "local", @@ -779,8 +779,8 @@ def test_string_target_value_converts_to_float(self): "target_value": "0.75", } - plugin = Luna2Plugin.from_dict(config) - assert plugin._get_numeric_target_value() == 0.75 + evaluator = Luna2Evaluator.from_dict(config) + assert evaluator._get_numeric_target_value() == 0.75 class TestGalileoProtectClient: @@ -788,7 +788,7 @@ class TestGalileoProtectClient: def test_client_init_with_api_key(self): """Test client initialization with API key.""" - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2.client import GalileoProtectClient with 
patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): client = GalileoProtectClient() @@ -796,7 +796,7 @@ def test_client_init_with_api_key(self): def test_client_init_without_api_key_raises(self): """Test client raises error without API key.""" - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2.client import GalileoProtectClient with patch.dict(os.environ, {}, clear=True): with pytest.raises(ValueError, match="GALILEO_API_KEY"): @@ -804,7 +804,7 @@ def test_client_init_without_api_key_raises(self): def test_derive_api_url_from_console_url(self): """Test API URL derivation from console URL.""" - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2.client import GalileoProtectClient with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): client = GalileoProtectClient( @@ -814,7 +814,7 @@ def test_derive_api_url_from_console_url(self): def test_derive_api_url_default(self): """Test default API URL.""" - from agent_control_plugins.luna2.client import GalileoProtectClient + from agent_control_evaluators.luna2.client import GalileoProtectClient with patch.dict(os.environ, {"GALILEO_API_KEY": "test-key"}): client = GalileoProtectClient() @@ -826,14 +826,14 @@ class TestPayloadDataClasses: def test_payload_to_dict(self): """Test Payload.to_dict() method.""" - from agent_control_plugins.luna2.client import Payload + from agent_control_evaluators.luna2.client import Payload payload = Payload(input="test input", output="test output") assert payload.to_dict() == {"input": "test input", "output": "test output"} def test_rule_to_dict(self): """Test Rule.to_dict() method.""" - from agent_control_plugins.luna2.client import Rule + from agent_control_evaluators.luna2.client import Rule rule = Rule(metric="input_toxicity", operator="gt", target_value=0.5) assert rule.to_dict() == { @@ -844,7 +844,7 @@ def test_rule_to_dict(self): def test_ruleset_to_dict(self): 
"""Test Ruleset.to_dict() method.""" - from agent_control_plugins.luna2.client import PassthroughAction, Rule, Ruleset + from agent_control_evaluators.luna2.client import PassthroughAction, Rule, Ruleset ruleset = Ruleset( rules=[Rule(metric="input_toxicity", operator="gt", target_value=0.5)], @@ -858,7 +858,7 @@ def test_ruleset_to_dict(self): def test_protect_response_from_dict(self): """Test ProtectResponse.from_dict() method.""" - from agent_control_plugins.luna2.client import ProtectResponse + from agent_control_evaluators.luna2.client import ProtectResponse data = { "status": "triggered", diff --git a/sdks/python/tests/test_plugins.py b/sdks/python/tests/test_plugins.py deleted file mode 100644 index 22e552ce..00000000 --- a/sdks/python/tests/test_plugins.py +++ /dev/null @@ -1,260 +0,0 @@ -"""Unit tests for the plugin system. - -Tests plugin registration, discovery, and base functionality without -requiring actual plugin implementations or external services. - -New architecture: Plugins take config at __init__, evaluate() only takes data. -Registry is now in agent_control_models, discovery in agent_control_engine. -""" - -import pytest -from unittest.mock import MagicMock, patch - -from pydantic import BaseModel - -from agent_control.plugins import ( - PluginEvaluator, - PluginMetadata, - discover_plugins, - list_plugins, - register_plugin, -) -from agent_control_models import clear_plugins -from agent_control_engine.discovery import reset_discovery -from agent_control_models.controls import EvaluatorResult - - -class MockConfig(BaseModel): - """Config model for MockPlugin.""" - threshold: float = 0.5 - - -class MockPlugin(PluginEvaluator): - """Mock plugin for testing. - - New pattern: config is passed at __init__, not at evaluate(). 
- """ - - metadata = PluginMetadata( - name="test-mock-plugin", - version="1.0.0", - description="Mock plugin for testing", - requires_api_key=False, - timeout_ms=10, - ) - config_model = MockConfig - - def __init__(self, config: dict): - super().__init__(config) - self.threshold = config.get("threshold", 0.5) - - def evaluate(self, data) -> EvaluatorResult: - """Mock evaluation (synchronous).""" - matched = float(data) > self.threshold if isinstance(data, (int, float)) else False - return EvaluatorResult( - matched=matched, - confidence=1.0, - message=f"Mock evaluation: {matched}", - metadata={"threshold": self.threshold}, - ) - - -class TestPluginMetadata: - """Tests for PluginMetadata dataclass.""" - - def test_metadata_creation(self): - """Test creating plugin metadata.""" - metadata = PluginMetadata( - name="test-plugin", - version="1.0.0", - description="Test plugin", - ) - - assert metadata.name == "test-plugin" - assert metadata.version == "1.0.0" - assert metadata.description == "Test plugin" - assert metadata.requires_api_key is False - assert metadata.timeout_ms == 10000 # Default - - def test_metadata_with_all_fields(self): - """Test metadata with all fields populated.""" - metadata = PluginMetadata( - name="full-plugin", - version="2.0.0", - description="Full test", - requires_api_key=True, - timeout_ms=5000, - ) - - assert metadata.requires_api_key is True - assert metadata.timeout_ms == 5000 - - -class TestPluginRegistry: - """Tests for plugin registry functionality.""" - - def setup_method(self): - """Clear registry before each test.""" - # Clear all plugins and reset discovery - clear_plugins() - reset_discovery() - # Run discovery to load built-in plugins - discover_plugins() - - def test_register_plugin(self): - """Test registering a plugin.""" - # Register mock plugin - register_plugin(MockPlugin) - - # Verify it's registered - plugin_class = list_plugins().get("test-mock-plugin") - assert plugin_class is MockPlugin - - def 
test_get_nonexistent_plugin(self): - """Test getting a plugin that doesn't exist.""" - plugin_class = list_plugins().get("nonexistent-plugin-xyz") - assert plugin_class is None - - def test_list_plugins_includes_registered(self): - """Test listing plugins includes registered plugins.""" - # Register mock plugin - register_plugin(MockPlugin) - - # List plugins - now returns dict of plugin classes - plugins = list_plugins() - - assert "test-mock-plugin" in plugins - assert plugins["test-mock-plugin"] is MockPlugin - - def test_builtin_plugins_available(self): - """Test that built-in plugins are available after discovery.""" - plugins = list_plugins() - - assert "regex" in plugins - assert "list" in plugins - - def test_register_duplicate_plugin_raises_error(self): - """Test that registering a different plugin with same name raises ValueError.""" - # Register plugin first - register_plugin(MockPlugin) - - # Create a different class with the same plugin name - class DuplicatePlugin(PluginEvaluator): - metadata = PluginMetadata( - name="test-mock-plugin", # Same name as MockPlugin - version="2.0.0", - description="Duplicate plugin", - ) - config_model = MockConfig - - def evaluate(self, data) -> EvaluatorResult: - return EvaluatorResult(matched=False, confidence=1.0, message="duplicate") - - # Second registration with different class should fail - with pytest.raises(ValueError, match="already registered"): - register_plugin(DuplicatePlugin) - - def test_re_register_same_plugin_allowed(self): - """Test that re-registering the same class is allowed (hot reload support).""" - register_plugin(MockPlugin) - # Should not raise - same class can be re-registered - result = register_plugin(MockPlugin) - assert result is MockPlugin - - -class TestPluginEvaluator: - """Tests for PluginEvaluator base class.""" - - def test_plugin_evaluate(self): - """Test synchronous evaluation.""" - # Config is now passed at init - plugin = MockPlugin({"threshold": 0.5}) - result = 
plugin.evaluate(data=0.8) - - assert isinstance(result, EvaluatorResult) - assert result.matched is True - assert result.confidence == 1.0 - assert "Mock evaluation" in result.message - - def test_plugin_evaluate_no_match(self): - """Test evaluation when rule doesn't match.""" - plugin = MockPlugin({"threshold": 0.5}) - result = plugin.evaluate(data=0.3) - - assert isinstance(result, EvaluatorResult) - assert result.matched is False - assert result.confidence == 1.0 - - def test_plugin_with_different_configs(self): - """Test plugin uses config correctly (set at init).""" - # Create two plugins with different configs - plugin_low = MockPlugin({"threshold": 0.5}) - plugin_high = MockPlugin({"threshold": 0.7}) - - # Same data, different thresholds - assert plugin_low.evaluate(data=0.6).matched is True - assert plugin_high.evaluate(data=0.6).matched is False - - def test_plugin_metadata_accessible(self): - """Test that plugin metadata is accessible.""" - plugin = MockPlugin({"threshold": 0.5}) - - assert plugin.metadata.name == "test-mock-plugin" - assert plugin.metadata.version == "1.0.0" - assert plugin.metadata.timeout_ms == 10 - - def test_plugin_config_stored(self): - """Test that plugin stores config.""" - config = {"threshold": 0.75, "extra": "value"} - plugin = MockPlugin(config) - - assert plugin.config == config - assert plugin.threshold == 0.75 - - -class TestPluginDiscovery: - """Tests for plugin discovery mechanism.""" - - def setup_method(self): - """Reset discovery state before each test.""" - clear_plugins() - reset_discovery() - - def test_discover_plugins_loads_builtins(self): - """Test that discover_plugins loads built-in plugins.""" - discover_plugins() - - plugins = list_plugins() - assert "regex" in plugins - assert "list" in plugins - - def test_discover_plugins_only_runs_once(self): - """Test that discovery only runs once.""" - count1 = discover_plugins() - count2 = discover_plugins() - - # Second call should return 0 (already discovered) - 
assert count2 == 0 - - @patch("agent_control_engine.discovery.entry_points") - def test_discover_plugins_loads_entry_points(self, mock_entry_points): - """Test loading plugins via entry points.""" - mock_ep = MagicMock() - mock_ep.name = "custom-plugin" - mock_ep.load.return_value = MockPlugin - - mock_entry_points.return_value = [mock_ep] - - discover_plugins() - - mock_entry_points.assert_called_with(group="agent_control.plugins") - - def test_ensure_plugins_discovered_triggers_discovery(self): - """Test that ensure_plugins_discovered triggers discovery.""" - from agent_control.plugins import ensure_plugins_discovered - - ensure_plugins_discovered() - - plugins = list_plugins() - assert "regex" in plugins - assert "list" in plugins diff --git a/server/Dockerfile b/server/Dockerfile index eb3a5bc9..2dcf1f49 100644 --- a/server/Dockerfile +++ b/server/Dockerfile @@ -20,7 +20,7 @@ COPY pyproject.toml uv.lock* ./ # Copy shared dependencies (Workspaces) COPY models/ models/ COPY engine/ engine/ -COPY plugins/ plugins/ +COPY evaluators/ evaluators/ # Copy server application COPY server/ server/ diff --git a/server/README.md b/server/README.md index c75d93ce..6e7da4e7 100644 --- a/server/README.md +++ b/server/README.md @@ -7,10 +7,10 @@ FastAPI server for Agent Control - provides centralized control management, poli - **Control Management** - CRUD operations for controls - **Policy Management** - Group controls into reusable policies - **Agent Registration** - Register and manage agents -- **Evaluation Engine** - Server-side control evaluation with plugin support +- **Evaluation Engine** - Server-side control evaluation with evaluator support - **Observability** - Event tracking and control execution metrics - **API Key Authentication** - Secure production deployments -- **Plugin System** - Extensible evaluators (Regex, List, SQL, Luna-2 AI) +- **Evaluator System** - Extensible evaluators (Regex, List, SQL, Luna-2 AI) - **Prometheus Metrics** - Built-in monitoring and 
observability - **PostgreSQL/SQLite** - Production and development database support @@ -71,7 +71,7 @@ AGENT_CONTROL_API_KEYS=your-api-key-here,another-key-here OBSERVABILITY_ENABLED=true OBSERVABILITY_FLUSH_INTERVAL_SECONDS=10 -# Luna-2 Plugin (optional) +# Luna-2 Evaluator (optional) GALILEO_API_KEY=your-galileo-api-key # Prometheus metrics @@ -147,8 +147,8 @@ GET /health # Prometheus metrics (public) GET /metrics -# List available plugins -GET /api/v1/plugins +# List available evaluators +GET /api/v1/evaluators ``` ### Agent Management diff --git a/server/alembic/versions/c8d9e0f1a2b3_rename_plugin_to_evaluator.py b/server/alembic/versions/c8d9e0f1a2b3_rename_plugin_to_evaluator.py new file mode 100644 index 00000000..c4cf6adc --- /dev/null +++ b/server/alembic/versions/c8d9e0f1a2b3_rename_plugin_to_evaluator.py @@ -0,0 +1,48 @@ +"""Rename plugin column to evaluator in evaluator_configs table. + +Revision ID: c8d9e0f1a2b3 +Revises: b7c9d8e1f2a3 +Create Date: 2026-01-28 00:00:00.000000 + +""" +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "c8d9e0f1a2b3" +down_revision = "b7c9d8e1f2a3" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Rename plugin column to evaluator + op.alter_column( + "evaluator_configs", + "plugin", + new_column_name="evaluator", + ) + # Rename the index + op.drop_index("ix_evaluator_configs_plugin", table_name="evaluator_configs") + op.create_index( + "ix_evaluator_configs_evaluator", + "evaluator_configs", + ["evaluator"], + unique=False, + ) + + +def downgrade() -> None: + # Rename evaluator column back to plugin + op.alter_column( + "evaluator_configs", + "evaluator", + new_column_name="plugin", + ) + # Rename the index back + op.drop_index("ix_evaluator_configs_evaluator", table_name="evaluator_configs") + op.create_index( + "ix_evaluator_configs_plugin", + "evaluator_configs", + ["plugin"], + unique=False, + ) diff --git a/server/pyproject.toml b/server/pyproject.toml index 17de26dd..3e10902d 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -3,7 +3,7 @@ name = "agent-control-server" version = "2.1.0" description = "Server for Agent Control - manage and evaluate controls for AI agents" requires-python = ">=3.12" -# Note: agent-control-models, agent-control-engine, and agent-control-plugins are bundled at build time +# Note: agent-control-models, agent-control-engine, and agent-control-evaluators are bundled at build time dependencies = [ "fastapi>=0.109.0", "starlette-exporter>=0.23.0", @@ -18,7 +18,7 @@ dependencies = [ "jsonschema>=4.25.1", "jsonschema-rs>=0.22.0", "google-re2>=1.1", # For engine (bundled) - "sqlglot[rs]>=20.0.0", # For SQL plugin (bundled) + "sqlglot[rs]>=20.0.0", # For SQL evaluator (bundled) ] authors = [ {name = "Agent Control Team"} @@ -41,7 +41,7 @@ dev = [ "types-jsonschema>=4.23.0", "agent-control-models", "agent-control-engine", - "agent-control-plugins", + "agent-control-evaluators", ] [project.scripts] @@ -54,8 +54,8 @@ requires = ["hatchling"] build-backend = "hatchling.build" 
[tool.hatch.build.targets.wheel] -# Note: agent_control_models, agent_control_engine, and agent_control_plugins are copied by scripts/build.py -packages = ["src/agent_control_server", "src/agent_control_models", "src/agent_control_engine", "src/agent_control_plugins"] +# Note: agent_control_models, agent_control_engine, and agent_control_evaluators are copied by scripts/build.py +packages = ["src/agent_control_server", "src/agent_control_models", "src/agent_control_engine", "src/agent_control_evaluators"] [tool.pytest.ini_options] asyncio_mode = "auto" @@ -81,4 +81,4 @@ known-first-party = ["agent_control_server"] [tool.uv.sources] agent-control-models = { workspace = true } agent-control-engine = { workspace = true } -agent-control-plugins = { workspace = true } +agent-control-evaluators = { workspace = true } diff --git a/server/src/agent_control_server/auth.py b/server/src/agent_control_server/auth.py index 902a4e27..623d3906 100644 --- a/server/src/agent_control_server/auth.py +++ b/server/src/agent_control_server/auth.py @@ -191,7 +191,7 @@ async def require_admin_key( """ Dependency that requires an admin API key. 
- Use for sensitive operations like plugin management or configuration: + Use for sensitive operations like evaluator management or configuration: @router.delete("/dangerous", dependencies=[Depends(require_admin_key)]) async def dangerous_op(): diff --git a/server/src/agent_control_server/endpoints/agents.py b/server/src/agent_control_server/endpoints/agents.py index 18e538d8..32c7a027 100644 --- a/server/src/agent_control_server/endpoints/agents.py +++ b/server/src/agent_control_server/endpoints/agents.py @@ -1,7 +1,7 @@ from typing import Any from uuid import UUID -from agent_control_engine import list_plugins +from agent_control_engine import list_evaluators from agent_control_models.agent import Agent as APIAgent from agent_control_models.agent import StepSchema from agent_control_models.errors import ErrorCode, ValidationErrorItem @@ -53,8 +53,8 @@ _logger = get_logger(__name__) -# Cache for built-in plugin names (populated on first use) -_BUILTIN_PLUGIN_NAMES: set[str] | None = None +# Cache for built-in evaluator names (populated on first use) +_BUILTIN_EVALUATOR_NAMES: set[str] | None = None # Pagination constants _DEFAULT_PAGINATION_OFFSET = 0 @@ -69,12 +69,12 @@ # ============================================================================= -def _get_builtin_plugin_names() -> set[str]: - """Get built-in plugin names (cached).""" - global _BUILTIN_PLUGIN_NAMES - if _BUILTIN_PLUGIN_NAMES is None: - _BUILTIN_PLUGIN_NAMES = set(list_plugins().keys()) - return _BUILTIN_PLUGIN_NAMES +def _get_builtin_evaluator_names() -> set[str]: + """Get built-in evaluator names (cached).""" + global _BUILTIN_EVALUATOR_NAMES + if _BUILTIN_EVALUATOR_NAMES is None: + _BUILTIN_EVALUATOR_NAMES = set(list_evaluators().keys()) + return _BUILTIN_EVALUATOR_NAMES async def _validate_policy_controls_for_agent( @@ -107,18 +107,18 @@ async def _validate_policy_controls_for_agent( continue evaluator_cfg = control.data.get("evaluator", {}) - plugin = evaluator_cfg.get("plugin", "") - if 
not plugin: + evaluator_name = evaluator_cfg.get("name", "") + if not evaluator_name: continue - agent_name, eval_name = parse_evaluator_ref(plugin) + agent_name, eval_name = parse_evaluator_ref(evaluator_name) if agent_name is None: - continue # Built-in plugin, already validated at control creation + continue # Built-in evaluator, already validated at control creation # Agent-scoped evaluator - check if target matches this agent if agent_name != agent.name: errors.append( - f"Control '{control.name}' references evaluator '{plugin}' " + f"Control '{control.name}' references evaluator '{evaluator_name}' " f"which belongs to agent '{agent_name}', not '{agent.name}'" ) continue @@ -309,22 +309,22 @@ async def init_agent( HTTPException 409: Agent name exists with different UUID HTTPException 500: Database error during creation/update """ - # Check for evaluator name collisions with built-in plugins - builtin_names = _get_builtin_plugin_names() + # Check for evaluator name collisions with built-in evaluators + builtin_names = _get_builtin_evaluator_names() for ev in request.evaluators: if ev.name in builtin_names: raise ConflictError( error_code=ErrorCode.EVALUATOR_NAME_CONFLICT, - detail=f"Evaluator name '{ev.name}' conflicts with built-in plugin.", + detail=f"Evaluator name '{ev.name}' conflicts with built-in evaluator.", resource="Evaluator", resource_id=ev.name, - hint="Choose a different name that does not conflict with built-in plugins.", + hint="Choose a different name that does not conflict with built-in evaluators.", errors=[ ValidationErrorItem( resource="Evaluator", field="name", code="name_conflict", - message=f"Name '{ev.name}' conflicts with a built-in plugin", + message=f"Name '{ev.name}' conflicts with a built-in evaluator", value=ev.name, ) ], @@ -1137,7 +1137,7 @@ async def patch_agent( referencing_controls: list[tuple[str, str]] = [] # (control_name, evaluator) for ctrl in controls: - evaluator_ref = ctrl.control.evaluator.plugin + evaluator_ref = 
ctrl.control.evaluator.name if ":" in evaluator_ref: ref_agent, ref_eval = evaluator_ref.split(":", 1) # Check if this control references an evaluator we're removing @@ -1154,7 +1154,7 @@ async def patch_agent( errors=[ ValidationErrorItem( resource="Control", - field="evaluator.plugin", + field="evaluator.name", code="in_use", message=f"Control '{ctrl}' uses evaluator '{ev}'", ) diff --git a/server/src/agent_control_server/endpoints/controls.py b/server/src/agent_control_server/endpoints/controls.py index f348178d..c24fd5ce 100644 --- a/server/src/agent_control_server/endpoints/controls.py +++ b/server/src/agent_control_server/endpoints/controls.py @@ -1,4 +1,4 @@ -from agent_control_engine import list_plugins +from agent_control_engine import list_evaluators from agent_control_models import ControlDefinition from agent_control_models.errors import ErrorCode, ValidationErrorItem from agent_control_models.server import ( @@ -250,8 +250,8 @@ async def set_control_data( ) # Validate evaluator config - plugin_ref = request.data.evaluator.plugin - agent_name, eval_name = parse_evaluator_ref(plugin_ref) + evaluator_ref = request.data.evaluator.name + agent_name, eval_name = parse_evaluator_ref(evaluator_ref) if agent_name is not None: # Agent-scoped evaluator: validate against agent's registered schema @@ -306,10 +306,10 @@ async def set_control_data( errors=[ ValidationErrorItem( resource="Control", - field="data.evaluator.plugin", + field="data.evaluator.name", code="evaluator_not_found", message=f"Evaluator '{eval_name}' not found on agent '{agent_name}'", - value=plugin_ref, + value=evaluator_ref, ) ], ) @@ -336,17 +336,17 @@ async def set_control_data( ], ) else: - # Built-in or server-side plugin: validate if registered - plugin_cls = list_plugins().get(eval_name) - if plugin_cls is not None: + # Built-in or server-side evaluator: validate if registered + evaluator_cls = list_evaluators().get(eval_name) + if evaluator_cls is not None: try: - 
plugin_cls.config_model(**request.data.evaluator.config) + evaluator_cls.config_model(**request.data.evaluator.config) except ValidationError as e: raise APIValidationError( error_code=ErrorCode.INVALID_CONFIG, - detail=f"Config validation failed for plugin '{eval_name}'", + detail=f"Config validation failed for evaluator '{eval_name}'", resource="Control", - hint="Check the plugin's config schema for required fields and types.", + hint="Check the evaluator's config schema for required fields and types.", errors=[ ValidationErrorItem( resource="Control", @@ -363,9 +363,9 @@ async def set_control_data( except TypeError as e: raise APIValidationError( error_code=ErrorCode.INVALID_CONFIG, - detail=f"Invalid config parameters for plugin '{eval_name}'", + detail=f"Invalid config parameters for evaluator '{eval_name}'", resource="Control", - hint="Check the plugin's config schema for valid parameter names.", + hint="Check the evaluator's config schema for valid parameter names.", errors=[ ValidationErrorItem( resource="Control", @@ -375,7 +375,7 @@ async def set_control_data( ) ], ) - # If plugin not found, allow it - might be a server-side registered plugin + # If evaluator not found, allow it - might be a server-side registered evaluator # that will be validated at runtime data_json = request.data.model_dump(mode="json", exclude_none=True, exclude_unset=True) diff --git a/server/src/agent_control_server/endpoints/evaluation.py b/server/src/agent_control_server/endpoints/evaluation.py index e38fe86a..61cee6e5 100644 --- a/server/src/agent_control_server/endpoints/evaluation.py +++ b/server/src/agent_control_server/endpoints/evaluation.py @@ -63,7 +63,7 @@ async def evaluate( evaluation engine. Controls are evaluated in parallel with cancel-on-deny for efficiency. - Custom evaluators must be deployed as PluginEvaluator classes + Custom evaluators must be deployed as Evaluator classes with the engine. Their schemas are registered via initAgent. 
Optionally accepts X-Trace-Id and X-Span-Id headers for @@ -187,7 +187,7 @@ async def _emit_observability_events( matched=True, confidence=match.result.confidence, timestamp=now, - evaluator_plugin=ctrl.control.evaluator.plugin if ctrl else None, + evaluator_name=ctrl.control.evaluator.name if ctrl else None, error_message=match.result.error, metadata=match.result.metadata or {}, ) @@ -212,7 +212,7 @@ async def _emit_observability_events( matched=False, confidence=error.result.confidence, timestamp=now, - evaluator_plugin=ctrl.control.evaluator.plugin if ctrl else None, + evaluator_name=ctrl.control.evaluator.name if ctrl else None, error_message=error.result.error, metadata=error.result.metadata or {}, ) @@ -237,7 +237,7 @@ async def _emit_observability_events( matched=False, confidence=non_match.result.confidence, timestamp=now, - evaluator_plugin=ctrl.control.evaluator.plugin if ctrl else None, + evaluator_name=ctrl.control.evaluator.name if ctrl else None, error_message=None, metadata=non_match.result.metadata or {}, ) diff --git a/server/src/agent_control_server/endpoints/evaluator_configs.py b/server/src/agent_control_server/endpoints/evaluator_configs.py index 3b3422fc..e297cc71 100644 --- a/server/src/agent_control_server/endpoints/evaluator_configs.py +++ b/server/src/agent_control_server/endpoints/evaluator_configs.py @@ -2,7 +2,7 @@ from http import HTTPStatus from typing import Any -from agent_control_engine import list_plugins +from agent_control_engine import list_evaluators from agent_control_models.errors import ErrorCode, ValidationErrorItem from agent_control_models.server import ( CreateEvaluatorConfigRequest, @@ -38,28 +38,28 @@ def _to_item(config: EvaluatorConfigDB) -> EvaluatorConfigItem: id=config.id, name=config.name, description=config.description, - plugin=config.plugin, + evaluator=config.evaluator, config=config.config, created_at=config.created_at.isoformat() if config.created_at else None, updated_at=config.updated_at.isoformat() if 
config.updated_at else None, ) -def _ensure_not_agent_scoped(plugin: str) -> None: - agent_name, _ = parse_evaluator_ref(plugin) +def _ensure_not_agent_scoped(evaluator: str) -> None: + agent_name, _ = parse_evaluator_ref(evaluator) if agent_name is not None: raise APIValidationError( error_code=ErrorCode.VALIDATION_ERROR, detail="Agent-scoped evaluators are not supported for evaluator configs", resource="EvaluatorConfig", - hint="Use a built-in plugin name without an agent prefix.", + hint="Use a built-in evaluator name without an agent prefix.", errors=[ ValidationErrorItem( resource="EvaluatorConfig", - field="plugin", + field="evaluator", code="agent_scoped_not_supported", message="Agent-scoped evaluator references are not supported", - value=plugin, + value=evaluator, ) ], ) @@ -80,13 +80,13 @@ def _raise_invalid_config( ) -def _validate_known_plugin_config(plugin: str, config: dict[str, Any]) -> None: - plugin_cls = list_plugins().get(plugin) - if plugin_cls is None: +def _validate_known_evaluator_config(evaluator: str, config: dict[str, Any]) -> None: + evaluator_cls = list_evaluators().get(evaluator) + if evaluator_cls is None: return try: - plugin_cls.config_model(**config) + evaluator_cls.config_model(**config) except ValidationError as e: _raise_invalid_config( [ @@ -98,8 +98,8 @@ def _validate_known_plugin_config(plugin: str, config: dict[str, Any]) -> None: ) for err in e.errors() ], - detail=f"Config validation failed for plugin '{plugin}'", - hint="Check the plugin's config schema for required fields and types.", + detail=f"Config validation failed for evaluator '{evaluator}'", + hint="Check the evaluator's config schema for required fields and types.", ) except TypeError as e: _raise_invalid_config( @@ -111,14 +111,14 @@ def _validate_known_plugin_config(plugin: str, config: dict[str, Any]) -> None: message=str(e), ) ], - detail=f"Invalid config parameters for plugin '{plugin}'", - hint="Check the plugin's config schema for valid parameter names.", 
+ detail=f"Invalid config parameters for evaluator '{evaluator}'", + hint="Check the evaluator's config schema for valid parameter names.", ) -def _validate_plugin_config(plugin: str, config: dict[str, Any]) -> None: - _ensure_not_agent_scoped(plugin) - _validate_known_plugin_config(plugin, config) +def _validate_evaluator_config(evaluator: str, config: dict[str, Any]) -> None: + _ensure_not_agent_scoped(evaluator) + _validate_known_evaluator_config(evaluator, config) def _is_name_conflict_error(exc: IntegrityError) -> bool: @@ -156,12 +156,12 @@ async def create_evaluator_config( request: CreateEvaluatorConfigRequest, db: AsyncSession = Depends(get_async_db), ) -> EvaluatorConfigItem: - _validate_plugin_config(request.plugin, request.config) + _validate_evaluator_config(request.evaluator, request.config) evaluator_config = EvaluatorConfigDB( name=request.name, description=request.description, - plugin=request.plugin, + evaluator=request.evaluator, config=request.config, ) db.add(evaluator_config) @@ -208,7 +208,7 @@ async def list_evaluator_configs( cursor: int | None = Query(None, description="Evaluator config ID to start after"), limit: int = Query(_DEFAULT_PAGINATION_LIMIT, ge=1, le=_MAX_PAGINATION_LIMIT), name: str | None = Query(None, description="Filter by name (partial, case-insensitive)"), - plugin: str | None = Query(None, description="Filter by plugin name"), + evaluator: str | None = Query(None, description="Filter by evaluator name"), db: AsyncSession = Depends(get_async_db), ) -> ListEvaluatorConfigsResponse: query = select(EvaluatorConfigDB).order_by(EvaluatorConfigDB.id.desc()) @@ -219,8 +219,8 @@ async def list_evaluator_configs( if name is not None: query = query.where(EvaluatorConfigDB.name.ilike(f"%{name}%")) - if plugin is not None: - query = query.where(EvaluatorConfigDB.plugin == plugin) + if evaluator is not None: + query = query.where(EvaluatorConfigDB.evaluator == evaluator) query = query.limit(limit + 1) result = await db.execute(query) 
@@ -229,8 +229,8 @@ async def list_evaluator_configs( total_query = select(func.count()).select_from(EvaluatorConfigDB) if name is not None: total_query = total_query.where(EvaluatorConfigDB.name.ilike(f"%{name}%")) - if plugin is not None: - total_query = total_query.where(EvaluatorConfigDB.plugin == plugin) + if evaluator is not None: + total_query = total_query.where(EvaluatorConfigDB.evaluator == evaluator) total_result = await db.execute(total_query) total = total_result.scalar() or 0 @@ -301,11 +301,11 @@ async def update_evaluator_config( hint="Verify the evaluator config ID is correct.", ) - _validate_plugin_config(request.plugin, request.config) + _validate_evaluator_config(request.evaluator, request.config) evaluator_config.name = request.name evaluator_config.description = request.description - evaluator_config.plugin = request.plugin + evaluator_config.evaluator = request.evaluator evaluator_config.config = request.config evaluator_config.updated_at = dt.datetime.now(dt.UTC) diff --git a/server/src/agent_control_server/endpoints/evaluators.py b/server/src/agent_control_server/endpoints/evaluators.py new file mode 100644 index 00000000..99f7be27 --- /dev/null +++ b/server/src/agent_control_server/endpoints/evaluators.py @@ -0,0 +1,55 @@ +"""Evaluator discovery endpoints.""" + +from typing import Any + +from agent_control_engine import list_evaluators +from fastapi import APIRouter +from pydantic import BaseModel, Field + +router = APIRouter(prefix="/evaluators", tags=["evaluators"]) + + +class EvaluatorInfo(BaseModel): + """Information about a registered evaluator.""" + + name: str = Field(..., description="Evaluator name") + version: str = Field(..., description="Evaluator version") + description: str = Field(..., description="Evaluator description") + requires_api_key: bool = Field(..., description="Whether evaluator requires API key") + timeout_ms: int = Field(..., description="Default timeout in milliseconds") + config_schema: dict[str, Any] = 
Field(..., description="JSON Schema for config") + + +@router.get( + "", + response_model=dict[str, EvaluatorInfo], + summary="List available evaluators", + response_description="Dictionary of evaluator name to evaluator info", +) +async def get_evaluators() -> dict[str, EvaluatorInfo]: + """List all available evaluators. + + Returns metadata and JSON Schema for each built-in evaluator. + + Built-in evaluators: + - **regex**: Regular expression pattern matching + - **list**: List-based value matching with flexible logic + - **json**: JSON validation with schema, types, constraints + - **sql**: SQL query validation + + Custom evaluators are registered per-agent via initAgent. + Use GET /agents/{agent_id}/evaluators to list agent-specific schemas. + """ + evaluators = list_evaluators() + + return { + name: EvaluatorInfo( + name=evaluator_cls.metadata.name, + version=evaluator_cls.metadata.version, + description=evaluator_cls.metadata.description, + requires_api_key=evaluator_cls.metadata.requires_api_key, + timeout_ms=evaluator_cls.metadata.timeout_ms, + config_schema=evaluator_cls.config_model.model_json_schema(), + ) + for name, evaluator_cls in evaluators.items() + } diff --git a/server/src/agent_control_server/endpoints/plugins.py b/server/src/agent_control_server/endpoints/plugins.py deleted file mode 100644 index 0f6d1367..00000000 --- a/server/src/agent_control_server/endpoints/plugins.py +++ /dev/null @@ -1,53 +0,0 @@ -"""Plugin discovery endpoints.""" - -from typing import Any - -from agent_control_engine import list_plugins -from fastapi import APIRouter -from pydantic import BaseModel, Field - -router = APIRouter(prefix="/plugins", tags=["plugins"]) - - -class PluginInfo(BaseModel): - """Information about a registered plugin.""" - - name: str = Field(..., description="Plugin name") - version: str = Field(..., description="Plugin version") - description: str = Field(..., description="Plugin description") - requires_api_key: bool = Field(..., 
description="Whether plugin requires API key") - timeout_ms: int = Field(..., description="Default timeout in milliseconds") - config_schema: dict[str, Any] = Field(..., description="JSON Schema for config") - - -@router.get( - "", - response_model=dict[str, PluginInfo], - summary="List available plugins", - response_description="Dictionary of plugin name to plugin info", -) -async def get_plugins() -> dict[str, PluginInfo]: - """List all available evaluator plugins. - - Returns metadata and JSON Schema for each built-in plugin. - - Built-in plugins: - - **regex**: Regular expression pattern matching - - **list**: List-based value matching with flexible logic - - Custom evaluators are registered per-agent via initAgent. - Use GET /agents/{agent_id}/evaluators to list agent-specific schemas. - """ - plugins = list_plugins() - - return { - name: PluginInfo( - name=plugin_cls.metadata.name, - version=plugin_cls.metadata.version, - description=plugin_cls.metadata.description, - requires_api_key=plugin_cls.metadata.requires_api_key, - timeout_ms=plugin_cls.metadata.timeout_ms, - config_schema=plugin_cls.config_model.model_json_schema(), - ) - for name, plugin_cls in plugins.items() - } diff --git a/server/src/agent_control_server/main.py b/server/src/agent_control_server/main.py index 2b33dc64..ce754b97 100644 --- a/server/src/agent_control_server/main.py +++ b/server/src/agent_control_server/main.py @@ -5,7 +5,7 @@ from contextlib import asynccontextmanager import uvicorn -from agent_control_engine import discover_plugins, list_plugins +from agent_control_engine import discover_evaluators, list_evaluators from agent_control_models import HealthResponse from fastapi import Depends, FastAPI, HTTPException from fastapi.exceptions import RequestValidationError @@ -19,8 +19,8 @@ from .endpoints.controls import router as control_router from .endpoints.evaluation import router as evaluation_router from .endpoints.evaluator_configs import router as evaluator_config_router 
+from .endpoints.evaluators import router as evaluator_router from .endpoints.observability import router as observability_router -from .endpoints.plugins import router as plugin_router from .endpoints.policies import router as policy_router from .errors import ( APIError, @@ -76,10 +76,10 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: log_level = "DEBUG" if settings.debug else "INFO" configure_logging(level=log_level) - # Discover plugins at startup - discover_plugins() - available = list(list_plugins().keys()) - logger.info(f"Plugin discovery complete. Available plugins: {available}") + # Discover evaluators at startup + discover_evaluators() + available = list(list_evaluators().keys()) + logger.info(f"Evaluator discovery complete. Available evaluators: {available}") # Initialize observability components (stored on app.state) if observability_settings.enabled: @@ -203,7 +203,7 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: ) app.include_router( - plugin_router, + evaluator_router, prefix=api_v1_prefix, dependencies=[Depends(require_api_key)], ) diff --git a/server/src/agent_control_server/models.py b/server/src/agent_control_server/models.py index 16df1880..093a3b17 100644 --- a/server/src/agent_control_server/models.py +++ b/server/src/agent_control_server/models.py @@ -73,7 +73,7 @@ class EvaluatorConfigDB(Base): id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) name: Mapped[str] = mapped_column(String(255), nullable=False, unique=True) description: Mapped[str | None] = mapped_column(String(1000), nullable=True) - plugin: Mapped[str] = mapped_column(String(255), nullable=False, index=True) + evaluator: Mapped[str] = mapped_column(String(255), nullable=False, index=True) config: Mapped[dict[str, Any]] = mapped_column(JSONB, nullable=False) created_at: Mapped[dt.datetime] = mapped_column( DateTime(timezone=True), diff --git a/server/src/agent_control_server/services/evaluator_utils.py 
b/server/src/agent_control_server/services/evaluator_utils.py index d6fc1234..cb162f51 100644 --- a/server/src/agent_control_server/services/evaluator_utils.py +++ b/server/src/agent_control_server/services/evaluator_utils.py @@ -7,23 +7,23 @@ from jsonschema_rs import validator_for -def parse_evaluator_ref(plugin: str) -> tuple[str | None, str]: - """Parse plugin reference into (agent_name, evaluator_name). +def parse_evaluator_ref(evaluator_ref: str) -> tuple[str | None, str]: + """Parse evaluator reference into (agent_name, evaluator_name). - Built-in plugins have no prefix, agent-scoped evaluators use {agent}:{name} format. + Built-in evaluators have no prefix, agent-scoped evaluators use {agent}:{name} format. Args: - plugin: Plugin reference string (e.g., "regex" or "my-agent:pii-detector") + evaluator_ref: Evaluator reference string (e.g., "regex" or "my-agent:pii-detector") Returns: Tuple of (agent_name, evaluator_name): - - (None, "regex") for built-in plugins + - (None, "regex") for built-in evaluators - ("my-agent", "pii-detector") for agent-scoped evaluators """ - if ":" in plugin: - agent, name = plugin.split(":", 1) + if ":" in evaluator_ref: + agent, name = evaluator_ref.split(":", 1) return agent, name - return None, plugin + return None, evaluator_ref def _canonicalize_schema(schema: dict[str, Any]) -> str: diff --git a/server/tests/conftest.py b/server/tests/conftest.py index 6805c870..39304828 100644 --- a/server/tests/conftest.py +++ b/server/tests/conftest.py @@ -3,15 +3,15 @@ from sqlalchemy import create_engine, text from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker -from agent_control_engine import discover_plugins +from agent_control_engine import discover_evaluators from agent_control_server.config import auth_settings, db_config from agent_control_server.db import Base from agent_control_server.main import app as fastapi_app import agent_control_server.models # ensure models are imported so tables 
are registered -# Discover plugins at test session start -discover_plugins() +# Discover evaluators at test session start +discover_evaluators() # Test API keys TEST_API_KEY = "test-api-key-12345" diff --git a/server/tests/test_auth.py b/server/tests/test_auth.py index 99a8cba4..2cf0ef1c 100644 --- a/server/tests/test_auth.py +++ b/server/tests/test_auth.py @@ -77,31 +77,31 @@ def test_admin_key_works_on_protected_endpoints(self, admin_client: TestClient) assert response.status_code == 404 -class TestPluginsEndpoint: - """Plugins endpoint requires valid API key (regular or admin).""" +class TestEvaluatorsEndpoint: + """Evaluators endpoint requires valid API key (regular or admin).""" - def test_regular_key_works_on_plugins(self, client: TestClient) -> None: - """Given regular API key, when listing plugins, then returns 200.""" + def test_regular_key_works_on_evaluators(self, client: TestClient) -> None: + """Given regular API key, when listing evaluators, then returns 200.""" # When - response = client.get("/api/v1/plugins") + response = client.get("/api/v1/evaluators") # Then assert response.status_code == 200 - def test_admin_key_works_on_plugins(self, admin_client: TestClient) -> None: - """Given admin API key, when listing plugins, then returns 200.""" + def test_admin_key_works_on_evaluators(self, admin_client: TestClient) -> None: + """Given admin API key, when listing evaluators, then returns 200.""" # When - response = admin_client.get("/api/v1/plugins") + response = admin_client.get("/api/v1/evaluators") # Then assert response.status_code == 200 - def test_missing_key_returns_401_on_plugins( + def test_missing_key_returns_401_on_evaluators( self, unauthenticated_client: TestClient ) -> None: - """Given no API key, when listing plugins, then returns 401.""" + """Given no API key, when listing evaluators, then returns 401.""" # When - response = unauthenticated_client.get("/api/v1/plugins") + response = unauthenticated_client.get("/api/v1/evaluators") # Then 
assert response.status_code == 401 @@ -127,12 +127,12 @@ def test_no_key_allowed_when_disabled( # Then (404 for non-existent resource, but NOT 401) assert response.status_code == 404 - def test_plugins_accessible_when_disabled( + def test_evaluators_accessible_when_disabled( self, unauthenticated_client: TestClient ) -> None: - """Given auth disabled, when listing plugins without API key, then returns 200.""" + """Given auth disabled, when listing evaluators without API key, then returns 200.""" # When - response = unauthenticated_client.get("/api/v1/plugins") + response = unauthenticated_client.get("/api/v1/evaluators") # Then assert response.status_code == 200 diff --git a/server/tests/test_controls.py b/server/tests/test_controls.py index 9f1da0ca..2be65b56 100644 --- a/server/tests/test_controls.py +++ b/server/tests/test_controls.py @@ -40,7 +40,7 @@ def test_get_control_data_initially_unconfigured(client: TestClient) -> None: "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": "test", "flags": []} }, "action": {"decision": "deny"}, diff --git a/server/tests/test_controls_validation.py b/server/tests/test_controls_validation.py index f9c95ffe..40e663b3 100644 --- a/server/tests/test_controls_validation.py +++ b/server/tests/test_controls_validation.py @@ -16,7 +16,7 @@ def test_validation_invalid_logic_enum(client: TestClient): # Given: Payload with invalid 'logic' value payload = VALID_CONTROL_PAYLOAD.copy() payload["evaluator"] = { - "plugin": "list", + "name": "list", "config": { "values": ["a", "b"], "logic": "invalid_logic", # Should be 'any' or 'all' @@ -41,23 +41,23 @@ def test_validation_discriminator_mismatch(client: TestClient): """Test that config must match the evaluator type.""" control_id = create_control(client) - # Given: type='list' but config has 'pattern' (RegexConfig) + # Given: type='list' but config has 'pattern' 
(RegexEvaluatorConfig) payload = VALID_CONTROL_PAYLOAD.copy() payload["evaluator"] = { - "plugin": "list", + "name": "list", "config": { - "pattern": "some_regex", # Invalid for ListConfig + "pattern": "some_regex", # Invalid for ListEvaluatorConfig # Missing 'values' } } - + # When: Setting control data resp = client.put(f"/api/v1/controls/{control_id}/data", json={"data": payload}) - + # Then: 422 Unprocessable Entity assert resp.status_code == 422 - - # Verify error mentions missing required field for ListConfig (RFC 7807 format) + + # Verify error mentions missing required field for ListEvaluatorConfig (RFC 7807 format) response_data = resp.json() errors = response_data.get("errors", []) # Expecting 'values' field missing @@ -72,7 +72,7 @@ def test_validation_regex_flags_list(client: TestClient): # Given: regex config with invalid flags type (string instead of list) payload = VALID_CONTROL_PAYLOAD.copy() payload["evaluator"] = { - "plugin": "regex", + "name": "regex", "config": { "pattern": "abc", "flags": "IGNORECASE" # Should be ["IGNORECASE"] @@ -96,7 +96,7 @@ def test_validation_invalid_regex_pattern(client: TestClient): # Given: regex config with invalid pattern (unclosed bracket) payload = VALID_CONTROL_PAYLOAD.copy() payload["evaluator"] = { - "plugin": "regex", + "name": "regex", "config": { "pattern": "[", # Invalid regex "flags": [] diff --git a/server/tests/test_error_handling.py b/server/tests/test_error_handling.py index 4a256f9f..1536f198 100644 --- a/server/tests/test_error_handling.py +++ b/server/tests/test_error_handling.py @@ -291,7 +291,7 @@ async def mock_db_returns_control() -> AsyncGenerator[AsyncSession, None]: "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, - "evaluator": {"plugin": "regex", "config": {"pattern": "x"}}, + "evaluator": {"name": "regex", "config": {"pattern": "x"}}, "action": {"decision": "deny"} } resp = client.put( diff --git 
a/server/tests/test_evaluation_e2e.py b/server/tests/test_evaluation_e2e.py index 63ea5dec..32734e9d 100644 --- a/server/tests/test_evaluation_e2e.py +++ b/server/tests/test_evaluation_e2e.py @@ -14,7 +14,7 @@ def test_evaluation_flow_deny(client: TestClient): "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": "secret"} }, "action": {"decision": "deny"} @@ -100,7 +100,7 @@ def test_evaluation_path_failure(client: TestClient): "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input.non_existent_field"}, # Invalid for string input "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": ".*"} # Match anything if found }, "action": {"decision": "deny"} @@ -133,7 +133,7 @@ def test_evaluation_tool_step_nested(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.config.risk_level"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": "^critical$"} }, "action": {"decision": "deny"} @@ -188,7 +188,7 @@ def test_evaluation_deny_precedence(client: TestClient): "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, - "evaluator": {"plugin": "regex", "config": {"pattern": "keyword"}}, + "evaluator": {"name": "regex", "config": {"pattern": "keyword"}}, "action": {"decision": "warn"} } # Use helper to setup agent with first control @@ -206,7 +206,7 @@ def test_evaluation_deny_precedence(client: TestClient): "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, - "evaluator": {"plugin": "regex", "config": {"pattern": "keyword"}}, + "evaluator": {"name": "regex", "config": {"pattern": "keyword"}}, "action": {"decision": "deny"} } resp = client.put("/api/v1/controls", json={"name": 
f"deny-control-{uuid.uuid4()}"}) @@ -243,7 +243,7 @@ def test_evaluation_stage_filtering(client: TestClient): "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, - "evaluator": {"plugin": "regex", "config": {"pattern": "bad_output"}}, + "evaluator": {"name": "regex", "config": {"pattern": "bad_output"}}, "action": {"decision": "deny"} } agent_uuid, _ = create_and_assign_policy(client, control_data, agent_name="StageAgent") @@ -280,7 +280,7 @@ def test_evaluation_step_type_filtering(client: TestClient): "execution": "server", "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "name"}, - "evaluator": {"plugin": "regex", "config": {"pattern": "rm_rf"}}, + "evaluator": {"name": "regex", "config": {"pattern": "rm_rf"}}, "action": {"decision": "deny"} } agent_uuid, _ = create_and_assign_policy(client, control_data, agent_name="AppliesToAgent") @@ -315,7 +315,7 @@ def test_evaluation_denylist_step_name(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "name"}, "evaluator": { - "plugin": "list", # Matches if value is IN list (exact match) + "name": "list", # Matches if value is IN list (exact match) "config": {"values": ["dangerous_tool", "rm_rf"], "match_on": "match"} }, "action": {"decision": "deny"} diff --git a/server/tests/test_evaluation_e2e_list_evaluator.py b/server/tests/test_evaluation_e2e_list_evaluator.py index f71d4967..1f4cf143 100644 --- a/server/tests/test_evaluation_e2e_list_evaluator.py +++ b/server/tests/test_evaluation_e2e_list_evaluator.py @@ -14,7 +14,7 @@ def test_list_evaluator_denylist_behavior(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.cmd"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["rm", "shutdown"], "logic": "any", @@ -62,7 +62,7 @@ def test_list_evaluator_allowlist_behavior(client: TestClient): "scope": 
{"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "name"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["safe_tool"], "logic": "any", @@ -109,7 +109,7 @@ def test_list_evaluator_case_insensitive(client: TestClient): "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["BlockMe"], "case_sensitive": False, @@ -142,7 +142,7 @@ def test_list_evaluator_list_input_any_match(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.tags"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["restricted"], "logic": "any", @@ -189,7 +189,7 @@ def test_list_evaluator_list_input_all_match(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.tags"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["safe_tag", "audit_approved"], "logic": "all", @@ -235,7 +235,7 @@ def test_list_evaluator_disallow_name(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "name"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["delete_user", "drop_db"], "logic": "any", @@ -278,7 +278,7 @@ def test_list_evaluator_allow_only_argument_values(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.region"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["us-east-1", "us-west-2"], "logic": "any", @@ -322,7 +322,7 @@ def test_list_evaluator_edge_cases(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "name"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": [], "logic": "any", @@ -352,7 +352,7 @@ def test_list_evaluator_edge_cases(client: TestClient): "scope": {"step_types": 
["tool"], "stages": ["pre"]}, "selector": {"path": "input.count"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": [10, 20], # Integers in control "logic": "any", @@ -392,7 +392,7 @@ def test_list_evaluator_edge_cases(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.query"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["(test)", "a.b*c"], # Literal parens and dot/star "logic": "any", @@ -433,7 +433,7 @@ def test_list_evaluator_edge_cases(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.missing_arg"}, # Will be None "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["something"], "logic": "any", @@ -469,7 +469,7 @@ def test_list_evaluator_re2_corner_cases(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.item"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": large_list, "logic": "any", @@ -501,7 +501,7 @@ def test_list_evaluator_newline_strictness(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.val"}, "evaluator": { - "plugin": "list", + "name": "list", "config": { "values": ["exact"], "logic": "any", diff --git a/server/tests/test_evaluation_e2e_sql_evaluator.py b/server/tests/test_evaluation_e2e_sql_evaluator.py index 05f3a06f..e8369a2f 100644 --- a/server/tests/test_evaluation_e2e_sql_evaluator.py +++ b/server/tests/test_evaluation_e2e_sql_evaluator.py @@ -1,4 +1,4 @@ -"""End-to-end tests for SQL evaluator plugin.""" +"""End-to-end tests for SQL evaluator.""" from agent_control_models import EvaluationRequest, Step from fastapi.testclient import TestClient @@ -21,7 +21,7 @@ def test_sql_read_only_agent(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.query"}, "evaluator": { - "plugin": 
"sql", + "name": "sql", "config": { "allowed_operations": ["SELECT"], "require_limit": True, @@ -102,7 +102,7 @@ def test_sql_multi_tenant_security(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.query"}, "evaluator": { - "plugin": "sql", + "name": "sql", "config": { "required_columns": ["tenant_id"], "column_context": "where" @@ -168,7 +168,7 @@ def test_sql_block_destructive_operations(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.query"}, "evaluator": { - "plugin": "sql", + "name": "sql", "config": { "blocked_operations": ["DROP", "TRUNCATE", "DELETE"] } @@ -261,7 +261,7 @@ def test_sql_table_restrictions(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.query"}, "evaluator": { - "plugin": "sql", + "name": "sql", "config": { "allowed_tables": ["users", "orders"] } @@ -340,7 +340,7 @@ def test_sql_multi_statement_blocking(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.query"}, "evaluator": { - "plugin": "sql", + "name": "sql", "config": { "allow_multi_statements": False } @@ -391,7 +391,7 @@ def test_sql_limit_enforcement(client: TestClient): "scope": {"step_types": ["tool"], "stages": ["pre"]}, "selector": {"path": "input.query"}, "evaluator": { - "plugin": "sql", + "name": "sql", "config": { "require_limit": True, "max_limit": 1000 @@ -491,7 +491,7 @@ def test_sql_llm_output_validation_read_only(client: TestClient): "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, "evaluator": { - "plugin": "sql", + "name": "sql", "config": { "allowed_operations": ["SELECT"], "require_limit": True @@ -554,7 +554,7 @@ def test_sql_llm_output_multi_statement_blocking(client: TestClient): "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, "evaluator": { - "plugin": "sql", + 
"name": "sql", "config": { "allow_multi_statements": False } @@ -603,7 +603,7 @@ def test_sql_llm_output_table_restrictions(client: TestClient): "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, "selector": {"path": "output"}, "evaluator": { - "plugin": "sql", + "name": "sql", "config": { "allowed_tables": ["analytics", "reports"] } diff --git a/server/tests/test_evaluation_error_handling.py b/server/tests/test_evaluation_error_handling.py index 1c20747b..6a780e36 100644 --- a/server/tests/test_evaluation_error_handling.py +++ b/server/tests/test_evaluation_error_handling.py @@ -32,7 +32,7 @@ def test_evaluation_with_agent_scoped_evaluator_missing(client: TestClient): "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, "evaluator": { - "plugin": f"{agent_name}:missing-evaluator", + "name": f"{agent_name}:missing-evaluator", "config": {} }, "action": {"decision": "deny"} @@ -52,7 +52,7 @@ def test_evaluation_with_agent_scoped_evaluator_missing(client: TestClient): def test_evaluation_control_with_invalid_config_caught_early(client: TestClient): """Test that invalid evaluator config is caught at control creation. - Given: A control with invalid config for a plugin + Given: A control with invalid config for an evaluator When: Setting control data Then: Returns 422 with validation error """ @@ -69,7 +69,7 @@ def test_evaluation_control_with_invalid_config_caught_early(client: TestClient) "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {} # Missing required 'pattern' field }, "action": {"decision": "deny"} @@ -87,7 +87,7 @@ def test_evaluation_errors_field_populated_on_evaluator_failure( ): """Test that errors field is populated when evaluator fails at runtime. 
- Given: A valid control with a plugin that crashes during evaluation + Given: A valid control with an evaluator that crashes during evaluation When: Evaluation is requested Then: Response has errors field populated and is_safe=False (for deny) """ @@ -101,25 +101,25 @@ def test_evaluation_errors_field_populated_on_evaluator_failure( "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, "evaluator": { - "plugin": "regex", + "name": "regex", "config": {"pattern": "test"} }, "action": {"decision": "deny"} } agent_uuid, control_name = create_and_assign_policy(client, control_data) - # Mock get_evaluator to return a plugin that throws + # Mock get_evaluator_instance to return an evaluator that throws mock_evaluator = MagicMock() - mock_evaluator.evaluate = AsyncMock(side_effect=RuntimeError("Simulated plugin crash")) + mock_evaluator.evaluate = AsyncMock(side_effect=RuntimeError("Simulated evaluator crash")) mock_evaluator.get_timeout_seconds = MagicMock(return_value=30.0) # Patch where it's used (in core module), not where it's defined import agent_control_engine.core as core_module - def mock_get_evaluator(config): + def mock_get_evaluator_instance(config): return mock_evaluator - monkeypatch.setattr(core_module, "get_evaluator", mock_get_evaluator) + monkeypatch.setattr(core_module, "get_evaluator_instance", mock_get_evaluator_instance) # When: Sending evaluation request payload = Step(type="llm_inference", name="test-step", input="test content", output=None) @@ -145,7 +145,7 @@ def mock_get_evaluator(config): assert len(data["errors"]) == 1 assert data["errors"][0]["control_name"] == control_name assert "RuntimeError" in data["errors"][0]["result"]["error"] - assert "Simulated plugin crash" in data["errors"][0]["result"]["error"] + assert "Simulated evaluator crash" in data["errors"][0]["result"]["error"] # No matches because evaluation failed assert data["matches"] is None or len(data["matches"]) == 0 diff --git 
a/server/tests/test_evaluator_configs.py b/server/tests/test_evaluator_configs.py index 4ae8064d..7fb08546 100644 --- a/server/tests/test_evaluator_configs.py +++ b/server/tests/test_evaluator_configs.py @@ -9,30 +9,30 @@ from fastapi.testclient import TestClient -def _default_config_for_plugin(plugin: str) -> dict: - if plugin == "list": +def _default_config_for_evaluator(evaluator: str) -> dict: + if evaluator == "list": return {"values": ["blocked"], "logic": "any", "match_on": "match"} - if plugin == "regex": + if evaluator == "regex": return {"pattern": r"\b\d{3}-\d{2}-\d{4}\b"} return {} def _create_config_payload( name: str, - plugin: str = "regex", + evaluator: str = "regex", config: dict | None = None, description: str | None = None, ) -> dict: return { "name": name, "description": description, - "plugin": plugin, - "config": config if config is not None else _default_config_for_plugin(plugin), + "evaluator": evaluator, + "config": config if config is not None else _default_config_for_evaluator(evaluator), } -def _create_config(client: TestClient, name: str, plugin: str = "regex") -> dict: - payload = _create_config_payload(name=name, plugin=plugin) +def _create_config(client: TestClient, name: str, evaluator: str = "regex") -> dict: + payload = _create_config_payload(name=name, evaluator=evaluator) resp = client.post("/api/v1/evaluator-configs", json=payload) assert resp.status_code == 201 return resp.json() @@ -51,7 +51,7 @@ def test_create_evaluator_config_success(client: TestClient) -> None: data = resp.json() assert data["id"] is not None assert data["name"] == name - assert data["plugin"] == "regex" + assert data["evaluator"] == "regex" assert data["config"]["pattern"] == payload["config"]["pattern"] assert data["created_at"] is not None assert data["updated_at"] is not None @@ -71,10 +71,10 @@ def test_create_evaluator_config_duplicate_name_409(client: TestClient) -> None: assert data["error_code"] == "EVALUATOR_CONFIG_NAME_CONFLICT" -def 
test_create_evaluator_config_unknown_plugin_allowed(client: TestClient) -> None: - # Given: A payload with an unknown plugin name +def test_create_evaluator_config_unknown_evaluator_allowed(client: TestClient) -> None: + # Given: A payload with an unknown evaluator name name = f"config-{uuid.uuid4().hex}" - payload = _create_config_payload(name=name, plugin="unknown-plugin", config={}) + payload = _create_config_payload(name=name, evaluator="unknown-evaluator", config={}) # When: Creating the evaluator config resp = client.post("/api/v1/evaluator-configs", json=payload) @@ -82,13 +82,13 @@ def test_create_evaluator_config_unknown_plugin_allowed(client: TestClient) -> N # Then: It succeeds (validation skipped) assert resp.status_code == 201 data = resp.json() - assert data["plugin"] == "unknown-plugin" + assert data["evaluator"] == "unknown-evaluator" def test_create_evaluator_config_agent_scoped_rejected(client: TestClient) -> None: - # Given: A payload referencing an agent-scoped plugin + # Given: A payload referencing an agent-scoped evaluator name = f"config-{uuid.uuid4().hex}" - payload = _create_config_payload(name=name, plugin="agent:custom", config={}) + payload = _create_config_payload(name=name, evaluator="agent:custom", config={}) # When: Creating the evaluator config resp = client.post("/api/v1/evaluator-configs", json=payload) @@ -97,13 +97,13 @@ def test_create_evaluator_config_agent_scoped_rejected(client: TestClient) -> No assert resp.status_code == 422 data = resp.json() assert data["error_code"] == "VALIDATION_ERROR" - assert any(err.get("field") == "plugin" for err in data.get("errors", [])) + assert any(err.get("field") == "evaluator" for err in data.get("errors", [])) def test_create_evaluator_config_invalid_config_422(client: TestClient) -> None: - # Given: A payload with invalid config for regex plugin + # Given: A payload with invalid config for regex evaluator name = f"config-{uuid.uuid4().hex}" - payload = _create_config_payload(name=name, 
plugin="regex", config={"flags": ["IGNORECASE"]}) + payload = _create_config_payload(name=name, evaluator="regex", config={"flags": ["IGNORECASE"]}) # When: Creating the evaluator config resp = client.post("/api/v1/evaluator-configs", json=payload) @@ -128,7 +128,7 @@ def test_update_evaluator_config_replaces_fields_and_updates_timestamp( # When: Updating the evaluator config via PUT payload = _create_config_payload( name=f"{name}-v2", - plugin="regex", + evaluator="regex", config={"pattern": r"\b\d{4}\b"}, description="Updated description", ) @@ -157,7 +157,7 @@ def test_update_evaluator_config_name_conflict_409(client: TestClient) -> None: # When: Updating second to use first's name payload = _create_config_payload( name=first["name"], - plugin="regex", + evaluator="regex", config={"pattern": r"\btest\b"}, ) resp = client.put(f"/api/v1/evaluator-configs/{second['id']}", json=payload) @@ -186,15 +186,15 @@ def test_list_evaluator_configs_with_filters_and_pagination( ) -> None: # Given: Multiple evaluator configs base = f"config-{uuid.uuid4().hex}" - _create_config(client, name=f"{base}-a", plugin="regex") - _create_config(client, name=f"{base}-b", plugin="regex") - _create_config(client, name=f"{base}-c", plugin="regex") - _create_config(client, name=f"{base}-d", plugin="list") + _create_config(client, name=f"{base}-a", evaluator="regex") + _create_config(client, name=f"{base}-b", evaluator="regex") + _create_config(client, name=f"{base}-c", evaluator="regex") + _create_config(client, name=f"{base}-d", evaluator="list") - # When: Listing with limit and plugin filter + # When: Listing with limit and evaluator filter resp = client.get( "/api/v1/evaluator-configs", - params={"limit": 2, "plugin": "regex", "name": base}, + params={"limit": 2, "evaluator": "regex", "name": base}, ) # Then: Pagination metadata is correct @@ -203,7 +203,7 @@ def test_list_evaluator_configs_with_filters_and_pagination( assert data["pagination"]["limit"] == 2 assert 
data["pagination"]["has_more"] is True assert len(data["evaluator_configs"]) == 2 - assert all(cfg["plugin"] == "regex" for cfg in data["evaluator_configs"]) + assert all(cfg["evaluator"] == "regex" for cfg in data["evaluator_configs"]) def test_delete_evaluator_config_success(client: TestClient) -> None: @@ -236,7 +236,7 @@ def test_delete_evaluator_config_not_found(client: TestClient) -> None: def test_create_evaluator_config_empty_config_allowed(client: TestClient) -> None: # Given: A payload with an empty config object name = f"config-{uuid.uuid4().hex}" - payload = _create_config_payload(name=name, plugin="unknown-plugin", config={}) + payload = _create_config_payload(name=name, evaluator="unknown-evaluator", config={}) # When: Creating the evaluator config resp = client.post("/api/v1/evaluator-configs", json=payload) diff --git a/server/tests/test_evaluator_schemas.py b/server/tests/test_evaluator_schemas.py index 18dcb7bb..ef1b9e6d 100644 --- a/server/tests/test_evaluator_schemas.py +++ b/server/tests/test_evaluator_schemas.py @@ -58,7 +58,7 @@ def test_init_agent_with_evaluators(client: TestClient) -> None: def test_init_agent_evaluator_name_collision_rejected(client: TestClient) -> None: - """Test that evaluator names conflicting with built-in plugins are rejected.""" + """Test that evaluator names conflicting with built-in evaluators are rejected.""" # Given: Evaluator name conflicting with built-in payload = make_agent_payload( evaluators=[ @@ -74,7 +74,7 @@ def test_init_agent_evaluator_name_collision_rejected(client: TestClient) -> Non # Then: Should be rejected (RFC 7807 format) assert resp.status_code == 409 response_data = resp.json() - assert "conflicts with built-in plugin" in response_data.get("detail", "") + assert "conflicts with built-in evaluator" in response_data.get("detail", "") def test_init_agent_evaluator_name_collision_list(client: TestClient) -> None: diff --git a/server/tests/test_evaluator_utils.py 
b/server/tests/test_evaluator_utils.py index e18e6cb4..2ff0fc10 100644 --- a/server/tests/test_evaluator_utils.py +++ b/server/tests/test_evaluator_utils.py @@ -11,8 +11,8 @@ class TestParseEvaluatorRef: """Tests for parse_evaluator_ref function.""" - def test_builtin_plugin(self) -> None: - """Given a built-in plugin name, when parsing, then returns None for agent.""" + def test_builtin_evaluator(self) -> None: + """Given a built-in evaluator name, when parsing, then returns None for agent.""" # When agent, name = parse_evaluator_ref("regex") @@ -47,8 +47,8 @@ def test_empty_string(self) -> None: assert agent is None assert name == "" - def test_list_plugin(self) -> None: - """Given the list built-in plugin, when parsing, then returns None for agent.""" + def test_list_evaluator(self) -> None: + """Given the list built-in evaluator, when parsing, then returns None for agent.""" # When agent, name = parse_evaluator_ref("list") diff --git a/server/tests/test_new_features.py b/server/tests/test_new_features.py index b82a246e..5b813a1c 100644 --- a/server/tests/test_new_features.py +++ b/server/tests/test_new_features.py @@ -1,4 +1,4 @@ -"""Tests for new features: plugins endpoint, policy validation, PATCH agents.""" +"""Tests for new features: evaluators endpoint, policy validation, PATCH agents.""" import uuid @@ -29,23 +29,23 @@ def make_agent_payload( # ============================================================================= -# GET /plugins endpoint +# GET /evaluators endpoint # ============================================================================= -def test_get_plugins(client: TestClient) -> None: - """Given built-in plugins are registered, when listing plugins, then returns all with schemas.""" +def test_get_evaluators(client: TestClient) -> None: + """Given built-in evaluators are registered, when listing evaluators, then returns all with schemas.""" # When - resp = client.get("/api/v1/plugins") + resp = client.get("/api/v1/evaluators") # Then 
assert resp.status_code == 200 - plugins = resp.json() - assert isinstance(plugins, dict) - assert "regex" in plugins - assert "list" in plugins + evaluators = resp.json() + assert isinstance(evaluators, dict) + assert "regex" in evaluators + assert "list" in evaluators - regex = plugins["regex"] + regex = evaluators["regex"] assert regex["name"] == "regex" assert "version" in regex assert "description" in regex @@ -53,15 +53,15 @@ def test_get_plugins(client: TestClient) -> None: assert isinstance(regex["config_schema"], dict) -def test_get_plugins_schema_has_properties(client: TestClient) -> None: - """Given the regex plugin is registered, when listing plugins, then schema has pattern property.""" +def test_get_evaluators_schema_has_properties(client: TestClient) -> None: + """Given the regex evaluator is registered, when listing evaluators, then schema has pattern property.""" # When - resp = client.get("/api/v1/plugins") + resp = client.get("/api/v1/evaluators") # Then assert resp.status_code == 200 - plugins = resp.json() - regex_schema = plugins["regex"]["config_schema"] + evaluators = resp.json() + regex_schema = evaluators["regex"]["config_schema"] assert "properties" in regex_schema assert "pattern" in regex_schema["properties"] @@ -274,8 +274,8 @@ def _create_policy_with_control( return policy_id, control_id -def test_policy_assignment_with_builtin_plugin(client: TestClient) -> None: - """Given an agent and a policy with built-in plugin control, when assigning policy, then succeeds.""" +def test_policy_assignment_with_builtin_evaluator(client: TestClient) -> None: + """Given an agent and a policy with built-in evaluator control, when assigning policy, then succeeds.""" # Given agent_id = str(uuid.uuid4()) name = f"Test Agent {uuid.uuid4().hex[:8]}" @@ -290,7 +290,7 @@ def test_policy_assignment_with_builtin_plugin(client: TestClient) -> None: "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": 
"input"}, - "evaluator": {"plugin": "regex", "config": {"pattern": "test.*"}}, + "evaluator": {"name": "regex", "config": {"pattern": "test.*"}}, "action": {"decision": "deny"}, }, ) @@ -322,7 +322,7 @@ def test_policy_assignment_with_registered_agent_evaluator(client: TestClient) - "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, - "evaluator": {"plugin": f"{agent_name}:custom-eval", "config": {}}, + "evaluator": {"name": f"{agent_name}:custom-eval", "config": {}}, "action": {"decision": "deny"}, }, ) @@ -353,7 +353,7 @@ def test_control_creation_with_unregistered_evaluator_fails(client: TestClient) "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, - "evaluator": {"plugin": f"{agent_name}:nonexistent-eval", "config": {}}, + "evaluator": {"name": f"{agent_name}:nonexistent-eval", "config": {}}, "action": {"decision": "deny"}, } }, @@ -391,7 +391,7 @@ def test_policy_assignment_cross_agent_evaluator_fails(client: TestClient) -> No "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, - "evaluator": {"plugin": f"{agent_a_name}:shared-eval", "config": {}}, + "evaluator": {"name": f"{agent_a_name}:shared-eval", "config": {}}, "action": {"decision": "deny"}, }, ) @@ -553,7 +553,7 @@ def test_patch_agent_remove_evaluator_blocked_by_control(client: TestClient) -> "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, - "evaluator": {"plugin": f"{agent_name}:my-eval", "config": {}}, + "evaluator": {"name": f"{agent_name}:my-eval", "config": {}}, "action": {"decision": "deny"}, }, ) diff --git a/server/tests/test_observability_endpoints.py b/server/tests/test_observability_endpoints.py index 486f3a99..ef4872ba 100644 --- a/server/tests/test_observability_endpoints.py +++ b/server/tests/test_observability_endpoints.py 
@@ -178,7 +178,7 @@ def test_event_with_all_fields(self): confidence=0.99, timestamp=datetime.now(timezone.utc), execution_duration_ms=15.5, - evaluator_plugin="regex", + evaluator_name="regex", selector_path="input", error_message=None, metadata={"key": "value"}, diff --git a/server/tests/test_observability_models.py b/server/tests/test_observability_models.py index 961585ca..c48d69b8 100644 --- a/server/tests/test_observability_models.py +++ b/server/tests/test_observability_models.py @@ -184,13 +184,13 @@ def test_optional_fields(self): matched=False, confidence=0.5, execution_duration_ms=15.3, - evaluator_plugin="regex", + evaluator_name="regex", selector_path="input", error_message=None, metadata={"key": "value"}, ) assert event.execution_duration_ms == 15.3 - assert event.evaluator_plugin == "regex" + assert event.evaluator_name == "regex" assert event.selector_path == "input" assert event.metadata == {"key": "value"} diff --git a/server/tests/utils.py b/server/tests/utils.py index d76ddcd1..afb8b85a 100644 --- a/server/tests/utils.py +++ b/server/tests/utils.py @@ -10,7 +10,7 @@ "execution": "server", "scope": {"step_types": ["llm_inference"], "stages": ["pre"]}, "selector": {"path": "input"}, - "evaluator": {"plugin": "regex", "config": {"pattern": "x"}}, + "evaluator": {"name": "regex", "config": {"pattern": "x"}}, "action": {"decision": "deny"} } From 6ebfbce98c6422571ac3bf53f8bd956e00ff060f Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Wed, 28 Jan 2026 17:16:52 -0800 Subject: [PATCH 2/6] Add Deepeval evaluator example --- examples/deepeval/README.md | 412 ++++++++++++++++++ examples/deepeval/THIRD_PARTY_GUIDE.md | 357 +++++++++++++++ examples/deepeval/__init__.py | 14 + examples/deepeval/config.py | 138 ++++++ examples/deepeval/evaluator.py | 298 +++++++++++++ examples/deepeval/pyproject.toml | 43 ++ examples/deepeval/qa_agent.py | 372 ++++++++++++++++ examples/deepeval/setup_controls.py | 311 +++++++++++++ 
.../deepeval/start_server_with_evaluator.sh | 19 + 9 files changed, 1964 insertions(+) create mode 100644 examples/deepeval/README.md create mode 100644 examples/deepeval/THIRD_PARTY_GUIDE.md create mode 100644 examples/deepeval/__init__.py create mode 100644 examples/deepeval/config.py create mode 100644 examples/deepeval/evaluator.py create mode 100644 examples/deepeval/pyproject.toml create mode 100755 examples/deepeval/qa_agent.py create mode 100755 examples/deepeval/setup_controls.py create mode 100644 examples/deepeval/start_server_with_evaluator.sh diff --git a/examples/deepeval/README.md b/examples/deepeval/README.md new file mode 100644 index 00000000..c287b4aa --- /dev/null +++ b/examples/deepeval/README.md @@ -0,0 +1,412 @@ +# DeepEval GEval Evaluator Example + +This example demonstrates how to extend the agent-control `Evaluator` base class to create custom evaluators using external libraries like [DeepEval](https://deepeval.com). + +## Overview + +DeepEval's GEval is an LLM-as-a-judge metric that uses chain-of-thoughts (CoT) to evaluate LLM outputs based on custom criteria. This example shows how to: + +1. **Extend the base Evaluator class** - Create a custom evaluator by implementing the required interface +2. **Configure evaluation criteria** - Define custom quality metrics (coherence, relevance, correctness, etc.) +3. **Register via entry points** - Make the evaluator discoverable by the agent-control server +4. 
**Integrate with agent-control** - Use the evaluator in controls to enforce quality standards + +## Architecture + +``` +examples/deepeval/ +├── config.py # DeepEvalEvaluatorConfig - Configuration model +├── evaluator.py # DeepEvalEvaluator - Main evaluator implementation +├── qa_agent.py # Q&A agent with DeepEval controls +├── setup_controls.py # Setup script to create controls on server +├── pyproject.toml # Project config with entry point registration +├── README.md # This file +└── THIRD_PARTY_GUIDE.md # Complete guide for third-party developers +``` + +### Key Components + +1. **DeepEvalEvaluatorConfig** ([config.py](config.py)) + - Pydantic model defining configuration options + - Based on DeepEval's GEval API parameters + - Validates that either `criteria` or `evaluation_steps` is provided + +2. **DeepEvalEvaluator** ([evaluator.py](evaluator.py)) + - Extends `Evaluator[DeepEvalEvaluatorConfig]` + - Implements the `evaluate()` method + - Registered with `@register_evaluator` decorator + - Handles LLMTestCase creation and metric execution + +3. **Q&A Agent Demo** ([qa_agent.py](qa_agent.py)) + - Complete working agent with DeepEval quality controls + - Uses `@control()` decorator for automatic evaluation + - Demonstrates handling `ControlViolationError` + +4. **Setup Script** ([setup_controls.py](setup_controls.py)) + - Creates agent and registers with server + - Configures DeepEval-based controls + - Creates 3 quality controls (coherence, relevance, correctness) + +5. **Entry Point Registration** ([pyproject.toml](pyproject.toml)) + - Registers evaluator with server via `project.entry-points` + - Enables automatic discovery when server starts + - Critical for third-party evaluator integration + +## How It Works + +### 1. 
Extending the Evaluator Base Class + +The evaluator follows the standard pattern for all agent-control evaluators: + +```python +from agent_control_models import Evaluator, EvaluatorMetadata, register_evaluator + +@register_evaluator +class DeepEvalEvaluator(Evaluator[DeepEvalEvaluatorConfig]): + # Define metadata + metadata = EvaluatorMetadata( + name="deepeval-geval", + version="1.0.0", + description="DeepEval GEval custom LLM-based evaluator", + requires_api_key=True, + timeout_ms=30000, + ) + + # Define config model + config_model = DeepEvalEvaluatorConfig + + # Implement evaluate method + async def evaluate(self, data: Any) -> EvaluatorResult: + # matched=True triggers the deny action when quality fails + # matched=False allows the request when quality passes + return EvaluatorResult( + matched=not is_successful, # Trigger when quality fails + confidence=score, + message=reason, + ) +``` + +### 2. Entry Point Registration + +The evaluator is registered via `pyproject.toml`: + +```toml +[project.entry-points."agent_control.evaluators"] +deepeval-geval = "evaluator:DeepEvalEvaluator" +``` + +This makes the evaluator automatically discoverable by the server when it starts. + +### 3. Configuration + +DeepEval's GEval supports two modes: + +**With Criteria** (auto-generates evaluation steps): +```python +config = DeepEvalEvaluatorConfig( + name="Coherence", + criteria="Evaluate whether the response is coherent and logically consistent.", + evaluation_params=["input", "actual_output"], + threshold=0.6, +) +``` + +**With Explicit Steps**: +```python +config = DeepEvalEvaluatorConfig( + name="Correctness", + evaluation_steps=[ + "Check whether facts in actual output contradict expected output", + "Heavily penalize omission of critical details", + "Minor wording differences are acceptable" + ], + evaluation_params=["input", "actual_output", "expected_output"], + threshold=0.7, +) +``` + +### 4. 
Using in Control Definitions + +Once registered, the evaluator can be used in control definitions: + +```python +control_definition = { + "name": "check-coherence", + "description": "Ensures responses are coherent and logically consistent", + "definition": { + "description": "Ensures responses are coherent", + "enabled": True, + "execution": "server", + "scope": {"stages": ["post"]}, # Apply to all steps at post stage + "selector": {}, # Pass full data (input + output) + "evaluator": { + "name": "deepeval-geval", # From metadata.name + "config": { + "name": "Coherence", + "criteria": "Evaluate whether the response is coherent", + "evaluation_params": ["input", "actual_output"], + "threshold": 0.6, + "model": "gpt-4o", + }, + }, + "action": { + "decision": "deny", + "message": "Response failed coherence check", + }, + }, +} +``` + +**Key points:** +- `execution: "server"` - Required field +- `scope: {"stages": ["post"]}` - Apply to all function calls at post stage +- `selector: {}` - Pass full data so evaluator gets both input and output +- `evaluation_params: ["input", "actual_output"]` - Both fields required for relevance checks + +## Getting Started from Fresh Clone + +If you're starting from a fresh clone of the agent-control repository, follow these steps: + +### 1. Clone and Install Repository + +```bash +# Clone the repository +git clone https://github.com/rungalileo/agent-control.git +cd agent-control + +# Install all dependencies (installs models, engine, evaluators, sdk, server packages) +make sync +``` + +### 2. Start Database and Server + +```bash +# Start PostgreSQL database and run migrations +cd server && docker-compose up -d && make alembic-upgrade && cd .. + +# Start the agent-control server (from repository root) +make server-run +``` + +The server will be running at `http://localhost:8000`. + +### 3. 
Install DeepEval Example + +```bash +# Install the DeepEval example package with its dependencies +cd examples/deepeval +uv sync +``` + +This installs the evaluator package and makes it discoverable by the server via entry points. + +### 4. Set Environment Variables + +```bash +# Required for DeepEval GEval (uses OpenAI models) +export OPENAI_API_KEY="your-openai-api-key" + +# Optional: Disable DeepEval telemetry +export DEEPEVAL_TELEMETRY_OPT_OUT="true" +``` + +### 5. Restart Server + +After installing the DeepEval example, restart the server so it can discover the new evaluator: + +```bash +# Stop the server (Ctrl+C) and restart +cd ../../ # Back to repository root +make server-run +``` + +Verify the evaluator is registered: +```bash +curl http://localhost:8000/api/v1/evaluators | grep deepeval-geval +``` + +### 6. Setup Agent and Controls + +```bash +cd examples/deepeval +uv run setup_controls.py +``` + +This creates the agent registration and three quality controls (coherence, relevance, correctness). + +### 7. Run the Q&A Agent + +```bash +uv run qa_agent.py +``` + +Try asking questions like "What is Python?" or test the controls with "Tell me about something trigger_irrelevant". + +--- + +## Testing the Agent + +### Interactive Commands + +Once the agent is running, try these commands: + +``` +You: What is Python? +You: What is the capital of France? +You: Test trigger_incoherent response please +You: Tell me about something trigger_irrelevant +You: /test-good # Test with quality questions +You: /test-bad # Test quality control triggers +You: /help # Show all commands +You: /quit # Exit +``` + +The agent will: +- Accept questions with coherent, relevant responses +- Block questions that produce incoherent or irrelevant responses +- Show which control triggered when quality checks fail + +### What to Expect + +**Good Quality Responses** (Pass controls): +``` +You: What is Python? 
+Agent: Python is a high-level, interpreted programming language known for its + simplicity and readability. It was created by Guido van Rossum and first + released in 1991. Python supports multiple programming paradigms... +``` + +**Poor Quality Responses** (Blocked by controls): +``` +You: Test trigger_incoherent response please +⚠️ Quality control triggered: check-coherence + Reason: Response failed coherence check + +Agent: I apologize, but my response didn't meet quality standards. + Could you rephrase your question or ask something else? +``` + +The DeepEval controls evaluate responses in real-time and block those that don't meet quality thresholds. + +## Evaluation Parameters + +DeepEval supports multiple test case parameters: + +- `input` - The user query or prompt +- `actual_output` - The LLM's generated response +- `expected_output` - Reference/ground truth answer +- `context` - Additional context for evaluation +- `retrieval_context` - Retrieved documents (for RAG) +- `tools_called` - Tools invoked by the agent +- `expected_tools` - Expected tool usage +- Plus MCP-related parameters + +Configure which parameters to use via the `evaluation_params` config field. + +**Important:** For relevance checks, always include both `input` and `actual_output` so the evaluator can compare the question with the answer. + +## For Third-Party Developers + +See [THIRD_PARTY_GUIDE.md](THIRD_PARTY_GUIDE.md) for a complete step-by-step guide on creating and publishing your own custom evaluators. + +## Extending This Example + +### Creating Your Own Custom Evaluator + +Follow this pattern to create evaluators for other libraries: + +1. **Define a Config Model** + ```python + from pydantic import BaseModel + + class MyEvaluatorConfig(BaseModel): + threshold: float = 0.5 + # Your config fields + ``` + +2. 
**Implement the Evaluator** + ```python + from agent_control_models import Evaluator, EvaluatorMetadata, register_evaluator + + @register_evaluator + class MyEvaluator(Evaluator[MyEvaluatorConfig]): + metadata = EvaluatorMetadata(name="my-evaluator", ...) + config_model = MyEvaluatorConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + score = # Your evaluation logic + return EvaluatorResult( + matched=score < self.config.threshold, # Trigger when fails + confidence=score, + ) + ``` + +3. **Register via Entry Point** + ```toml + [project.entry-points."agent_control.evaluators"] + my-evaluator = "evaluator:MyEvaluator" + ``` + +4. **Install and Use** + ```bash + uv pip install -e . # Server will discover it automatically + ``` + +### Adding More GEval Metrics + +You can create specialized evaluators for specific use cases: + +- **Bias Detection**: Evaluate responses for bias or fairness +- **Safety**: Check for harmful or unsafe content +- **Style Compliance**: Ensure responses match brand guidelines +- **Technical Accuracy**: Validate technical correctness +- **Tone Assessment**: Evaluate emotional tone and sentiment + +## Resources + +- **DeepEval Documentation**: https://deepeval.com/docs/metrics-llm-evals +- **G-Eval Guide**: https://www.confident-ai.com/blog/g-eval-the-definitive-guide +- **Third-Party Developer Guide**: [THIRD_PARTY_GUIDE.md](THIRD_PARTY_GUIDE.md) +- **Agent Control Evaluators**: [Base evaluator class](../../models/src/agent_control_models/evaluator.py) + +## Key Takeaways + +1. **Entry Points are Critical**: The server discovers evaluators via `project.entry-points`, not PYTHONPATH +2. **Extensibility**: The `Evaluator` base class makes it easy to integrate any evaluation library +3. **Configuration**: Pydantic models provide type-safe, validated configuration +4. **Registration**: The `@register_evaluator` decorator handles registration automatically +5. 
**Integration**: Evaluators work seamlessly with agent-control's policy system +6. **Control Logic**: `matched=True` triggers the action (deny/allow), so invert when quality passes + +## Troubleshooting + +### Controls not triggering + +- Check that `execution: "server"` is in control definition +- Use `scope: {"stages": ["post"]}` instead of `step_types` +- Use empty selector `{}` to pass full data (input + output) +- Restart server after evaluator code changes + +### Evaluator not found + +- Verify entry point in `pyproject.toml` +- Run `uv sync` to install package +- Check server logs for evaluator discovery +- Confirm with: `curl http://localhost:8000/api/v1/evaluators` + +### Wrong evaluation results + +- For relevance: include both `input` and `actual_output` in `evaluation_params` +- Check that `matched` logic is inverted (trigger when quality fails) +- Raise the threshold to be more strict (0.7 instead of 0.5): success requires `score >= threshold` + +### DeepEval telemetry files + +- DeepEval creates a `.deepeval/` directory with telemetry files in the working directory +- When the evaluator runs on the server, files appear in `server/.deepeval/` +- These files don't need to be committed (add `.deepeval/` to `.gitignore`) +- To disable telemetry: set environment variable `DEEPEVAL_TELEMETRY_OPT_OUT="true"` + +## License + +This example is part of the agent-control project. diff --git a/examples/deepeval/THIRD_PARTY_GUIDE.md b/examples/deepeval/THIRD_PARTY_GUIDE.md new file mode 100644 index 00000000..d653912f --- /dev/null +++ b/examples/deepeval/THIRD_PARTY_GUIDE.md @@ -0,0 +1,357 @@ +# Third-Party Developer Guide: Creating Custom Evaluators + +This guide shows the complete, end-to-end process for third-party developers to create custom evaluators, register them with the agent-control server, and use them in their agents. + +## Overview + +The DeepEval example demonstrates the complete workflow: + +1. **Create the custom evaluator** - Extend the `Evaluator` base class +2.
**Register via entry points** - Make the evaluator discoverable by the server +3. **Install the package** - Install locally or publish to PyPI +4. **Verify server discovery** - Confirm the server recognizes your evaluator +5. **Create controls** - Define controls that use your evaluator +6. **Use in agents** - Apply controls to protect agent functions + +## Complete Workflow + +### Step 1: Create Your Custom Evaluator + +Create a Python package with these files: + +**`config.py`** - Define your evaluator's configuration: +```python +from pydantic import BaseModel, Field + +class MyEvaluatorConfig(BaseModel): + """Configuration for your custom evaluator.""" + threshold: float = Field(default=0.5, ge=0.0, le=1.0) + # Add your evaluator-specific config fields +``` + +**`evaluator.py`** - Implement your evaluator: +```python +from typing import Any +from agent_control_models import ( + Evaluator, + EvaluatorMetadata, + EvaluatorResult, + register_evaluator +) +from config import MyEvaluatorConfig + +@register_evaluator +class MyEvaluator(Evaluator[MyEvaluatorConfig]): + metadata = EvaluatorMetadata( + name="my-custom-evaluator", + version="1.0.0", + description="My custom evaluator", + requires_api_key=False, + timeout_ms=10000, + ) + config_model = MyEvaluatorConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + """Implement your evaluation logic.""" + score = 0.8 # Your evaluation logic here + + return EvaluatorResult( + matched=score < self.config.threshold, # Trigger action when quality fails + confidence=score, + metadata={"details": "evaluation details"} + ) +``` + +### Step 2: Create Entry Point in pyproject.toml + +**Critical Step**: The server discovers evaluators via entry points.
+ +**`pyproject.toml`**: +```toml +[project] +name = "my-custom-evaluator" +version = "1.0.0" +requires-python = ">=3.12" +dependencies = [ + "agent-control-models", + "agent-control-engine", + "agent-control-evaluators", + "pydantic>=2.0.0", + # your other dependencies +] + +# This is the critical section - registers your evaluator with the server +[project.entry-points."agent_control.evaluators"] +my-custom-evaluator = "evaluator:MyEvaluator" + +# For local development (if using path references) +[tool.uv.sources] +agent-control-models = { path = "../path/to/models", editable = true } +agent-control-engine = { path = "../path/to/engine", editable = true } +agent-control-evaluators = { path = "../path/to/evaluators", editable = true } + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" +``` + +**Key Points**: +- The entry point group MUST be `"agent_control.evaluators"` +- The entry point name should match your `metadata.name` +- Format: `"module:ClassName"` (e.g., `"evaluator:DeepEvalEvaluator"`) + +### Step 3: Install Your Package + +**For local development with uv** (recommended): +```bash +# Make sure dependencies are declared in pyproject.toml with path sources +cd /path/to/your/evaluator +uv sync # This installs all dependencies including agent-control packages +``` + +**Alternative - Manual installation**: +```bash +# Install your package in editable mode +uv pip install -e /path/to/your/evaluator + +# Also install agent-control dependencies +uv pip install -e /path/to/agent-control/models +uv pip install -e /path/to/agent-control/engine +uv pip install -e /path/to/agent-control/evaluators +``` + +**For published packages**: +```bash +pip install my-custom-evaluator +``` + +**Note**: When using `uv run` for scripts, uv will automatically create and use a project-specific virtual environment. Make sure your `pyproject.toml` includes the agent-control packages in dependencies. 
+ +### Step 4: Start Server and Verify Discovery + +**Start the server** (it will auto-discover evaluators): +```bash +uv run --package agent-control-server uvicorn agent_control_server.main:app --port 8000 +``` + +**Verify your evaluator is recognized**: +```bash +curl -s http://localhost:8000/api/v1/evaluators | grep "my-custom-evaluator" +``` + +Expected output: +```json +{ + "my-custom-evaluator": { + "name": "my-custom-evaluator", + "version": "1.0.0", + "description": "My custom evaluator", + "requires_api_key": false, + "timeout_ms": 10000, + "config_schema": {...} + } +} +``` + +⚠️ **If your evaluator doesn't appear**, check: +- Entry point is correctly defined in `pyproject.toml` +- Package is installed (`pip list | grep my-custom-evaluator`) +- Server logs for any import errors +- Your evaluator's `is_available()` returns `True` + +### Step 5: Create Controls Using Your Evaluator + +**`setup_controls.py`**: +```python +import asyncio +import httpx + +async def create_controls(): + async with httpx.AsyncClient(base_url="http://localhost:8000") as client: + # Register agent + await client.post("/api/v1/agents/initAgent", json={ + "agent": { + "agent_id": "your-agent-uuid", + "agent_name": "My Agent", + "agent_description": "Agent with custom evaluator" + }, + "tools": [] + }) + + # Create control using your evaluator + response = await client.post("/api/v1/agents/your-agent-uuid/controls", json={ + "name": "my-custom-check", + "definition": { + "description": "My custom quality check", + "enabled": True, + "execution": "server", # REQUIRED field + "scope": { + "step_types": ["llm_inference"], + "stages": ["post"] + }, + "selector": {"path": "output"}, + "evaluator": { + "name": "my-custom-evaluator", # Must match metadata.name + "config": { + "threshold": 0.7 + } + }, + "action": { + "decision": "deny", + "message": "Failed custom check" + } + } + }) + print(f"Control created: {response.json()}") + +if __name__ == "__main__": + 
asyncio.run(create_controls())
+```
+
+**Important control definition fields**:
+- `execution`: REQUIRED - must be `"server"`
+- `scope`: Defines when the control applies (replaces old `applies_to`/`check_stage`)
+- `evaluator.name`: Must match your evaluator's `metadata.name`
+- `evaluator.config`: Must match your `EvaluatorConfig` schema
+
+### Step 6: Use Controls in Your Agent
+
+**`my_agent.py`**:
+```python
+import asyncio
+
+import agent_control
+from agent_control import ControlViolationError, control
+
+# Initialize agent (agent_id must match the agent the controls were created for in Step 5)
+agent_control.init(
+    agent_name="My Agent",
+    agent_id="your-agent-uuid",
+    agent_description="Agent with custom evaluator",
+    agent_version="1.0.0"
+)
+
+@control()
+async def my_protected_function(input_data: str) -> str:
+    """Function protected by your custom evaluator."""
+    result = f"Processed: {input_data}"
+    return result
+
+async def main():
+    try:
+        result = await my_protected_function("test input")
+        print(f"✓ Success: {result}")
+    except ControlViolationError as e:
+        print(f"❌ Control violation: {e}")
+
+if __name__ == "__main__":
+    asyncio.run(main())
+```
+
+## Complete Example: DeepEval
+
+See the DeepEval example in this directory for a complete, working implementation:
+
+```
+examples/deepeval/
+├── config.py           # DeepEvalEvaluatorConfig
+├── evaluator.py        # DeepEvalEvaluator
+├── pyproject.toml      # Entry point registration
+├── setup_controls.py   # Creates controls on server
+├── qa_agent.py         # Q&A agent using controls
+└── README.md           # Full documentation
+```
+
+**Key files to study**:
+1. [pyproject.toml](./pyproject.toml#L20-L21) - Entry point registration
+2. [evaluator.py](./evaluator.py) - Complete evaluator implementation
+3. 
[setup_controls.py](./setup_controls.py#L54-L143) - Control definitions + +## Common Issues and Solutions + +### Issue 1: Server doesn't recognize evaluator + +**Symptoms**: `/api/v1/evaluators` doesn't show your evaluator + +**Solutions**: +- Verify entry point in `pyproject.toml`: `[project.entry-points."agent_control.evaluators"]` +- Reinstall package: `uv pip install -e .` +- Restart server to trigger discovery +- Check `is_available()` returns `True` + +### Issue 2: 422 Validation Error when creating controls + +**Symptoms**: `Field required: data.execution` + +**Solutions**: +- Add `"execution": "server"` to control definition +- Use `scope` instead of `applies_to`/`check_stage` +- Ensure `evaluator.config` matches your config schema + +### Issue 3: Import errors in evaluator + +**Symptoms**: `cannot import name 'Evaluator' from 'agent_control_models'` + +**Solutions**: +- Add agent-control packages to `dependencies` in `pyproject.toml` +- Add `[tool.uv.sources]` with path references to local packages +- Run `uv sync` to install all dependencies +- Alternatively: `uv pip install -e path/to/models -e path/to/engine -e path/to/evaluators` + +### Issue 4: uv virtual environment mismatch + +**Symptoms**: `warning: VIRTUAL_ENV=... does not match the project environment path .venv` + +**Explanation**: This is informational, not an error. `uv run` creates a project-specific `.venv` which is correct behavior. + +**Solutions**: +- This warning can be safely ignored +- Ensure your `pyproject.toml` has correct dependencies and sources +- Run `uv sync` to ensure project venv has all packages + +## Best Practices + +1. **Entry Points are Essential**: The server ONLY discovers evaluators via entry points, not PYTHONPATH +2. **Match metadata.name**: Entry point key should match `metadata.name` in your evaluator +3. **Control Definition Structure**: Always include `execution` and `scope` fields +4. 
**Validation**: The server validates control configs against your `config_model` schema +5. **Dependencies**: List all dependencies in `pyproject.toml`, including `agent-control-models` + +## Testing Your Evaluator + +1. **Unit tests**: Test your `evaluate()` method directly +2. **Integration tests**: Create controls and test with the server +3. **End-to-end**: Run a complete agent with your controls + +## Publishing Your Evaluator + +To share your evaluator with others: + +1. **Publish to PyPI**: + ```bash + python -m build + twine upload dist/* + ``` + +2. **Users install your package**: + ```bash + pip install my-custom-evaluator + ``` + +3. **Server auto-discovers** via entry points when it starts + +4. **Users create controls** using your evaluator name + +## Summary + +The complete workflow for third-party developers: + +```mermaid +graph TD + A[Create Evaluator Class] --> B[Add Entry Point] + B --> C[Install Package] + C --> D[Start Server] + D --> E[Verify Discovery] + E --> F[Create Controls] + F --> G[Use in Agent] +``` + +**Key Takeaway**: Entry points are the critical mechanism that makes custom evaluators work with the agent-control server. Without proper entry point registration, the server will not discover your evaluator. diff --git a/examples/deepeval/__init__.py b/examples/deepeval/__init__.py new file mode 100644 index 00000000..242ac048 --- /dev/null +++ b/examples/deepeval/__init__.py @@ -0,0 +1,14 @@ +"""DeepEval GEval evaluator example. + +This module demonstrates how to extend the base Evaluator class to create +custom evaluators using external libraries like DeepEval. 
+""" + +from .config import DeepEvalEvaluatorConfig, DeepEvalTestCaseParam +from .evaluator import DeepEvalEvaluator + +__all__ = [ + "DeepEvalEvaluator", + "DeepEvalEvaluatorConfig", + "DeepEvalTestCaseParam", +] diff --git a/examples/deepeval/config.py b/examples/deepeval/config.py new file mode 100644 index 00000000..5c592667 --- /dev/null +++ b/examples/deepeval/config.py @@ -0,0 +1,138 @@ +"""Configuration models for DeepEval GEval evaluator. + +Based on DeepEval's GEval metric: https://deepeval.com/docs/metrics-llm-evals +""" + +from typing import Any, Literal + +from pydantic import BaseModel, Field, model_validator + + +# DeepEval's LLMTestCaseParams enum values +DeepEvalTestCaseParam = Literal[ + "input", + "actual_output", + "expected_output", + "context", + "retrieval_context", + "tools_called", + "expected_tools", + "mcp_servers", + "mcp_tools_called", + "mcp_resources_called", + "mcp_prompts_called", +] + + +class DeepEvalEvaluatorConfig(BaseModel): + """Configuration for DeepEval GEval evaluator. + + DeepEval's GEval uses LLM-as-a-judge with chain-of-thoughts (CoT) to evaluate + LLM outputs based on custom criteria. It's capable of evaluating almost any + use case with human-like accuracy. 
+ + Example (with criteria): + ```python + config = DeepEvalEvaluatorConfig( + name="Correctness", + criteria="Determine if the actual output is correct based on the expected output.", + evaluation_params=["actual_output", "expected_output"], + threshold=0.5, + ) + ``` + + Example (with evaluation_steps): + ```python + config = DeepEvalEvaluatorConfig( + name="Correctness", + evaluation_steps=[ + "Check whether facts in actual output contradict expected output", + "Heavily penalize omission of detail", + "Vague language or contradicting opinions are acceptable" + ], + evaluation_params=["actual_output", "expected_output"], + threshold=0.5, + ) + ``` + """ + + name: str = Field( + description="Name identifier for the custom metric (e.g., 'Correctness', 'Relevance')" + ) + + criteria: str | None = Field( + default=None, + description="Description outlining the specific evaluation aspects. Either provide criteria OR evaluation_steps, not both.", + ) + + evaluation_steps: list[str] | None = Field( + default=None, + description="Specific steps the LLM should follow during evaluation. If omitted with criteria, will be auto-generated. Either provide criteria OR evaluation_steps, not both.", + ) + + evaluation_params: list[DeepEvalTestCaseParam] = Field( + description="List of test case parameters to include in evaluation (e.g., ['input', 'actual_output'])" + ) + + threshold: float = Field( + default=0.5, + ge=0.0, + le=1.0, + description="Passing threshold (0-1). Metric is successful if score >= threshold.", + ) + + model: str = Field( + default="gpt-4o", + description="GPT model to use for evaluation (e.g., 'gpt-4o', 'gpt-4-turbo', 'gpt-3.5-turbo')", + ) + + strict_mode: bool = Field( + default=False, + description="If True, enforces binary scoring (0 or 1). 
If False, returns scores in 0-1 range.", + ) + + async_mode: bool = Field( + default=True, + description="Enable concurrent execution for better performance.", + ) + + verbose_mode: bool = Field( + default=False, + description="Print intermediate calculation steps for debugging.", + ) + + timeout_ms: int = Field( + default=30000, + ge=1000, + le=120000, + description="Request timeout in milliseconds (1-120 seconds)", + ) + + on_error: Literal["allow", "deny"] = Field( + default="allow", + description="Action on error: 'allow' (fail open) or 'deny' (fail closed)", + ) + + metadata: dict[str, Any] | None = Field( + default=None, + description="Additional metadata for logging/tracking", + ) + + @model_validator(mode="after") + def validate_criteria_or_steps(self) -> "DeepEvalEvaluatorConfig": + """Validate that either criteria or evaluation_steps is provided, but not both.""" + has_criteria = self.criteria is not None + has_steps = self.evaluation_steps is not None and len(self.evaluation_steps) > 0 + + if not has_criteria and not has_steps: + raise ValueError( + "Either 'criteria' or 'evaluation_steps' must be provided" + ) + + if has_criteria and has_steps: + raise ValueError( + "Provide either 'criteria' OR 'evaluation_steps', not both. " + "If you provide criteria, evaluation_steps will be auto-generated." + ) + + return self diff --git a/examples/deepeval/evaluator.py b/examples/deepeval/evaluator.py new file mode 100644 index 00000000..ce8e7c31 --- /dev/null +++ b/examples/deepeval/evaluator.py @@ -0,0 +1,298 @@ +"""DeepEval GEval evaluator implementation. + +This evaluator demonstrates how to extend the base Evaluator class to integrate +DeepEval's GEval metric for custom LLM-based evaluations. 
+ +Based on DeepEval documentation: https://deepeval.com/docs/metrics-llm-evals +""" + +import logging +from typing import Any + +from agent_control_models import ( + Evaluator, + EvaluatorMetadata, + EvaluatorResult, + register_evaluator, +) + +# Import config - handle both relative and absolute imports +try: + from .config import DeepEvalEvaluatorConfig +except ImportError: + from config import DeepEvalEvaluatorConfig + +logger = logging.getLogger(__name__) + +# Check if deepeval is available +try: + from deepeval.metrics import GEval + from deepeval.test_case import LLMTestCase, LLMTestCaseParams + + DEEPEVAL_AVAILABLE = True +except ImportError: + DEEPEVAL_AVAILABLE = False + GEval = None # type: ignore + LLMTestCase = None # type: ignore + LLMTestCaseParams = None # type: ignore + + +@register_evaluator +class DeepEvalEvaluator(Evaluator[DeepEvalEvaluatorConfig]): + """DeepEval GEval evaluator for custom LLM-based evaluations. + + This evaluator uses DeepEval's GEval metric, which leverages LLM-as-a-judge + with chain-of-thoughts (CoT) to evaluate LLM outputs based on custom criteria. + + Features: + - Custom evaluation criteria or step-by-step evaluation logic + - Multiple test case parameters (input, output, context, etc.) + - Configurable LLM model for judging + - Binary or continuous scoring modes + - Automatic chain-of-thought generation + + Example: + ```python + from examples.deepeval import DeepEvalEvaluator, DeepEvalEvaluatorConfig + + # Create config + config = DeepEvalEvaluatorConfig( + name="Coherence", + criteria="Determine if the response is coherent and logically consistent.", + evaluation_params=["actual_output"], + threshold=0.7, + model="gpt-4o", + ) + + # Create evaluator + evaluator = DeepEvalEvaluator(config) + + # Evaluate + result = await evaluator.evaluate({ + "actual_output": "The sky is blue because of Rayleigh scattering." + }) + ``` + + Environment Variables: + OPENAI_API_KEY: Required for GPT model usage. 
+ """ + + metadata = EvaluatorMetadata( + name="deepeval-geval", + version="1.0.0", + description="DeepEval GEval custom LLM-based evaluator", + requires_api_key=True, + timeout_ms=30000, + ) + config_model = DeepEvalEvaluatorConfig + + @classmethod + def is_available(cls) -> bool: + """Check if deepeval dependency is installed.""" + return DEEPEVAL_AVAILABLE + + def __init__(self, config: DeepEvalEvaluatorConfig) -> None: + """Initialize DeepEval evaluator with configuration. + + Args: + config: Validated DeepEvalEvaluatorConfig instance. + + Raises: + ValueError: If required configuration is invalid. + """ + super().__init__(config) + + # Create the GEval metric instance (immutable, safe for instance caching) + self._metric = self._create_geval_metric() + + def _create_geval_metric(self) -> Any: + """Create and configure the GEval metric. + + Returns: + Configured GEval metric instance. + """ + # Convert string evaluation params to LLMTestCaseParams enum + evaluation_params = [ + getattr(LLMTestCaseParams, param.upper()) + for param in self.config.evaluation_params + ] + + # Build GEval kwargs + geval_kwargs = { + "name": self.config.name, + "evaluation_params": evaluation_params, + "threshold": self.config.threshold, + "model": self.config.model, + "strict_mode": self.config.strict_mode, + "async_mode": self.config.async_mode, + "verbose_mode": self.config.verbose_mode, + } + + # Add either criteria or evaluation_steps + if self.config.criteria: + geval_kwargs["criteria"] = self.config.criteria + elif self.config.evaluation_steps: + geval_kwargs["evaluation_steps"] = self.config.evaluation_steps + + logger.debug(f"[DeepEval] Creating GEval metric with config: {geval_kwargs}") + return GEval(**geval_kwargs) + + async def evaluate(self, data: Any) -> EvaluatorResult: + """Evaluate data using DeepEval GEval. + + Args: + data: The data to evaluate. Should be a dict with keys matching + the evaluation_params (e.g., {"actual_output": "text"}). 
+ + Returns: + EvaluatorResult with matched status and metadata. + """ + try: + logger.debug(f"[DeepEval] Evaluating data: {data}") + + # Prepare test case from data + test_case = self._prepare_test_case(data) + + # Run the GEval metric + if self.config.async_mode: + await self._metric.a_measure(test_case) + else: + self._metric.measure(test_case) + + # Parse the results + result = self._parse_metric_result() + + logger.debug( + f"[DeepEval] Evaluation complete: matched={result.matched}, " + f"score={result.confidence}, reason={result.message}" + ) + + return result + + except Exception as e: + logger.error(f"DeepEval evaluation error: {e}", exc_info=True) + return self._handle_error(e) + + def _prepare_test_case(self, data: Any) -> Any: + """Prepare LLMTestCase from input data. + + Args: + data: Input data dict with test case parameters. + May contain: input, output, question, actual_output, etc. + + Returns: + LLMTestCase instance. + """ + # Handle both dict and string inputs + if isinstance(data, str): + # If data is a string, treat it as actual_output by default + data = {"actual_output": data} + elif not isinstance(data, dict): + data = {"actual_output": str(data)} + + # Map agent-control data structure to DeepEval LLMTestCase parameters + # Agent-control may provide: {"input": {...}, "output": "..."} + # DeepEval expects: {"input": "...", "actual_output": "..."} + mapped_data = {} + + # Handle output mapping + if "actual_output" in data: + mapped_data["actual_output"] = data["actual_output"] + elif "output" in data: + mapped_data["actual_output"] = data["output"] + + # Handle input mapping + if "input" in data: + input_val = data["input"] + # If input is a dict (e.g., function arguments), extract the question + if isinstance(input_val, dict): + # Try common field names + mapped_data["input"] = ( + input_val.get("question") or + input_val.get("query") or + input_val.get("prompt") or + str(input_val) + ) + else: + mapped_data["input"] = str(input_val) + elif 
"question" in data:
+            mapped_data["input"] = data["question"]
+
+        # Handle other DeepEval parameters (LLMTestCase uses `tools_called`, not `tools`)
+        for key in ["expected_output", "context", "retrieval_context", "tools_called"]:
+            if key in data:
+                mapped_data[key] = data[key]
+
+        # Build test case kwargs
+        # Note: LLMTestCase requires 'input' and 'actual_output' as mandatory fields
+        # So we always provide them, even if not in evaluation_params
+        test_case_kwargs = {}
+
+        # Always include mandatory fields for LLMTestCase
+        test_case_kwargs["input"] = mapped_data.get("input", "")
+        test_case_kwargs["actual_output"] = mapped_data.get("actual_output", "")
+
+        # Add any additional params from evaluation_params
+        for param in self.config.evaluation_params:
+            if param not in test_case_kwargs:  # Skip if already added above
+                if param in mapped_data:
+                    test_case_kwargs[param] = mapped_data[param]
+                else:
+                    logger.warning(f"[DeepEval] Missing parameter '{param}', using empty string")
+                    test_case_kwargs[param] = ""
+
+        logger.debug(f"[DeepEval] Original data keys: {list(data.keys())}")
+        logger.debug(f"[DeepEval] Mapped data keys: {list(mapped_data.keys())}")
+        logger.debug(f"[DeepEval] Test case kwargs: {test_case_kwargs}")
+        return LLMTestCase(**test_case_kwargs)
+
+    def _parse_metric_result(self) -> EvaluatorResult:
+        """Parse GEval metric results into EvaluatorResult.
+
+        Returns:
+            EvaluatorResult with evaluation results.
+ """ + # Get score and reason from the metric + score = self._metric.score + reason = self._metric.reason + is_successful = self._metric.is_successful() + + # NOTE: matched=True means the control should trigger (block the request) + # In DeepEval, is_successful=True means quality is GOOD (score >= threshold) + # So we want to trigger (matched=True) when quality is BAD (not is_successful) + return EvaluatorResult( + matched=not is_successful, # Invert: trigger when quality fails + confidence=score if score is not None else 0.0, + message=reason if reason else f"GEval {self.config.name}: score={score}", + metadata={ + "metric_name": self.config.name, + "score": score, + "threshold": self.config.threshold, + "model": self.config.model, + "strict_mode": self.config.strict_mode, + **(self.config.metadata or {}), + }, + ) + + def _handle_error(self, error: Exception) -> EvaluatorResult: + """Handle errors from DeepEval evaluation. + + Args: + error: The exception that occurred. + + Returns: + EvaluatorResult indicating error state. 
+ """ + error_action = self.config.on_error + + return EvaluatorResult( + matched=(error_action == "deny"), # Fail closed if configured + confidence=0.0, + message=f"DeepEval evaluation error: {str(error)}", + metadata={ + "error": str(error), + "error_type": type(error).__name__, + "metric_name": self.config.name, + "fallback_action": error_action, + }, + ) diff --git a/examples/deepeval/pyproject.toml b/examples/deepeval/pyproject.toml new file mode 100644 index 00000000..3294bc19 --- /dev/null +++ b/examples/deepeval/pyproject.toml @@ -0,0 +1,43 @@ +[project] +name = "agent-control-deepeval-example" +version = "0.1.0" +description = "Agent Control DeepEval GEval Custom Evaluator Example" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "deepeval>=1.0.0", + "openai>=1.0.0", + "pydantic>=2.0.0", + "httpx>=0.24.0", + "google-re2>=1.1", + "agent-control-models", + "agent-control-engine", + "agent-control-evaluators", + "agent-control-sdk", +] + +[project.optional-dependencies] +dev = [] + +[project.entry-points."agent_control.evaluators"] +deepeval-geval = "evaluator:DeepEvalEvaluator" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["."] + +[tool.uv.sources] +agent-control-models = { path = "../../models", editable = true } +agent-control-engine = { path = "../../engine", editable = true } +agent-control-evaluators = { path = "../../evaluators", editable = true } +agent-control-sdk = { path = "../../sdks/python", editable = true } + +[tool.ruff] +line-length = 100 +target-version = "py312" + +[tool.ruff.lint] +select = ["E", "F", "I"] diff --git a/examples/deepeval/qa_agent.py b/examples/deepeval/qa_agent.py new file mode 100755 index 00000000..63a205f7 --- /dev/null +++ b/examples/deepeval/qa_agent.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 +""" +Question Answering Agent with DeepEval Quality Controls + +This example demonstrates: +1. 
Using agent-control SDK with @control() decorator +2. DeepEval GEval evaluators for quality enforcement +3. Handling ControlViolationError gracefully + +The agent is protected by DeepEval-based controls that check: +- Response coherence (logical consistency) +- Answer relevance (stays on topic) +- Factual correctness (when expected outputs available) + +Usage: + # Setup first (creates controls on server) + python setup_controls.py + + # Then run the agent + python qa_agent.py + +Requirements: + - agent-control server running + - OPENAI_API_KEY set (for DeepEval) + - Controls configured via setup_controls.py +""" + +import asyncio +import os +import sys + +import agent_control +from agent_control import ControlViolationError, control + +# ============================================================================= +# SDK INITIALIZATION +# ============================================================================= + +agent_control.init( + agent_name="Q&A Agent with DeepEval", + agent_id="qa-agent-deepeval", + agent_description="Question answering agent with DeepEval quality controls", + agent_version="1.0.0", +) + + +# ============================================================================= +# MOCK LLM (Simulates various quality scenarios) +# ============================================================================= + + +class MockQASystem: + """ + Simulates a Q&A system with various response quality scenarios. + + This mock helps demonstrate how DeepEval controls catch quality issues: + - Coherent responses pass + - Incoherent responses are blocked + - Irrelevant responses are blocked + """ + + GOOD_RESPONSES = { + "python": ( + "Python is a high-level, interpreted programming language known for its " + "simplicity and readability. It was created by Guido van Rossum and first " + "released in 1991. Python supports multiple programming paradigms including " + "procedural, object-oriented, and functional programming." 
+ ), + "capital": ( + "Paris is the capital and largest city of France. It is located in the " + "north-central part of the country along the River Seine. Paris has been " + "a major center of culture, art, and politics for centuries." + ), + "photosynthesis": ( + "Photosynthesis is the process by which plants convert light energy into " + "chemical energy. Using chlorophyll, plants absorb sunlight and combine " + "carbon dioxide from the air with water from the soil to produce glucose " + "and oxygen. This process is essential for life on Earth." + ), + "gravity": ( + "Gravity is a fundamental force of nature that attracts objects with mass " + "toward each other. On Earth, gravity gives objects weight and causes them " + "to fall toward the ground. The force of gravity was described by Newton's " + "laws and later refined by Einstein's theory of general relativity." + ), + } + + # Incoherent responses (logical inconsistencies, contradictions) + INCOHERENT_RESPONSES = { + "trigger_incoherent": ( + "Python is a snake. Also Python is not a snake. " + "It's both simultaneously. Yesterday is tomorrow. " + "The sky is made of cheese but also not cheese. " + "Numbers are letters and letters are numbers." + ), + } + + # Irrelevant responses (don't answer the question) + IRRELEVANT_RESPONSES = { + "trigger_irrelevant": ( + "Bananas are yellow fruits that grow on trees. " + "The weather today is sunny. I like pizza. " + "Dogs are mammals. The year has 12 months." 
+ ), + } + + @classmethod + def answer_question(cls, question: str) -> str: + """Generate an answer to the question.""" + question_lower = question.lower() + + # Check for test triggers + if "trigger_incoherent" in question_lower or "incoherent" in question_lower: + return cls.INCOHERENT_RESPONSES["trigger_incoherent"] + + if "trigger_irrelevant" in question_lower or "irrelevant" in question_lower: + return cls.IRRELEVANT_RESPONSES["trigger_irrelevant"] + + # Match question to good responses + if "python" in question_lower: + return cls.GOOD_RESPONSES["python"] + elif "capital" in question_lower and "france" in question_lower: + return cls.GOOD_RESPONSES["capital"] + elif "photosynthesis" in question_lower: + return cls.GOOD_RESPONSES["photosynthesis"] + elif "gravity" in question_lower: + return cls.GOOD_RESPONSES["gravity"] + else: + # Default educational response + return ( + f"That's an interesting question about '{question}'. " + "Based on general knowledge, I can provide information on this topic. " + "Would you like me to explain in more detail?" + ) + + +# ============================================================================= +# PROTECTED AGENT FUNCTION +# ============================================================================= + + +@control() +async def answer_question(question: str) -> str: + """ + Answer a question with quality controls. + + The @control() decorator: + - Checks 'pre' controls before generating (validates input) + - Checks 'post' controls after generating (validates output quality) + + DeepEval controls check: + - Coherence: Is the response logically consistent? + - Relevance: Does it address the question? + - Correctness: Is it factually accurate? (if enabled) + + If a control fails, ControlViolationError is raised. 
+ """ + response = MockQASystem.answer_question(question) + return response + + +# ============================================================================= +# Q&A AGENT CLASS +# ============================================================================= + + +class QAAgent: + """ + Question answering agent with DeepEval quality controls. + + Demonstrates graceful error handling when quality controls fail. + """ + + def __init__(self): + self.conversation_history: list[dict[str, str]] = [] + + async def ask(self, question: str) -> str: + """ + Ask a question and get an answer. + + Handles ControlViolationError gracefully by returning + a helpful message instead of exposing internal errors. + """ + self.conversation_history.append({"role": "user", "content": question}) + + try: + # Get answer - protected by DeepEval controls + answer = await answer_question(question) + + self.conversation_history.append({"role": "assistant", "content": answer}) + return answer + + except ControlViolationError as e: + # Control triggered - return helpful feedback + fallback = ( + f"I apologize, but my response didn't meet quality standards. " + f"({e.control_name})\n\n" + f"Could you rephrase your question or ask something else?" 
+ ) + self.conversation_history.append({"role": "assistant", "content": fallback}) + print(f"\n⚠️ Quality control triggered: {e.control_name}") + print(f" Reason: {e.message}") + return fallback + + +# ============================================================================= +# INTERACTIVE MODE +# ============================================================================= + + +def print_header(): + """Print the demo header.""" + print() + print("=" * 70) + print(" Q&A Agent with DeepEval Quality Controls") + print("=" * 70) + print() + print("This agent uses DeepEval GEval to enforce response quality:") + print(" ✓ Coherence - Responses must be logically consistent") + print(" ✓ Relevance - Answers must address the question") + print(" ○ Correctness - Factual accuracy (disabled by default)") + print() + print("Commands:") + print(" /test-good Test with high-quality questions") + print(" /test-bad Test quality control triggers") + print(" /help Show this help") + print(" /quit Exit") + print() + print("Or just type a question!") + print("-" * 70) + print() + + +def print_help(): + """Print help information.""" + print() + print("Available Commands:") + print(" /test-good Test with questions that produce quality answers") + print(" /test-bad Test questions that trigger quality controls") + print(" /help Show this help message") + print(" /quit or /exit Exit the program") + print() + print("Or ask any question and see how DeepEval evaluates quality!") + print() + + +async def run_good_tests(agent: QAAgent): + """Run tests with good quality responses.""" + print("\n" + "=" * 70) + print("Testing Good Quality Responses") + print("=" * 70) + print("\nThese should pass all quality controls.\n") + + test_questions = [ + "What is Python?", + "What is the capital of France?", + "How does photosynthesis work?", + "What is gravity?", + ] + + for question in test_questions: + print(f"Q: {question}") + answer = await agent.ask(question) + print(f"A: {answer[:150]}...") + 
print() + + +async def run_bad_tests(agent: QAAgent): + """Run tests that should trigger quality controls.""" + print("\n" + "=" * 70) + print("Testing Quality Control Triggers") + print("=" * 70) + print("\nThese should trigger DeepEval controls.\n") + + test_questions = [ + "Test trigger_incoherent response please", # Should fail coherence + "Tell me about something trigger_irrelevant", # Should fail relevance + ] + + for question in test_questions: + print(f"Q: {question}") + answer = await agent.ask(question) + print(f"A: {answer}") + print() + + +async def run_interactive(agent: QAAgent): + """Run interactive mode.""" + print_header() + + while True: + try: + user_input = input("You: ").strip() + except (KeyboardInterrupt, EOFError): + print("\nGoodbye!") + break + + if not user_input: + continue + + # Handle commands + if user_input.startswith("/"): + command = user_input.lower().split()[0] + + if command in ("/quit", "/exit"): + print("Goodbye!") + break + + elif command == "/help": + print_help() + + elif command == "/test-good": + await run_good_tests(agent) + + elif command == "/test-bad": + await run_bad_tests(agent) + + else: + print(f"Unknown command: {command}") + print("Type /help for available commands") + + else: + # Regular question + answer = await agent.ask(user_input) + print(f"\nAgent: {answer}\n") + + +# ============================================================================= +# MAIN +# ============================================================================= + + +async def main(): + """Run the Q&A agent.""" + # Check for OPENAI_API_KEY + if not os.getenv("OPENAI_API_KEY"): + print("\n⚠️ Warning: OPENAI_API_KEY not set!") + print(" DeepEval requires OpenAI API access for GEval.") + print(" Set it with: export OPENAI_API_KEY='your-key'") + print() + response = input("Continue anyway? (y/N): ").strip().lower() + if response != "y": + print("Exiting. 
Set OPENAI_API_KEY and try again.") + sys.exit(1) + + # Check server connection + server_url = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000") + print(f"\nConnecting to agent-control server at {server_url}...") + + import httpx + + try: + async with httpx.AsyncClient() as client: + resp = await client.get(f"{server_url}/health", timeout=5.0) + resp.raise_for_status() + print("✓ Connected to server") + except Exception as e: + print(f"\n❌ Cannot connect to server: {e}") + print(" Make sure the agent-control server is running.") + print(" Run setup_controls.py first to configure the agent.") + sys.exit(1) + + # Create and run agent + agent = QAAgent() + await run_interactive(agent) + + +if __name__ == "__main__": + try: + asyncio.run(main()) + except KeyboardInterrupt: + print("\n\nInterrupted. Goodbye!") diff --git a/examples/deepeval/setup_controls.py b/examples/deepeval/setup_controls.py new file mode 100755 index 00000000..ec1f37da --- /dev/null +++ b/examples/deepeval/setup_controls.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 +""" +Setup script that creates DeepEval-based controls for the Q&A Agent. + +This script: +1. Registers the agent with the server +2. Creates DeepEval GEval evaluator controls for quality checks +3. Creates a policy and attaches controls +4. Assigns the policy to the agent + +The controls demonstrate using DeepEval's LLM-as-a-judge to enforce: +- Response coherence +- Answer relevance +- Factual correctness + +Run this after starting the server to have a working demo. 
+""" + +import asyncio +import os +import sys +import uuid + +import httpx + +# Add the current directory to the path so we can import the evaluator +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +# Import and register the DeepEval evaluator +# This must be done before creating controls that use it +try: + from evaluator import DeepEvalEvaluator + + print(f"✓ DeepEval evaluator loaded: {DeepEvalEvaluator.metadata.name}") + + # Note: We don't check is_available() here because the evaluator + # may not be used immediately - it just needs to be registered + # so the server knows about it when creating control definitions + +except ImportError as e: + print(f"❌ Error: Cannot import DeepEval evaluator: {e}") + print("\nMake sure you're running from the examples/deepeval directory") + print("and that agent-control-models is installed") + sys.exit(1) + +# Agent configuration +AGENT_ID = "qa-agent-deepeval" +AGENT_NAME = "Q&A Agent with DeepEval" +AGENT_DESCRIPTION = "Question answering agent with DeepEval quality controls" + +SERVER_URL = os.getenv("AGENT_CONTROL_URL", "http://localhost:8000") + +# DeepEval controls to create +DEEPEVAL_CONTROLS = [ + { + "name": "check-coherence", + "description": "Ensures LLM responses are coherent and logically consistent", + "definition": { + "description": "Ensures LLM responses are coherent and logically consistent", + "enabled": True, + "execution": "server", + "scope": {"stages": ["post"]}, + "selector": {}, + "evaluator": { + "name": "deepeval-geval", + "config": { + "name": "Coherence", + "criteria": ( + "Evaluate whether the response is coherent, logically consistent, " + "and well-structured. Check for contradictions and flow of ideas. " + "The response should make logical sense and not contain contradictory statements." 
+ ), + "evaluation_params": ["input", "actual_output"], + "threshold": 0.6, + "model": "gpt-4o", + "strict_mode": False, + "verbose_mode": False, + }, + }, + "action": { + "decision": "deny", + "message": "Response failed coherence check - please reformulate", + }, + }, + }, + { + "name": "check-relevance", + "description": "Ensures responses are relevant to the user's question", + "definition": { + "description": "Ensures responses are relevant to the user's question", + "enabled": True, + "execution": "server", + "scope": {"stages": ["post"]}, + "selector": {}, + "evaluator": { + "name": "deepeval-geval", + "config": { + "name": "Relevance", + "criteria": ( + "Determine whether the actual output is relevant and directly addresses " + "the input query. Check if it stays on topic and provides useful information " + "that answers the question asked." + ), + "evaluation_params": ["input", "actual_output"], + "threshold": 0.5, + "model": "gpt-4o", + "strict_mode": False, + }, + }, + "action": { + "decision": "deny", + "message": "Response is not relevant to the question - please provide a relevant answer", + }, + }, + }, + { + "name": "check-correctness", + "description": "Validates factual correctness against expected answers (when available)", + "definition": { + "description": "Validates factual correctness against expected answers (when available)", + "enabled": False, # Disabled by default - enable when you have expected outputs + "execution": "server", + "scope": {"step_types": ["llm_inference"], "stages": ["post"]}, + "selector": {"path": "*"}, + "evaluator": { + "name": "deepeval-geval", + "config": { + "name": "Correctness", + "evaluation_steps": [ + "Check whether facts in actual output contradict expected output", + "Heavily penalize omission of critical details", + "Minor wording differences are acceptable", + "Focus on factual accuracy, not style", + ], + "evaluation_params": ["actual_output", "expected_output"], + "threshold": 0.8, + "model": "gpt-4o", + 
}, + }, + "action": { + "decision": "warn", + "message": "Response may contain factual errors - review carefully", + }, + }, + }, +] + + +async def setup_demo(quiet: bool = False): + """Set up the demo agent with DeepEval controls.""" + # Generate the same UUID5 that the SDK generates + agent_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, AGENT_ID)) + + print(f"Setting up agent: {AGENT_NAME}") + print(f"Agent ID: {AGENT_ID}") + print(f"Agent UUID: {agent_uuid}") + print(f"Server URL: {SERVER_URL}") + print() + + async with httpx.AsyncClient(base_url=SERVER_URL, timeout=30.0) as client: + # Check server health + try: + resp = await client.get("/health") + resp.raise_for_status() + print("✓ Server is healthy") + except httpx.HTTPError as e: + print(f"❌ Error: Cannot connect to server at {SERVER_URL}") + print(f" {e}") + print("\nMake sure the server is running") + return False + + # Register the agent + try: + resp = await client.post( + "/api/v1/agents/initAgent", + json={ + "agent": { + "agent_id": agent_uuid, + "agent_name": AGENT_NAME, + "agent_description": AGENT_DESCRIPTION, + }, + "tools": [], + }, + ) + resp.raise_for_status() + result = resp.json() + status = "Created" if result.get("created") else "Updated" + print(f"✓ {status} agent: {AGENT_NAME}") + except httpx.HTTPError as e: + print(f"❌ Error registering agent: {e}") + return False + + # Get or create a policy for the agent + policy_name = f"policy-{AGENT_ID}" + policy_id = None + + # Check if agent already has a policy + try: + resp = await client.get(f"/api/v1/agents/{agent_uuid}/policy") + if resp.status_code == 200: + policy_id = resp.json().get("policy_id") + print(f"✓ Found existing policy: {policy_id}") + except httpx.HTTPError: + pass # No policy yet + + # Create policy if needed + if not policy_id: + try: + resp = await client.put( + "/api/v1/policies", + json={"name": policy_name}, + ) + if resp.status_code == 409: + # Policy name exists but not assigned - create with unique name + import time + + 
policy_name = f"policy-{AGENT_ID}-{int(time.time())}" + resp = await client.put( + "/api/v1/policies", + json={"name": policy_name}, + ) + resp.raise_for_status() + policy_id = resp.json()["policy_id"] + print(f"✓ Created policy: {policy_name}") + + # Assign policy to agent + resp = await client.post(f"/api/v1/agents/{agent_uuid}/policy/{policy_id}") + resp.raise_for_status() + print(f"✓ Assigned policy to agent") + except httpx.HTTPError as e: + print(f"❌ Error setting up policy: {e}") + return False + + # Create controls and add to policy + print() + print("Creating DeepEval controls...") + controls_created = 0 + controls_updated = 0 + + for control_spec in DEEPEVAL_CONTROLS: + control_name = control_spec["name"] + definition = control_spec["definition"] + description = control_spec["description"] + + try: + # Create control + resp = await client.put( + "/api/v1/controls", + json={"name": control_name}, + ) + if resp.status_code == 409: + # Control exists, get its ID + resp = await client.get("/api/v1/controls", params={"name": control_name}) + resp.raise_for_status() + controls = resp.json().get("controls", []) + if controls: + control_id = controls[0]["id"] + controls_updated += 1 + else: + continue + else: + resp.raise_for_status() + control_id = resp.json()["control_id"] + controls_created += 1 + + # Set control definition + resp = await client.put( + f"/api/v1/controls/{control_id}/data", + json={"data": definition}, + ) + resp.raise_for_status() + + # Add control to policy + resp = await client.post(f"/api/v1/policies/{policy_id}/controls/{control_id}") + resp.raise_for_status() + + status = "✓" if definition.get("enabled") else "○" + enabled_text = "enabled" if definition.get("enabled") else "disabled" + print(f" {status} {control_name} ({enabled_text})") + print(f" {description}") + + except httpx.HTTPError as e: + print(f" ❌ Error with control '{control_name}': {e}") + continue + + print() + if controls_created > 0: + print(f"✓ Created {controls_created} 
new control(s)") + if controls_updated > 0: + print(f"✓ Updated {controls_updated} existing control(s)") + print(f"✓ Agent has {len(DEEPEVAL_CONTROLS)} DeepEval control(s) configured") + print() + print("=" * 70) + print("Setup Complete!") + print("=" * 70) + print() + print("Next steps:") + print(" 1. Ensure OPENAI_API_KEY is set (required for DeepEval)") + print(" 2. Run the Q&A agent: python qa_agent.py") + print(" 3. Ask questions and observe quality controls in action") + print() + print("Note: The 'check-correctness' control is disabled by default.") + print(" Enable it when you have test cases with expected outputs.") + print() + + return True + + +if __name__ == "__main__": + success = asyncio.run(setup_demo()) + sys.exit(0 if success else 1) diff --git a/examples/deepeval/start_server_with_evaluator.sh b/examples/deepeval/start_server_with_evaluator.sh new file mode 100644 index 00000000..1c2a1390 --- /dev/null +++ b/examples/deepeval/start_server_with_evaluator.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Start the agent-control server with DeepEval evaluator registered + +# Get the directory containing this script +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +# Add this directory to PYTHONPATH so the server can import the evaluator +export PYTHONPATH="$DIR:$PYTHONPATH" + +# Import the evaluator before starting the server +python3 -c "import sys; sys.path.insert(0, '$DIR'); from evaluator import DeepEvalEvaluator; print(f'✓ Loaded {DeepEvalEvaluator.metadata.name}')" + +echo "Starting server with DeepEval evaluator..." +echo "PYTHONPATH: $PYTHONPATH" +echo "" + +# Navigate to repository root and start server +cd "$DIR/../.." 
+./demo.sh start From ce9640a5955cd16989f7b4d32d50e9b698a59df3 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Fri, 30 Jan 2026 16:06:21 -0800 Subject: [PATCH 3/6] update readme --- examples/deepeval/README.md | 2 +- examples/deepeval/THIRD_PARTY_GUIDE.md | 357 ------------------------- 2 files changed, 1 insertion(+), 358 deletions(-) delete mode 100644 examples/deepeval/THIRD_PARTY_GUIDE.md diff --git a/examples/deepeval/README.md b/examples/deepeval/README.md index c287b4aa..6eeb4ee7 100644 --- a/examples/deepeval/README.md +++ b/examples/deepeval/README.md @@ -171,7 +171,7 @@ If you're starting from a fresh clone of the agent-control repository, follow th ```bash # Clone the repository -git clone https://github.com/rungalileo/agent-control.git +git clone https://github.com/agentcontrol/agent-control.git cd agent-control # Install all dependencies (installs models, engine, evaluators, sdk, server packages) diff --git a/examples/deepeval/THIRD_PARTY_GUIDE.md b/examples/deepeval/THIRD_PARTY_GUIDE.md deleted file mode 100644 index d653912f..00000000 --- a/examples/deepeval/THIRD_PARTY_GUIDE.md +++ /dev/null @@ -1,357 +0,0 @@ -# Third-Party Developer Guide: Creating Custom Evaluators - -This guide shows the complete, end-to-end process for third-party developers to create custom evaluators, register them with the agent-control server, and use them in their agents. - -## Overview - -The DeepEval example demonstrates the complete workflow: - -1. **Create the custom evaluator** - Extend the `Evaluator` base class -2. **Register via entry points** - Make the evaluator discoverable by the server -3. **Install the package** - Install locally or publish to PyPI -4. **Verify server discovery** - Confirm the server recognizes your evaluator -5. **Create controls** - Define controls that use your evaluator -6. 
**Use in agents** - Apply controls to protect agent functions - -## Complete Workflow - -### Step 1: Create Your Custom Evaluator - -Create a Python package with these files: - -**`config.py`** - Define your evaluator's configuration: -```python -from pydantic import BaseModel, Field - -class MyEvaluatorConfig(BaseModel): - """Configuration for your custom evaluator.""" - threshold: float = Field(default=0.5, ge=0.0, le=1.0) - # Add your evaluator-specific config fields -``` - -**`evaluator.py`** - Implement your evaluator: -```python -from typing import Any -from agent_control_models import ( - Evaluator, - EvaluatorMetadata, - EvaluatorResult, - register_evaluator -) -from config import MyEvaluatorConfig - -@register_evaluator -class MyEvaluator(Evaluator[MyEvaluatorConfig]): - metadata = EvaluatorMetadata( - name="my-custom-evaluator", - version="1.0.0", - description="My custom evaluator", - requires_api_key=False, - timeout_ms=10000, - ) - config_model = MyEvaluatorConfig - - async def evaluate(self, data: Any) -> EvaluatorResult: - """Implement your evaluation logic.""" - score = 0.8 # Your evaluation logic here - - return EvaluatorResult( - passed=score >= self.config.threshold, - score=score, - metadata={"details": "evaluation details"} - ) -``` - -### Step 2: Create Entry Point in pyproject.toml - -**Critical Step**: The server discovers evaluators via entry points. 
- -**`pyproject.toml`**: -```toml -[project] -name = "my-custom-evaluator" -version = "1.0.0" -requires-python = ">=3.12" -dependencies = [ - "agent-control-models", - "agent-control-engine", - "agent-control-evaluators", - "pydantic>=2.0.0", - # your other dependencies -] - -# This is the critical section - registers your evaluator with the server -[project.entry-points."agent_control.evaluators"] -my-custom-evaluator = "evaluator:MyEvaluator" - -# For local development (if using path references) -[tool.uv.sources] -agent-control-models = { path = "../path/to/models", editable = true } -agent-control-engine = { path = "../path/to/engine", editable = true } -agent-control-evaluators = { path = "../path/to/evaluators", editable = true } - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" -``` - -**Key Points**: -- The entry point group MUST be `"agent_control.evaluators"` -- The entry point name should match your `metadata.name` -- Format: `"module:ClassName"` (e.g., `"evaluator:DeepEvalEvaluator"`) - -### Step 3: Install Your Package - -**For local development with uv** (recommended): -```bash -# Make sure dependencies are declared in pyproject.toml with path sources -cd /path/to/your/evaluator -uv sync # This installs all dependencies including agent-control packages -``` - -**Alternative - Manual installation**: -```bash -# Install your package in editable mode -uv pip install -e /path/to/your/evaluator - -# Also install agent-control dependencies -uv pip install -e /path/to/agent-control/models -uv pip install -e /path/to/agent-control/engine -uv pip install -e /path/to/agent-control/evaluators -``` - -**For published packages**: -```bash -pip install my-custom-evaluator -``` - -**Note**: When using `uv run` for scripts, uv will automatically create and use a project-specific virtual environment. Make sure your `pyproject.toml` includes the agent-control packages in dependencies. 
- -### Step 4: Start Server and Verify Discovery - -**Start the server** (it will auto-discover evaluators): -```bash -uv run --package agent-control-server uvicorn agent_control_server.main:app --port 8000 -``` - -**Verify your evaluator is recognized**: -```bash -curl -s http://localhost:8000/api/v1/evaluators | grep "my-custom-evaluator" -``` - -Expected output: -```json -{ - "my-custom-evaluator": { - "name": "my-custom-evaluator", - "version": "1.0.0", - "description": "My custom evaluator", - "requires_api_key": false, - "timeout_ms": 10000, - "config_schema": {...} - } -} -``` - -⚠️ **If your evaluator doesn't appear**, check: -- Entry point is correctly defined in `pyproject.toml` -- Package is installed (`pip list | grep my-custom-evaluator`) -- Server logs for any import errors -- Your evaluator's `is_available()` returns `True` - -### Step 5: Create Controls Using Your Evaluator - -**`setup_controls.py`**: -```python -import asyncio -import httpx - -async def create_controls(): - async with httpx.AsyncClient(base_url="http://localhost:8000") as client: - # Register agent - await client.post("/api/v1/agents/initAgent", json={ - "agent": { - "agent_id": "your-agent-uuid", - "agent_name": "My Agent", - "agent_description": "Agent with custom evaluator" - }, - "tools": [] - }) - - # Create control using your evaluator - response = await client.post("/api/v1/agents/your-agent-uuid/controls", json={ - "name": "my-custom-check", - "definition": { - "description": "My custom quality check", - "enabled": True, - "execution": "server", # REQUIRED field - "scope": { - "step_types": ["llm_inference"], - "stages": ["post"] - }, - "selector": {"path": "output"}, - "evaluator": { - "name": "my-custom-evaluator", # Must match metadata.name - "config": { - "threshold": 0.7 - } - }, - "action": { - "decision": "deny", - "message": "Failed custom check" - } - } - }) - print(f"Control created: {response.json()}") - -if __name__ == "__main__": - 
asyncio.run(create_controls()) -``` - -**Important control definition fields**: -- `execution`: REQUIRED - must be `"server"` -- `scope`: Defines when the control applies (replaces old `applies_to`/`check_stage`) -- `evaluator.name`: Must match your evaluator's `metadata.name` -- `evaluator.config`: Must match your `EvaluatorConfig` schema - -### Step 6: Use Controls in Your Agent - -**`my_agent.py`**: -```python -import asyncio -from agent_control import agent_control, control, ControlViolationError - -# Initialize agent -agent_control.init( - agent_name="My Agent", - agent_id="my-agent", - agent_description="Agent with custom evaluator", - agent_version="1.0.0" -) - -@control() -async def my_protected_function(input_data: str) -> str: - """Function protected by your custom evaluator.""" - result = f"Processed: {input_data}" - return result - -async def main(): - try: - result = await my_protected_function("test input") - print(f"✓ Success: {result}") - except ControlViolationError as e: - print(f"❌ Control violation: {e}") - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## Complete Example: DeepEval - -See the DeepEval example in this directory for a complete, working implementation: - -``` -examples/deepeval/ -├── config.py # DeepEvalEvaluatorConfig -├── evaluator.py # DeepEvalEvaluator -├── pyproject.toml # Entry point registration -├── setup_controls.py # Creates controls on server -├── qa_agent.py # Q&A agent using controls -└── README.md # Full documentation -``` - -**Key files to study**: -1. [pyproject.toml](./pyproject.toml#L20-L21) - Entry point registration -2. [evaluator.py](./evaluator.py) - Complete evaluator implementation -3. 
[setup_controls.py](./setup_controls.py#L54-L143) - Control definitions - -## Common Issues and Solutions - -### Issue 1: Server doesn't recognize evaluator - -**Symptoms**: `/api/v1/evaluators` doesn't show your evaluator - -**Solutions**: -- Verify entry point in `pyproject.toml`: `[project.entry-points."agent_control.evaluators"]` -- Reinstall package: `uv pip install -e .` -- Restart server to trigger discovery -- Check `is_available()` returns `True` - -### Issue 2: 422 Validation Error when creating controls - -**Symptoms**: `Field required: data.execution` - -**Solutions**: -- Add `"execution": "server"` to control definition -- Use `scope` instead of `applies_to`/`check_stage` -- Ensure `evaluator.config` matches your config schema - -### Issue 3: Import errors in evaluator - -**Symptoms**: `cannot import name 'Evaluator' from 'agent_control_models'` - -**Solutions**: -- Add agent-control packages to `dependencies` in `pyproject.toml` -- Add `[tool.uv.sources]` with path references to local packages -- Run `uv sync` to install all dependencies -- Alternatively: `uv pip install -e path/to/models -e path/to/engine -e path/to/evaluators` - -### Issue 4: uv virtual environment mismatch - -**Symptoms**: `warning: VIRTUAL_ENV=... does not match the project environment path .venv` - -**Explanation**: This is informational, not an error. `uv run` creates a project-specific `.venv` which is correct behavior. - -**Solutions**: -- This warning can be safely ignored -- Ensure your `pyproject.toml` has correct dependencies and sources -- Run `uv sync` to ensure project venv has all packages - -## Best Practices - -1. **Entry Points are Essential**: The server ONLY discovers evaluators via entry points, not PYTHONPATH -2. **Match metadata.name**: Entry point key should match `metadata.name` in your evaluator -3. **Control Definition Structure**: Always include `execution` and `scope` fields -4. 
**Validation**: The server validates control configs against your `config_model` schema -5. **Dependencies**: List all dependencies in `pyproject.toml`, including `agent-control-models` - -## Testing Your Evaluator - -1. **Unit tests**: Test your `evaluate()` method directly -2. **Integration tests**: Create controls and test with the server -3. **End-to-end**: Run a complete agent with your controls - -## Publishing Your Evaluator - -To share your evaluator with others: - -1. **Publish to PyPI**: - ```bash - python -m build - twine upload dist/* - ``` - -2. **Users install your package**: - ```bash - pip install my-custom-evaluator - ``` - -3. **Server auto-discovers** via entry points when it starts - -4. **Users create controls** using your evaluator name - -## Summary - -The complete workflow for third-party developers: - -```mermaid -graph TD - A[Create Evaluator Class] --> B[Add Entry Point] - B --> C[Install Package] - C --> D[Start Server] - D --> E[Verify Discovery] - E --> F[Create Controls] - F --> G[Use in Agent] -``` - -**Key Takeaway**: Entry points are the critical mechanism that makes custom evaluators work with the agent-control server. Without proper entry point registration, the server will not discover your evaluator. From d352e0809cb393df93d39ef8553e75139942e03e Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Fri, 30 Jan 2026 16:31:31 -0800 Subject: [PATCH 4/6] addressing comment --- examples/deepeval/README.md | 8 +++++--- examples/deepeval/evaluator.py | 6 +----- examples/deepeval/pyproject.toml | 11 ++++------- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/examples/deepeval/README.md b/examples/deepeval/README.md index 6eeb4ee7..f52e0f62 100644 --- a/examples/deepeval/README.md +++ b/examples/deepeval/README.md @@ -193,12 +193,14 @@ The server will be running at `http://localhost:8000`. ### 3. 
Install DeepEval Example ```bash -# Install the DeepEval example package with its dependencies +# Navigate to the DeepEval example directory cd examples/deepeval -uv sync + +# Install the example package in editable mode with its dependencies +uv pip install -e . ``` -This installs the evaluator package and makes it discoverable by the server via entry points. +This installs the evaluator package in editable mode and makes it discoverable by the server via entry points. ### 4. Set Environment Variables diff --git a/examples/deepeval/evaluator.py b/examples/deepeval/evaluator.py index ce8e7c31..8332d151 100644 --- a/examples/deepeval/evaluator.py +++ b/examples/deepeval/evaluator.py @@ -16,11 +16,7 @@ register_evaluator, ) -# Import config - handle both relative and absolute imports -try: - from .config import DeepEvalEvaluatorConfig -except ImportError: - from config import DeepEvalEvaluatorConfig +from .config import DeepEvalEvaluatorConfig logger = logging.getLogger(__name__) diff --git a/examples/deepeval/pyproject.toml b/examples/deepeval/pyproject.toml index 3294bc19..100bc641 100644 --- a/examples/deepeval/pyproject.toml +++ b/examples/deepeval/pyproject.toml @@ -27,13 +27,10 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -packages = ["."] - -[tool.uv.sources] -agent-control-models = { path = "../../models", editable = true } -agent-control-engine = { path = "../../engine", editable = true } -agent-control-evaluators = { path = "../../evaluators", editable = true } -agent-control-sdk = { path = "../../sdks/python", editable = true } +include = [ + "*.py", + ".env.example", +] [tool.ruff] line-length = 100 From ee691e207fe20005178a92ed43f0bad3f51582c2 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Mon, 2 Feb 2026 13:42:53 -0800 Subject: [PATCH 5/6] fix examples after evaluator --- examples/deepeval/README.md | 192 +++++++++++++++++++++++++++---- examples/deepeval/__init__.py | 4 +- 
examples/deepeval/evaluator.py | 2 +- examples/deepeval/pyproject.toml | 11 +- 4 files changed, 175 insertions(+), 34 deletions(-) diff --git a/examples/deepeval/README.md b/examples/deepeval/README.md index f52e0f62..0f634ec2 100644 --- a/examples/deepeval/README.md +++ b/examples/deepeval/README.md @@ -15,15 +15,22 @@ DeepEval's GEval is an LLM-as-a-judge metric that uses chain-of-thoughts (CoT) t ``` examples/deepeval/ -├── config.py # DeepEvalEvaluatorConfig - Configuration model -├── evaluator.py # DeepEvalEvaluator - Main evaluator implementation -├── qa_agent.py # Q&A agent with DeepEval controls -├── setup_controls.py # Setup script to create controls on server -├── pyproject.toml # Project config with entry point registration -├── README.md # This file -└── THIRD_PARTY_GUIDE.md # Complete guide for third-party developers +├── __init__.py # Package initialization +├── config.py # DeepEvalEvaluatorConfig - Configuration model +├── evaluator.py # DeepEvalEvaluator - Main evaluator implementation +├── qa_agent.py # Q&A agent with DeepEval controls +├── setup_controls.py # Setup script to create controls on server +├── start_server_with_evaluator.sh # Helper script to start server with evaluator +├── pyproject.toml # Project config with entry point and dependencies +└── README.md # This file ``` +**Package Structure Notes:** +- Uses a **flat layout** with Python files at the root (configured via `packages = ["."]` in pyproject.toml) +- Modules use **absolute imports** (e.g., `from config import X`) rather than relative imports +- Entry point `evaluator:DeepEvalEvaluator` references the module directly +- Install with `uv pip install -e .` to register the entry point for server discovery + ### Key Components 1. **DeepEvalEvaluatorConfig** ([config.py](config.py)) @@ -49,8 +56,10 @@ examples/deepeval/ 5. 
**Entry Point Registration** ([pyproject.toml](pyproject.toml)) - Registers evaluator with server via `project.entry-points` + - Depends on `agent-control-models>=3.0.0` and `agent-control-sdk>=3.0.0` + - In monorepo: uses workspace dependencies (editable installs) + - For third-party: can use published PyPI packages - Enables automatic discovery when server starts - - Critical for third-party evaluator integration ## How It Works @@ -91,11 +100,19 @@ class DeepEvalEvaluator(Evaluator[DeepEvalEvaluatorConfig]): The evaluator is registered via `pyproject.toml`: ```toml +[project] +dependencies = [ + "agent-control-models>=3.0.0", + "agent-control-sdk>=3.0.0", + "deepeval>=1.0.0", + # ... other dependencies +] + [project.entry-points."agent_control.evaluators"] deepeval-geval = "evaluator:DeepEvalEvaluator" ``` -This makes the evaluator automatically discoverable by the server when it starts. +This makes the evaluator automatically discoverable by the server when it starts. The pattern works with both workspace dependencies (for monorepo development) and published PyPI packages (for third-party evaluators). ### 3. Configuration @@ -165,17 +182,19 @@ control_definition = { ## Getting Started from Fresh Clone -If you're starting from a fresh clone of the agent-control repository, follow these steps: +This example demonstrates **custom evaluator development** within the agent-control monorepo. It uses workspace dependencies (editable installs) to work with the latest development versions of: +- `agent-control-models` - Base evaluator classes and types +- `agent-control-sdk` - Agent Control SDK for integration +- `deepeval` - DeepEval evaluation framework -### 1. Clone and Install Repository +**Note:** This is a **development/monorepo example** showing the evaluator architecture. + +### 1. 
Clone Repository ```bash # Clone the repository git clone https://github.com/agentcontrol/agent-control.git cd agent-control - -# Install all dependencies (installs models, engine, evaluators, sdk, server packages) -make sync ``` ### 2. Start Database and Server @@ -196,11 +215,19 @@ The server will be running at `http://localhost:8000`. # Navigate to the DeepEval example directory cd examples/deepeval -# Install the example package in editable mode with its dependencies +# Install dependencies +uv sync + +# Install the evaluator package itself in editable mode uv pip install -e . ``` -This installs the evaluator package in editable mode and makes it discoverable by the server via entry points. +This installs: +- **Dependencies**: `deepeval>=1.0.0`, `openai>=1.0.0`, `pydantic>=2.0.0`, etc. +- **Workspace packages** (as editable installs): `agent-control-models`, `agent-control-sdk` +- **This evaluator package** in editable mode, which registers the entry point for server discovery + +The entry point `deepeval-geval = "evaluator:DeepEvalEvaluator"` makes the evaluator discoverable by the server. ### 4. Set Environment Variables @@ -309,7 +336,57 @@ Configure which parameters to use via the `evaluation_params` config field. ## For Third-Party Developers -See [THIRD_PARTY_GUIDE.md](THIRD_PARTY_GUIDE.md) for a complete step-by-step guide on creating and publishing your own custom evaluators. +This example shows the **evaluator plugin architecture** for extending agent-control. While this specific example is set up for monorepo development, the same pattern works for third-party plugins using published packages. + +To create your own evaluator plugin: + +1. **Extend the Evaluator base class** from `agent-control-models` (published on PyPI) +2. **Define a configuration model** using Pydantic +3. **Register via entry points** in your `pyproject.toml` +4. **Install your package** so the server can discover the entry point +5. 
**Restart the server** to load the new evaluator + +For standalone packages outside the monorepo, use published versions: +```toml +[project] +dependencies = [ + "agent-control-models>=3.0.0", # From PyPI + "agent-control-sdk>=3.0.0", # From PyPI + "your-evaluation-library>=1.0.0" +] +``` + +See the [Extending This Example](#extending-this-example) section below for the complete pattern. + +### Production Deployment + +For production deployments, build your evaluator as a Python wheel and install it on your agent-control server: + +**Development (this example):** +```bash +uv pip install -e . # Editable install for development +``` + +**Production:** +```bash +python -m build # Creates dist/*.whl +# Install wheel on production server where agent-control runs +``` + +**Deployment Options:** + +1. **Self-Hosted Server (Full Control)** + - Deploy your own agent-control server instance + - Install custom evaluator packages (wheel, source, or private PyPI) + - Your agents connect to this server via the SDK + - Complete control over evaluators and policies + +2. **Managed Service (If Available)** + - Use a hosted agent-control service + - May require coordination to install custom evaluators + - Or use only built-in/approved evaluators + +In both cases, evaluators run **server-side** (`execution: "server"`), so your agent applications only need the lightweight SDK installed. The evaluator package must be installed where the agent-control server runs, not in your agent application. ## Extending This Example @@ -351,7 +428,7 @@ Follow this pattern to create evaluators for other libraries: 4. **Install and Use** ```bash - uv pip install -e . 
# Server will discover it automatically + uv sync # Server will discover it automatically ``` ### Adding More GEval Metrics @@ -368,8 +445,8 @@ You can create specialized evaluators for specific use cases: - **DeepEval Documentation**: https://deepeval.com/docs/metrics-llm-evals - **G-Eval Guide**: https://www.confident-ai.com/blog/g-eval-the-definitive-guide -- **Third-Party Developer Guide**: [THIRD_PARTY_GUIDE.md](THIRD_PARTY_GUIDE.md) - **Agent Control Evaluators**: [Base evaluator class](../../models/src/agent_control_models/evaluator.py) +- **CrewAI Example**: [Using agent-control as a consumer](../crewai/) ## Key Takeaways @@ -391,10 +468,33 @@ You can create specialized evaluators for specific use cases: ### Evaluator not found -- Verify entry point in `pyproject.toml` -- Run `uv sync` to install package -- Check server logs for evaluator discovery -- Confirm with: `curl http://localhost:8000/api/v1/evaluators` +The server couldn't discover the evaluator. Check: + +1. **Entry point registration** in `pyproject.toml`: + ```toml + [project.entry-points."agent_control.evaluators"] + deepeval-geval = "evaluator:DeepEvalEvaluator" + ``` + +2. **Package is installed**: + ```bash + cd examples/deepeval + uv sync # Install dependencies + uv pip install -e . # Install this package + ``` + +3. **Server was restarted** after package installation: + ```bash + # Stop server (Ctrl+C), then restart + make server-run + ``` + +4. **Verify registration**: + ```bash + curl http://localhost:8000/api/v1/evaluators | grep deepeval-geval + ``` + +5. **Check server logs** for evaluator discovery messages during startup ### Wrong evaluation results @@ -402,6 +502,52 @@ You can create specialized evaluators for specific use cases: - Check that `matched` logic is inverted (trigger when quality fails) - Lower threshold to be more strict (0.5 instead of 0.7) +### Import errors: "cannot import name 'X'" + +If you see import errors like `ImportError: cannot import name 'AgentRef'`: + +1. 
**Stale editable install**: Reinstall the package + ```bash + uv pip install -e /path/to/package --force-reinstall --no-deps + ``` + +2. **For agent-control-models specifically**: + ```bash + uv pip install -e ../../models --force-reinstall --no-deps + ``` + +3. **Clear Python cache** if issues persist: + ```bash + find . -name "*.pyc" -delete + find . -name "__pycache__" -type d -exec rm -rf {} + + ``` + +4. **Verify installation**: + ```bash + python -c "from agent_control_models.server import AgentRef; print('Success')" + ``` + +### Package not discoverable: "attempted relative import" + +If you see `attempted relative import with no known parent package`: + +1. **Ensure the package is installed**: + ```bash + cd examples/deepeval + uv pip install -e . + ``` + +2. **Verify entry point registration**: + ```bash + uv pip show agent-control-deepeval-example + ``` + +3. **Check pyproject.toml has**: + ```toml + [tool.hatch.build.targets.wheel] + packages = ["."] + ``` + ### DeepEval telemetry files - DeepEval creates a `.deepeval/` directory with telemetry files in the working directory diff --git a/examples/deepeval/__init__.py b/examples/deepeval/__init__.py index 242ac048..77d5fbc6 100644 --- a/examples/deepeval/__init__.py +++ b/examples/deepeval/__init__.py @@ -4,8 +4,8 @@ custom evaluators using external libraries like DeepEval. 
""" -from .config import DeepEvalEvaluatorConfig, DeepEvalTestCaseParam -from .evaluator import DeepEvalEvaluator +from config import DeepEvalEvaluatorConfig, DeepEvalTestCaseParam +from evaluator import DeepEvalEvaluator __all__ = [ "DeepEvalEvaluator", diff --git a/examples/deepeval/evaluator.py b/examples/deepeval/evaluator.py index 8332d151..53e46938 100644 --- a/examples/deepeval/evaluator.py +++ b/examples/deepeval/evaluator.py @@ -16,7 +16,7 @@ register_evaluator, ) -from .config import DeepEvalEvaluatorConfig +from config import DeepEvalEvaluatorConfig logger = logging.getLogger(__name__) diff --git a/examples/deepeval/pyproject.toml b/examples/deepeval/pyproject.toml index 100bc641..cafca019 100644 --- a/examples/deepeval/pyproject.toml +++ b/examples/deepeval/pyproject.toml @@ -10,10 +10,8 @@ dependencies = [ "pydantic>=2.0.0", "httpx>=0.24.0", "google-re2>=1.1", - "agent-control-models", - "agent-control-engine", - "agent-control-evaluators", - "agent-control-sdk", + "agent-control-models>=3.0.0", + "agent-control-sdk>=3.0.0", ] [project.optional-dependencies] @@ -27,10 +25,7 @@ requires = ["hatchling"] build-backend = "hatchling.build" [tool.hatch.build.targets.wheel] -include = [ - "*.py", - ".env.example", -] +packages = ["."] [tool.ruff] line-length = 100 From 95bc827463e93528fb3abfecea579e8a725a8dd6 Mon Sep 17 00:00:00 2001 From: "namrata.ghadi" Date: Mon, 2 Feb 2026 13:43:28 -0800 Subject: [PATCH 6/6] fix examples after evaluator --- examples/deepeval/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/deepeval/README.md b/examples/deepeval/README.md index 0f634ec2..25cb6fc5 100644 --- a/examples/deepeval/README.md +++ b/examples/deepeval/README.md @@ -336,9 +336,9 @@ Configure which parameters to use via the `evaluation_params` config field. ## For Third-Party Developers -This example shows the **evaluator plugin architecture** for extending agent-control. 
While this specific example is set up for monorepo development, the same pattern works for third-party plugins using published packages. +This example shows the **evaluator architecture** for extending agent-control. While this specific example is set up for monorepo development, the same pattern works for third-party evaluators using published packages. -To create your own evaluator plugin: +To create your own evaluator: 1. **Extend the Evaluator base class** from `agent-control-models` (published on PyPI) 2. **Define a configuration model** using Pydantic