diff --git a/.githooks/pre-push b/.githooks/pre-push index c4ac972d..d96954b5 100755 --- a/.githooks/pre-push +++ b/.githooks/pre-push @@ -16,9 +16,9 @@ echo "pre-push: running make typecheck" make typecheck # Check extras if they exist and have changes -if [ -d "evaluators/extra/galileo" ]; then - echo "pre-push: checking evaluators/extra/galileo" - cd evaluators/extra/galileo +if [ -d "evaluators/contrib/galileo" ]; then + echo "pre-push: checking evaluators/contrib/galileo" + cd evaluators/contrib/galileo uv run --extra dev ruff check --config ../../../pyproject.toml src/ uv run --extra dev mypy --config-file ../../../pyproject.toml src/ cd "$REPO_ROOT" diff --git a/.github/ISSUE_TEMPLATE/general.md b/.github/ISSUE_TEMPLATE/general.md new file mode 100644 index 00000000..0db637ae --- /dev/null +++ b/.github/ISSUE_TEMPLATE/general.md @@ -0,0 +1,30 @@ +--- +name: General issue +about: Report a bug, request a feature, or propose a change +title: "" +labels: [] +assignees: [] +--- + +## Summary +- What is the problem or request? + +## Motivation +- Why does this matter? + +## Current behavior +- What happens today? + +## Expected behavior +- What should happen instead? + +## Reproduction (if bug) +1. +2. +3. + +## Proposed solution (optional) +- Suggested approach, tradeoffs, or constraints. + +## Additional context +- Links, screenshots, logs, related issues/PRs. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 00000000..127f9d4d --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,21 @@ +## Summary +- What changed and why. + +## Scope +- User-facing/API changes: +- Internal changes: +- Out of scope: + +## Risk and Rollout +- Risk level: low / medium / high +- Rollback plan: + +## Testing +- [ ] Added or updated automated tests +- [ ] Ran `make check` (or explained why not) +- [ ] Manually verified behavior + +## Checklist +- [ ] Linked issue/spec (if applicable) +- [ ] Updated docs/examples for user-facing changes +- [ ] Included any required follow-up tasks diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index f7f09dfb..8103788a 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -71,7 +71,7 @@ jobs: if: steps.release.outputs.released == 'true' uses: pypa/gh-action-pypi-publish@release/v1 with: - packages-dir: evaluators/extra/galileo/dist/ + packages-dir: evaluators/contrib/galileo/dist/ user: __token__ password: ${{ secrets.PYPI_API_TOKEN }} @@ -87,4 +87,4 @@ jobs: evaluators/builtin/dist/* sdks/python/dist/* server/dist/* - evaluators/extra/galileo/dist/* + evaluators/contrib/galileo/dist/* diff --git a/AGENTS.md b/AGENTS.md index e10f7f93..9ae9e74d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -27,7 +27,7 @@ Forwarded targets: - `server/`: FastAPI server (`server/src/agent_control_server/`) - `sdks/python/`: Python SDK — uses engine for evaluation (`sdks/python/src/agent_control/`) - `evaluators/builtin/`: builtin evaluator implementations (`evaluators/builtin/src/agent_control_evaluators/`) -- `evaluators/extra/`: optional evaluator packages (e.g., `evaluators/extra/galileo/`) +- `evaluators/contrib/`: optional evaluator packages (e.g., `evaluators/contrib/galileo/`) - `ui/`: Nextjs based web app to manage agent controls - `examples/`: runnable examples (ruff has relaxed import rules here) @@ -75,7 +75,7 @@ All testing guidance (including “behavior changes require tests”) lives in ` 5) evaluator is automatically available to server and SDK via 
`discover_evaluators()` - Add an external evaluator package: - 1) copy `evaluators/extra/template/` as a starting point + 1) copy `evaluators/contrib/template/` as a starting point 2) implement evaluator class extending `Evaluator` from `agent_control_evaluators` 3) add entry point using `org.name` format (e.g., `galileo.luna2`) 4) package is discovered automatically when installed alongside agent-control diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 94e20531..321aaacc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,734 +1,43 @@ # Contributing to Agent Control -Thanks for contributing! This document covers conventions, setup, and workflows for all contributors. +Thanks for taking the time to contribute! -## Project Architecture +## Before You Start -Agent Control is a **uv workspace monorepo** with these components: +- **For significant changes, please open an issue first.** Discussing your approach ahead of time avoids wasted effort. Changes that were not discussed may be rejected. +- **For small fixes** (typos, minor bug fixes), you can go straight to a pull request. -``` -agent-control/ -├── models/ # Shared Pydantic models (agent-control-models) -├── server/ # FastAPI server (agent-control-server) -├── sdks/python/ # Python SDK (agent-control) -├── engine/ # Control evaluation engine (agent-control-engine) -├── evaluators/ # Evaluator implementations (agent-control-evaluators) -└── examples/ # Usage examples -``` +### Reporting Bugs -**Dependency flow:** -``` -SDK ──────────────────────────────────────┐ - ▼ -Server ──► Engine ──► Models ◄── Evaluators -``` +1. **Search first**: Check [GitHub Issues](https://github.com/agentcontrol/agent-control/issues) for duplicates. If you find one, add context in a comment rather than opening a new issue. +2. **Create an issue**: Make sure to fill out the template and give us enough context to understand and reproduce the issue. +3. **Keep issues focused.** One topic per issue. Link related issues rather than combining them. ---- +### Suggesting Features -## Development Setup +1. **Search first**: Check [existing feature requests](https://github.com/agentcontrol/agent-control/issues?q=is%3Aissue+label%3Aenhancement). +2. **Open an issue** with the `enhancement` label. Describe the use case, explain the value, provide examples, and consider alternatives. +3. Be open to discussion and iteration on your idea. -### Prerequisites +## 1. Integrate Agent Control into Your Agent Framework -- Python 3.12+ -- [uv](https://docs.astral.sh/uv/) (package manager) -- Docker (for server database) +We welcome examples showing how Agent Control works with different agent frameworks. -### Initial Setup +See existing examples in [examples/](examples/) for the expected structure. -```bash -# Clone the repository -git clone -cd agent-control +## 2. Contribute New Evaluators -# Install all dependencies (creates single .venv for workspace) -make sync +See [evaluators/contrib/template/README.md](evaluators/contrib/template/README.md) for instructions on how to create a new evaluator package. -# Install git hooks (recommended) -make hooks-install -``` +## 3. Improve Code and Documentation ---- +Your PR must fill out the template. + - All tests must pass. + - Code changes must be covered by behavioral tests. + - Related documentation must be updated. -## Working with Components +## License -### Models (`models/`) +Agent Control is Apache 2.0 licensed. See [LICENSE](LICENSE) for details. -Shared Pydantic models used by both server and SDK. 
- -```bash -# Location -models/src/agent_control_models/ - -# Key files -├── agent.py # Agent, Step models -├── controls.py # Control definitions, evaluators -├── evaluation.py # EvaluationRequest/Response -├── policy.py # Policy model -└── health.py # Health response -``` - -**When to modify:** -- Adding new API request/response models -- Changing shared data structures -- Adding validation rules - -**Testing:** -```bash -cd models -uv run pytest -``` - ---- - -### Server (`server/`) - -FastAPI server providing the Agent Control API. - -```bash -# Location -server/src/agent_control_server/ - -# Key files -├── main.py # FastAPI app entrypoint -├── endpoints/ # API route handlers -├── services/ # Business logic -└── db/ # Database models & queries -``` - -**Running the server:** -```bash -cd server - -# Start dependencies (PostgreSQL via Docker) -make start-dependencies - -# Run database migrations -make alembic-upgrade - -# Start server with hot-reload -make run -``` - -**Database migrations:** -```bash -cd server - -# Create new migration -make alembic-migrate MSG="add new column" - -# Apply migrations -make alembic-upgrade - -# Rollback one migration -make alembic-downgrade - -# View migration history -make alembic-history -``` - -**Testing:** -```bash -cd server -make test -``` - ---- - -### SDK (`sdks/python/`) - -Python client SDK for interacting with the Agent Control server. - -```bash -# Location -sdks/python/src/agent_control/ - -# Key files -├── __init__.py # Public API exports, init() function -├── client.py # AgentControlClient (HTTP client) -├── agents.py # Agent registration operations -├── policies.py # Policy management -├── controls.py # Control management -├── control_sets.py # Control set management -├── evaluation.py # Evaluation checks -├── control_decorators.py # @control decorator -└── evaluators/ # Evaluator system -``` - -**Key exports:** -```python -import agent_control - -# Initialization -agent_control.init(agent_name="...", agent_name="...") - -# Decorator -@agent_control.control() -async def my_function(): ... - -# Client -async with agent_control.AgentControlClient() as client: - await agent_control.agents.get_agent(client, "id") -``` - -**Testing:** -```bash -cd sdks/python -make test # Starts server automatically -``` - -**Adding new SDK functionality:** -1. Add operation function in appropriate module (e.g., `policies.py`) -2. Export in `__init__.py` if needed -3. Add tests in `tests/` -4. Update docstrings with examples - ---- - -### Engine (`engine/`) - -Core control evaluation logic. The engine loads evaluators and executes evaluations. - -```bash -# Location -engine/src/agent_control_engine/ - -# Key files -├── core.py # Main ControlEngine class -├── evaluators.py # Evaluator loader and caching -└── selectors.py # Data selection from payloads -``` - -**How it works:** -- The engine uses the evaluator registry to find evaluators -- Evaluators are cached for performance (LRU cache) -- Selectors extract data from payloads before evaluation - -**Testing:** -```bash -cd engine -make test -``` - -> **Note:** To add new evaluators, create an evaluator in `evaluators/` rather than modifying the engine directly. See the Evaluators section below. - ---- - -### Evaluators (`evaluators/`) - -Extensible evaluators for custom detection logic. 
- -```bash -evaluators/ -├── builtin/ # agent-control-evaluators package -│ ├── pyproject.toml -│ ├── src/agent_control_evaluators/ -│ │ ├── _base.py # Evaluator, EvaluatorConfig, EvaluatorMetadata -│ │ ├── _registry.py # register_evaluator, get_evaluator -│ │ ├── _discovery.py # Entry point discovery -│ │ ├── _factory.py # Instance caching -│ │ ├── regex/ # Type name: "regex" -│ │ ├── list/ # Type name: "list" -│ │ ├── json/ # Type name: "json" -│ │ └── sql/ # Type name: "sql" -│ └── tests/ -│ -└── extra/ # External evaluator packages - ├── galileo/ # agent-control-evaluator-galileo package - │ ├── pyproject.toml # Separate package with own entry points - │ ├── src/agent_control_evaluator_galileo/ - │ │ └── luna2/ # Type name: "galileo.luna2" - │ └── tests/ - └── template/ # Template for new external evaluators -``` - -> **Note:** Built-in evaluators live in the `builtin/` package. External evaluators are -> separate packages under `extra/`, each with their own `pyproject.toml` and entry points. - -**Creating a new evaluator:** - -Choose the appropriate type based on your use case: - -| Type | When to Use | Name Format | -|------|-------------|-------------| -| Built-in | Core functionality, no external deps | `my-evaluator` | -| External | External provider integration, optional deps | `provider.name` | -| Agent-scoped | Custom logic deployed with agent | `my-agent:custom` | - -### Creating an External Evaluator Package (Recommended for External Providers) - -External evaluators live in their own packages under `evaluators/extra/`. This example -creates an `acme.toxicity` evaluator as a separate package. - -**1. Copy the template and set up the package:** -```bash -cp -r evaluators/extra/template evaluators/extra/acme -cd evaluators/extra/acme -``` - -**2. Create `pyproject.toml`** (from the template): -```toml -[project] -name = "agent-control-evaluator-acme" -version = "1.0.0" -description = "Acme toxicity evaluator for agent-control" -requires-python = ">=3.12" -dependencies = [ - "agent-control-evaluators>=3.0.0", - "agent-control-models>=3.0.0", - "httpx>=0.24.0", # Your external dependencies -] - -[project.entry-points."agent_control.evaluators"] -"acme.toxicity" = "agent_control_evaluator_acme.toxicity:AcmeToxicityEvaluator" - -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[tool.hatch.build.targets.wheel] -packages = ["src/agent_control_evaluator_acme"] -``` - -**3. Create directory structure:** -```bash -mkdir -p src/agent_control_evaluator_acme/toxicity -touch src/agent_control_evaluator_acme/__init__.py -touch src/agent_control_evaluator_acme/toxicity/__init__.py -``` - -**4. Define configuration model (`toxicity/config.py`):** -```python -from pydantic import Field -from agent_control_evaluators import EvaluatorConfig - - -class AcmeToxicityEvaluatorConfig(EvaluatorConfig): - """Configuration for Acme Toxicity evaluator.""" - - threshold: float = Field( - default=0.7, - ge=0.0, - le=1.0, - description="Score threshold for triggering (0.0-1.0)", - ) - categories: list[str] = Field( - default_factory=lambda: ["hate", "violence"], - description="Toxicity categories to check", - ) -``` - -**5. 
Implement evaluator (`toxicity/evaluator.py`):** -```python -from typing import Any - -import httpx -from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator -from agent_control_models import EvaluatorResult - -from agent_control_evaluator_acme.toxicity.config import AcmeToxicityEvaluatorConfig - - -@register_evaluator -class AcmeToxicityEvaluator(Evaluator[AcmeToxicityEvaluatorConfig]): - """Acme Toxicity detection evaluator.""" - - metadata = EvaluatorMetadata( - name="acme.toxicity", # <-- External provider: org.name format - version="1.0.0", - description="Acme toxicity detection API", - requires_api_key=True, - timeout_ms=5000, - ) - config_model = AcmeToxicityEvaluatorConfig - - async def evaluate(self, data: Any) -> EvaluatorResult: - """Evaluate text for toxicity.""" - if data is None: - return EvaluatorResult(matched=False, confidence=1.0, message="No data") - - try: - score = await self._call_api(str(data)) - return EvaluatorResult( - matched=score >= self.config.threshold, - confidence=score, - message=f"Toxicity score: {score:.2f}", - ) - except Exception as e: - return EvaluatorResult( - matched=False, - confidence=0.0, - message=f"Evaluation failed: {e}", - error=str(e), - ) - - async def _call_api(self, text: str) -> float: - """Call Acme API and return toxicity score.""" - # Your implementation here - pass -``` - -**6. Export in `toxicity/__init__.py`:** -```python -from agent_control_evaluator_acme.toxicity.config import AcmeToxicityEvaluatorConfig -from agent_control_evaluator_acme.toxicity.evaluator import AcmeToxicityEvaluator - -__all__ = ["AcmeToxicityEvaluator", "AcmeToxicityEvaluatorConfig"] -``` - -**7. Add tests in `tests/`** and publish: -```bash -uv run pytest -uv build && uv publish -``` - -Once published, users install via `pip install agent-control-evaluator-acme` and the -evaluator is automatically discovered via entry points - -### Creating a Built-in Evaluator - -For evaluators with no external dependencies (to be included in core): - -1. Create directory: `evaluators/builtin/src/agent_control_evaluators/my_evaluator/` -2. Add `config.py` extending `EvaluatorConfig` -3. Add `evaluator.py` with `@register_evaluator` and simple name: `name="my-evaluator"` -4. Add entry point in `evaluators/builtin/pyproject.toml` -5. 
Import in `evaluators/builtin/src/agent_control_evaluators/__init__.py` for auto-registration: - ```python - from agent_control_evaluators.my_evaluator import MyEvaluator, MyEvaluatorConfig - ``` - -### Evaluator Best Practices - -**Thread Safety & Caching:** -- Evaluator instances are **cached and reused** across requests -- **DO NOT** store mutable request-scoped state on `self` -- Use local variables in `evaluate()` for request-specific data -- Initialize immutable resources in `__init__()` (compiled patterns, clients) - -**Error Handling:** -- Set `error` field for evaluator failures (API errors, timeouts) -- Return `matched=False` when `error` is set (fail-open) -- DO NOT set `error` for validation failures (bad input is a valid "matched" result) - -**Performance:** -- Pre-compile patterns in `__init__()` -- Use `asyncio.to_thread()` for CPU-bound work (see SQL evaluator) -- Respect `timeout_ms` config for external API calls - -**Config Validation:** -- Extend `EvaluatorConfig` (not plain `BaseModel`) -- Use Pydantic validators for complex rules -- Provide sensible defaults with `Field(default=...)` - ---- - -## Code Quality - -### Linting (Ruff) - -```bash -# Check all packages -make lint - -# Auto-fix issues -make lint-fix - -# Single package -cd server && make lint -``` - -### Type Checking (mypy) - -```bash -# Check all packages -make typecheck - -# Single package -cd sdks/python && make typecheck -``` - -### Pre-push Checks - -```bash -# Run all checks (test + lint + typecheck) -make check - -# Or manually run pre-push hook -make prepush -``` - ---- - -## Testing Conventions - -Write tests using **Given/When/Then** comments: - -```python -def test_create_control(client: TestClient) -> None: - # Given: a valid control payload - payload = {"name": "pii-protection"} - - # When: creating the control via API - response = client.put("/api/v1/controls", json=payload) - - # Then: the control is created successfully - assert response.status_code == 200 - assert "control_id" in response.json() -``` - -**Guidelines:** -- Keep tests small and focused -- Use explicit setup over hidden fixtures -- Test both success and error cases -- Mock external services (database, Galileo API) - ---- - -## Building & Publishing - -### Build Packages - -```bash -# Build all -make build - -# Build individual packages -make build-models -make build-server -make build-sdk -cd engine && make build -``` - -### Publish Packages - -```bash -# Publish all (requires PyPI credentials) -make publish - -# Publish individual packages -make publish-models -make publish-server -make publish-sdk -``` - -**Version bumping:** -Update `version` in respective `pyproject.toml` files: -- `models/pyproject.toml` -- `server/pyproject.toml` -- `sdks/python/pyproject.toml` -- `engine/pyproject.toml` -- `evaluators/builtin/pyproject.toml` -- `evaluators/extra/galileo/pyproject.toml` (and other external packages) - ---- - -## Git Workflow - -### Branch Naming - -- `feature/description` - New features -- `fix/description` - Bug fixes -- `refactor/description` - Code refactoring - -### Commit Messages - -Use conventional commits: -``` -feat: add policy assignment endpoint -fix: handle missing agent gracefully -refactor: extract evaluator logic to engine -docs: update SDK usage examples -test: add control set integration tests -``` - -### Pull Request Checklist - -- [ ] Tests pass (`make test`) -- [ ] Linting passes (`make lint`) -- [ ] Type checking passes (`make typecheck`) -- [ ] Documentation updated if needed -- [ ] Examples updated 
if API changed - ---- - -## Common Tasks - -### Add a new API endpoint - -1. Add Pydantic models in `models/` if needed -2. Add route handler in `server/src/agent_control_server/endpoints/` -3. Add service logic in `server/src/agent_control_server/services/` -4. Add SDK wrapper in `sdks/python/src/agent_control/` -5. Add tests for both server and SDK -6. Update examples if user-facing - -### Add a new evaluator - -See the **Evaluators** section above for detailed instructions. Summary: - -**Built-in evaluator:** -1. Create directory: `evaluators/builtin/src/agent_control_evaluators/my_evaluator/` -2. Add `config.py` extending `EvaluatorConfig` -3. Add `evaluator.py` with `@register_evaluator` decorator -4. Add entry point in `evaluators/builtin/pyproject.toml` -5. Add tests in `evaluators/builtin/tests/` - -**External evaluator (separate package):** -1. Copy template: `cp -r evaluators/extra/template evaluators/extra/myorg` -2. Create package with own `pyproject.toml` and entry points -3. Add tests and publish to PyPI - -### Update shared models - -1. Modify models in `models/src/agent_control_models/` -2. Run tests across all packages: `make test` -3. Update any affected server endpoints -4. Update SDK if client-facing - ---- - -## Quick Reference - -| Task | Command | -|------|---------| -| Install dependencies | `make sync` | -| Run server | `cd server && make run` | -| Run all tests | `make test` | -| Run linting | `make lint` | -| Run type checks | `make typecheck` | -| Run all checks | `make check` | -| Build packages | `make build` | -| Database migration | `cd server && make alembic-migrate MSG="..."` | - ---- - -## Evaluator Naming Conventions - -### Terminology - -There are three distinct concepts related to evaluators: - -| Concept | Definition | Example | -|---------|------------|---------| -| **Evaluator Type** | An implementation class with `evaluate()` method | `RegexEvaluator`, `Luna2Evaluator` | -| **Evaluator Schema** | Metadata about a custom type (name + JSON Schema for config validation) | Registered via `initAgent` | -| **Evaluator Config** | A saved configuration template (type + specific config values) | Stored via `/evaluator-configs` API | - -### Evaluator Type Name Formats - -Evaluator type names identify evaluator implementations. The format indicates the evaluator's origin: - -| Format | Origin | Examples | -|--------|--------|----------| -| `name` | Built-in (first-party, no dependencies) | `regex`, `list`, `json`, `sql` | -| `provider.name` | External (external providers, optional deps) | `galileo.luna2`, `nvidia.nemo` | -| `agent:name` | Agent-scoped (custom code deployed with agent) | `my-agent:pii-detector` | - -**Parsing rules:** -```python -if ":" in name: # Agent-scoped (split on first ":") - agent, evaluator = name.split(":", 1) -elif "." 
in name: # External provider (split on first ".") - provider, evaluator = name.split(".", 1) -else: # Built-in - evaluator = name -``` - -### Built-in vs Third-Party Evaluators - -**Built-in evaluators** (`regex`, `list`, `json`, `sql`): -- No namespace prefix -- Core dependencies only (included in base package) -- Imported and registered automatically on package import - -**External evaluators** (`galileo.luna2`): -- Use `provider.name` format with dot separator -- Are separate packages (e.g., `pip install agent-control-evaluator-galileo` or `pip install agent-control-evaluators[galileo]`) -- Discovered via Python entry points (not auto-imported) - -### Agent-Scoped Evaluators - -Agent-scoped evaluators (`my-agent:pii-detector`) are custom evaluator types that: -1. Are **implemented in the agent's code** (not in the evaluators package) -2. Have their **schema registered via `initAgent`** for config validation -3. Are **server-only** (SDK cannot run them locally) - -``` -Agent Code Server Database -┌─────────────────────┐ ┌─────────────────────────────┐ -│ @register_evaluator │ initAgent │ Agent: "my-agent" │ -│ class PIIDetector │ ─────────► │ Schemas: [{ │ -│ ... │ │ name: "pii-detector", │ -└─────────────────────┘ │ config_schema: {...} │ - │ }] │ - └─────────────────────────────┘ -``` - -Controls reference them as `my-agent:pii-detector` (the `:` indicates agent scope). - -### Folder and File Naming - -| Item | Convention | Example | -|------|------------|---------| -| Folder name | `snake_case` (Python package) | `galileo_luna2/` | -| Entry point key | Same as type name | `"galileo.luna2"` | -| Metadata name | Same as type name | `name="galileo.luna2"` | - -> **Note:** In code, use "provider" as the type identifier. In user-facing docs, -> use "external" as the descriptive term. - ---- - -## Evaluator Development Quick Reference - -| Task | Location | -|------|----------| -| Evaluator base class | `agent_control_evaluators.Evaluator` | -| Config base class | `agent_control_evaluators.EvaluatorConfig` | -| Evaluator metadata | `agent_control_evaluators.EvaluatorMetadata` | -| Evaluator result | `agent_control_models.EvaluatorResult` | -| Register decorator | `@agent_control_evaluators.register_evaluator` | -| Built-in evaluators | `evaluators/builtin/src/agent_control_evaluators/{regex,list,json,sql}/` | -| External evaluators | `evaluators/extra/galileo/` (separate packages) | -| Evaluator tests | `evaluators/builtin/tests/` or `evaluators/extra/*/tests/` | - -**Naming convention quick reference:** -``` -Built-in: regex, list, json, sql -External: galileo.luna2, nvidia.nemo -Agent-scoped: my-agent:pii-detector -``` - -**Evaluator config model fields:** -```python -from pydantic import Field -from agent_control_evaluators import EvaluatorConfig - -class MyEvaluatorConfig(EvaluatorConfig): - # Required field - pattern: str = Field(..., description="Pattern to match") - - # Optional with default - threshold: float = Field(0.5, ge=0.0, le=1.0) - - # List field - values: list[str] = Field(default_factory=list) -``` - -**EvaluatorResult fields:** -```python -EvaluatorResult( - matched=True, # Did this trigger the control? - confidence=0.95, # How confident (0.0-1.0)? - message="Explanation", # Human-readable message - metadata={"key": "val"} # Additional context -) -``` - ---- - -## Need Help? 
- -**Documentation:** See `docs/OVERVIEW.md` for architecture overview -**Examples:** Check `examples/` for usage patterns -**Tests:** Look at existing tests for patterns to follow +By contributing, you agree that your contributions will be licensed under the Apache 2.0 License. diff --git a/Makefile b/Makefile index 7d54fa70..599ec081 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ SDK_DIR := sdks/python TS_SDK_DIR := sdks/typescript ENGINE_DIR := engine EVALUATORS_DIR := evaluators/builtin -GALILEO_DIR := evaluators/extra/galileo +GALILEO_DIR := evaluators/contrib/galileo help: @echo "Agent Control - Makefile commands" @@ -30,7 +30,7 @@ help: @echo "" @echo "Test:" @echo " make test - run tests for core packages (server, engine, sdk, evaluators)" - @echo " make test-extras - run tests for extra evaluators (galileo, etc.)" + @echo " make test-extras - run tests for contrib evaluators (galileo, etc.)" @echo " make test-all - run all tests (core + extras)" @echo " make sdk-ts-test - run TypeScript SDK tests" @echo "" @@ -81,7 +81,7 @@ openapi-spec-check: openapi-spec test: server-test engine-test sdk-test evaluators-test -# Run tests for extra evaluators (not included in default test target) +# Run tests for contrib evaluators (not included in default test target) test-extras: galileo-test # Run all tests (core + extras) @@ -215,7 +215,7 @@ server-%: $(MAKE) -C $(SERVER_DIR) $(patsubst server-%,%,$@) # --------------------------- -# Extra Evaluators (Galileo) +# Contrib Evaluators (Galileo) # --------------------------- galileo-test: diff --git a/docs/OVERVIEW.md b/docs/OVERVIEW.md index 67f31de9..0614ddfa 100644 --- a/docs/OVERVIEW.md +++ b/docs/OVERVIEW.md @@ -661,8 +661,8 @@ We welcome contributions! See [CONTRIBUTING.md](../CONTRIBUTING.md) for guidelin ### Adding an Evaluator 1. Fork the repository -2. Create your evaluator in `evaluators/builtin/src/agent_control_evaluators/` (for builtins) or in `evaluators/extra/` (for external packages) -3. Add tests in `evaluators/tests/` +2. Create your evaluator in `evaluators/builtin/src/agent_control_evaluators/` (for builtins) or in `evaluators/contrib/` (for external packages) +3. Add tests in `evaluators/builtin/tests/` (for builtins) or `evaluators/contrib/*/tests/` (for contrib) 4. 
Submit a pull request --- diff --git a/evaluators/builtin/pyproject.toml b/evaluators/builtin/pyproject.toml index ccba8afe..7ccf8c5a 100644 --- a/evaluators/builtin/pyproject.toml +++ b/evaluators/builtin/pyproject.toml @@ -34,4 +34,4 @@ packages = ["src/agent_control_evaluators"] [tool.uv.sources] agent-control-models = { workspace = true } # For local dev: use local galileo package instead of PyPI -agent-control-evaluator-galileo = { path = "../extra/galileo", editable = true } +agent-control-evaluator-galileo = { path = "../contrib/galileo", editable = true } diff --git a/evaluators/extra/galileo/Makefile b/evaluators/contrib/galileo/Makefile similarity index 100% rename from evaluators/extra/galileo/Makefile rename to evaluators/contrib/galileo/Makefile diff --git a/evaluators/extra/galileo/README.md b/evaluators/contrib/galileo/README.md similarity index 100% rename from evaluators/extra/galileo/README.md rename to evaluators/contrib/galileo/README.md diff --git a/evaluators/extra/galileo/pyproject.toml b/evaluators/contrib/galileo/pyproject.toml similarity index 100% rename from evaluators/extra/galileo/pyproject.toml rename to evaluators/contrib/galileo/pyproject.toml diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py similarity index 100% rename from evaluators/extra/galileo/src/agent_control_evaluator_galileo/__init__.py rename to evaluators/contrib/galileo/src/agent_control_evaluator_galileo/__init__.py diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/__init__.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna2/__init__.py similarity index 100% rename from evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/__init__.py rename to evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna2/__init__.py diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/client.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna2/client.py similarity index 100% rename from evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/client.py rename to evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna2/client.py diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/config.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna2/config.py similarity index 100% rename from evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/config.py rename to evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna2/config.py diff --git a/evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py b/evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py similarity index 100% rename from evaluators/extra/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py rename to evaluators/contrib/galileo/src/agent_control_evaluator_galileo/luna2/evaluator.py diff --git a/evaluators/extra/galileo/tests/__init__.py b/evaluators/contrib/galileo/tests/__init__.py similarity index 100% rename from evaluators/extra/galileo/tests/__init__.py rename to evaluators/contrib/galileo/tests/__init__.py diff --git a/evaluators/extra/galileo/tests/test_luna2_evaluator.py b/evaluators/contrib/galileo/tests/test_luna2_evaluator.py similarity index 100% rename from evaluators/extra/galileo/tests/test_luna2_evaluator.py rename to 
evaluators/contrib/galileo/tests/test_luna2_evaluator.py diff --git a/evaluators/contrib/template/README.md b/evaluators/contrib/template/README.md new file mode 100644 index 00000000..b71e1dff --- /dev/null +++ b/evaluators/contrib/template/README.md @@ -0,0 +1,230 @@ +Creating a Contrib Evaluator +============================ + +This guide walks you through building and publishing a new evaluator package for agent-control. Start to finish, it takes about 15 minutes. + +For a working reference, see the [Galileo evaluator](../galileo/). + +Quick Start +----------- + +Pick your org name and evaluator name. Everything else derives from these two: + +> **Example**: org = `acme`, evaluator = `toxicity` +> +> * PyPI package: `agent-control-evaluator-acme` +> * Python package: `agent_control_evaluator_acme` +> * Entry point: `acme.toxicity` +> * Evaluator class: `AcmeToxicityEvaluator` + +From the repo root: + +```bash +cp -r evaluators/contrib/template/ evaluators/contrib/acme/ +mv evaluators/contrib/acme/pyproject.toml.template evaluators/contrib/acme/pyproject.toml +``` + +Edit `pyproject.toml` and replace all placeholders (`{{ORG}}`, `{{EVALUATOR}}`, `{{CLASS}}`, `{{AUTHOR}}`). Then create the source layout: + +```bash +mkdir -p evaluators/contrib/acme/src/agent_control_evaluator_acme/toxicity +mkdir -p evaluators/contrib/acme/tests +touch evaluators/contrib/acme/src/agent_control_evaluator_acme/__init__.py +touch evaluators/contrib/acme/src/agent_control_evaluator_acme/toxicity/__init__.py +touch evaluators/contrib/acme/src/agent_control_evaluator_acme/toxicity/config.py +touch evaluators/contrib/acme/src/agent_control_evaluator_acme/toxicity/evaluator.py +touch evaluators/contrib/acme/tests/__init__.py +touch evaluators/contrib/acme/tests/test_toxicity.py +``` + +You'll end up with: + +``` +acme/ +├── pyproject.toml +├── src/agent_control_evaluator_acme/ +│ ├── __init__.py +│ └── toxicity/ +│ ├── __init__.py +│ ├── config.py +│ └── evaluator.py +└── tests/ + ├── __init__.py + └── test_toxicity.py +``` + +Writing the Evaluator +--------------------- + +**Config** - extend `EvaluatorConfig` with your evaluator's settings: + +```python +# toxicity/config.py +from pydantic import Field +from agent_control_evaluators import EvaluatorConfig + +class AcmeToxicityConfig(EvaluatorConfig): + threshold: float = Field(default=0.7, ge=0.0, le=1.0) + categories: list[str] = Field(default_factory=lambda: ["hate", "violence"]) +``` + +**Evaluator** - extend `Evaluator` and decorate with `@register_evaluator`: + +```python +# toxicity/evaluator.py +from typing import Any + +from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator +from agent_control_models import EvaluatorResult + +from agent_control_evaluator_acme.toxicity.config import AcmeToxicityConfig + +@register_evaluator +class AcmeToxicityEvaluator(Evaluator[AcmeToxicityConfig]): + metadata = EvaluatorMetadata( + name="acme.toxicity", # Must match entry point key exactly + version="1.0.0", + description="Acme toxicity detection", + requires_api_key=True, # Set if you need external credentials + timeout_ms=5000, # Timeout for external calls + ) + config_model = AcmeToxicityConfig + + async def evaluate(self, data: Any) -> EvaluatorResult: + if data is None: + return EvaluatorResult(matched=False, confidence=1.0, message="No data") + + try: + score = await self._score(str(data)) + return EvaluatorResult( + matched=score >= self.config.threshold, + confidence=score, + message=f"Toxicity: {score:.2f}", + ) + except Exception as e: 
+ # Fail-open on infrastructure errors + return EvaluatorResult( + matched=False, confidence=0.0, + message=f"Failed: {e}", error=str(e), + ) + + async def _score(self, text: str) -> float: + # Your API call or local logic here + ... +``` + +**Entry point** in `pyproject.toml` - this is how discovery finds your evaluator: + +```toml +[project.entry-points."agent_control.evaluators"] +"acme.toxicity" = "agent_control_evaluator_acme.toxicity:AcmeToxicityEvaluator" +``` + +The entry point key (`acme.toxicity`) must exactly match `metadata.name` in the evaluator class. If these don't match, `get_evaluator()` returns `None`. + +**Exports** in `toxicity/__init__.py`: + +```python +from agent_control_evaluator_acme.toxicity.config import AcmeToxicityConfig +from agent_control_evaluator_acme.toxicity.evaluator import AcmeToxicityEvaluator + +__all__ = ["AcmeToxicityEvaluator", "AcmeToxicityConfig"] +``` + +Testing +------- + +Write tests using Given/When/Then style. Cover at least three cases: + +1. **Null input** - returns `matched=False`, no error +2. **Normal evaluation** - returns correct `matched` based on threshold +3. **Infrastructure failure** - returns `matched=False` with `error` set (fail-open) + +```python +# tests/test_toxicity.py +import pytest +from agent_control_evaluator_acme.toxicity import AcmeToxicityEvaluator, AcmeToxicityConfig + +@pytest.fixture +def evaluator() -> AcmeToxicityEvaluator: + return AcmeToxicityEvaluator(AcmeToxicityConfig(threshold=0.5)) + +@pytest.mark.asyncio +async def test_none_input(evaluator): + result = await evaluator.evaluate(None) + assert result.matched is False + assert result.error is None + +@pytest.mark.asyncio +async def test_score_above_threshold_matches(evaluator, monkeypatch): + async def _high(self, text): + return 0.8 + + monkeypatch.setattr(AcmeToxicityEvaluator, "_score", _high) + result = await evaluator.evaluate("test") + assert result.matched is True + assert result.error is None + +@pytest.mark.asyncio +async def test_api_failure_fails_open(evaluator, monkeypatch): + async def _fail(self, text): + raise ConnectionError("timeout") + + monkeypatch.setattr(AcmeToxicityEvaluator, "_score", _fail) + result = await evaluator.evaluate("test") + assert result.matched is False + assert result.error is not None +``` + +Rules to Know +------------- + +**Error handling** - The `error` field is only for infrastructure failures (network errors, API 500s, missing credentials). If your evaluator ran and produced a judgment, that's `matched=True` or `matched=False` - not an error. When `error` is set, `matched` must be `False` (fail-open). + +**Thread safety** - Evaluator instances are cached and reused across concurrent requests. Never store request-scoped state on `self`. Use local variables in `evaluate()`. + +**Performance** - Pre-compile patterns in `__init__()`. Use `asyncio.to_thread()` for CPU-bound work. Respect `timeout_ms` for external calls. 
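+To make the thread-safety and performance rules concrete, here is a minimal sketch. It is an illustration only: the `acme.blocklist` name, its config fields, and the `_scan` helper are invented for this example, and it assumes the base `Evaluator` constructor accepts the config instance (as the test fixture above suggests).
+
+```python
+# Hypothetical "acme.blocklist" evaluator illustrating the caching rules.
+import asyncio
+import re
+from typing import Any
+
+from pydantic import Field
+from agent_control_evaluators import Evaluator, EvaluatorConfig, EvaluatorMetadata, register_evaluator
+from agent_control_models import EvaluatorResult
+
+class AcmeBlocklistConfig(EvaluatorConfig):
+    patterns: list[str] = Field(default_factory=list)
+
+@register_evaluator
+class AcmeBlocklistEvaluator(Evaluator[AcmeBlocklistConfig]):
+    metadata = EvaluatorMetadata(
+        name="acme.blocklist",
+        version="1.0.0",
+        description="Blocklist matching (illustration only)",
+        timeout_ms=1000,
+    )
+    config_model = AcmeBlocklistConfig
+
+    def __init__(self, config: AcmeBlocklistConfig) -> None:
+        super().__init__(config)
+        # Compiled once; immutable afterwards, so sharing on self across
+        # concurrent requests is safe.
+        self._compiled = [re.compile(p) for p in config.patterns]
+
+    async def evaluate(self, data: Any) -> EvaluatorResult:
+        if data is None:
+            return EvaluatorResult(matched=False, confidence=1.0, message="No data")
+        text = str(data)  # request-scoped: keep in locals, never on self
+        try:
+            # Run the CPU-bound scan off the event loop, bounded by timeout_ms.
+            hits = await asyncio.wait_for(
+                asyncio.to_thread(self._scan, text),
+                timeout=self.metadata.timeout_ms / 1000,
+            )
+        except asyncio.TimeoutError as e:
+            # Infrastructure failure: fail open with error set.
+            return EvaluatorResult(matched=False, confidence=0.0, message="Timed out", error=str(e))
+        return EvaluatorResult(
+            matched=bool(hits),
+            confidence=1.0,
+            message=f"{len(hits)} pattern(s) matched",
+        )
+
+    def _scan(self, text: str) -> list[str]:
+        return [p.pattern for p in self._compiled if p.search(text)]
+```
+
+Note the split: everything derived from the request (`text`, `hits`) stays in local variables, while only immutable state (`self._compiled`) lives on the cached instance.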
+ +Before You Submit +----------------- + +From the repo root: + +```bash +PKG=evaluators/contrib/acme + +# Check for leftover placeholders (should print nothing; non-zero exit is OK here) +grep -rn '{{ORG}}\|{{EVALUATOR}}\|{{CLASS}}\|{{AUTHOR}}' "$PKG"/ || true + +# Lint, typecheck, test +(cd "$PKG" && uv run --extra dev ruff check --config ../../../pyproject.toml src/) +(cd "$PKG" && uv run --extra dev mypy --config-file ../../../pyproject.toml src/) +(cd "$PKG" && uv run pytest) + +# Verify discovery works +(cd "$PKG" && uv run python -c " +from agent_control_evaluators import discover_evaluators, get_evaluator +discover_evaluators() +ev = get_evaluator('acme.toxicity') +assert ev is not None, 'Discovery failed - entry point key does not match metadata.name' +print(f'OK: {ev.metadata.name}') +") + +# Build +(cd "$PKG" && uv build) +``` + +Publishing +---------- + +```bash +(cd evaluators/contrib/acme && uv build && uv publish) +``` + +Users install with `pip install agent-control-evaluator-acme` and the evaluator is discovered automatically. + +Reference +--------- + +* [Galileo evaluator](../galileo/) - complete working example +* [Built-in evaluators](../../builtin/src/agent_control_evaluators/) - regex, list, json, sql patterns diff --git a/evaluators/extra/template/pyproject.toml.template b/evaluators/contrib/template/pyproject.toml.template similarity index 100% rename from evaluators/extra/template/pyproject.toml.template rename to evaluators/contrib/template/pyproject.toml.template diff --git a/evaluators/extra/template/README.md b/evaluators/extra/template/README.md deleted file mode 100644 index cd52e07b..00000000 --- a/evaluators/extra/template/README.md +++ /dev/null @@ -1,100 +0,0 @@ -# Evaluator Package Template - -This template provides a starting point for creating new evaluator packages for agent-control. - -## Setup - -1. Copy this template: `cp -r template/ {{org}}/` -2. Replace placeholders in pyproject.toml.template: - - `{{ORG}}`: Your organization name (e.g., `acme`) - - `{{EVALUATOR}}`: Evaluator name (e.g., `toxicity`) - - `{{CLASS}}`: Python class name (e.g., `ToxicityEvaluator`) - - `{{AUTHOR}}`: Author name -3. Rename to `pyproject.toml` -4. Create your evaluator in `src/agent_control_evaluator_{{org}}/` -5. Register via entry point in pyproject.toml - -## Directory Structure - -``` -{{org}}/ -├── pyproject.toml -├── src/agent_control_evaluator_{{org}}/ -│ ├── __init__.py -│ └── {{evaluator}}/ -│ ├── __init__.py -│ ├── config.py # Extends EvaluatorConfig -│ └── evaluator.py # Extends Evaluator, uses @register_evaluator -└── tests/ - ├── __init__.py - └── test_{{evaluator}}.py -``` - -## Entry Point Naming Convention - -Use `org.evaluator_name` format (e.g., `acme.toxicity`). - -This naming convention: -- Uses dots (`.`) as separators for external evaluators -- Distinguishes from built-in evaluators (no namespace) and agent-scoped evaluators (colon separator) - -## Implementation Pattern - -Your evaluator should: - -1. **Extend `EvaluatorConfig`** for configuration: - -```python -from agent_control_evaluators import EvaluatorConfig - -class MyEvaluatorConfig(EvaluatorConfig): - threshold: float = 0.5 - # ... other config fields -``` - -2. 
**Extend `Evaluator` and use `@register_evaluator`**: - -```python -from agent_control_evaluators import Evaluator, EvaluatorMetadata, register_evaluator -from agent_control_models import EvaluatorResult - -@register_evaluator -class MyEvaluator(Evaluator[MyEvaluatorConfig]): - metadata = EvaluatorMetadata( - name="myorg.myevaluator", # Must match entry point - version="1.0.0", - description="My custom evaluator", - ) - config_model = MyEvaluatorConfig - - async def evaluate(self, data: Any) -> EvaluatorResult: - # Your evaluation logic here - return EvaluatorResult( - matched=..., - confidence=..., - message=..., - ) -``` - -## Testing - -Run tests with: -```bash -cd evaluators/extra/{{org}} -uv run pytest -``` - -## Publishing - -Build and publish your package: -```bash -uv build -uv publish -``` - -Once published, users can install via: -```bash -pip install agent-control-evaluator-{{org}} -``` - -The evaluator will be automatically discovered via entry points when used alongside agent-control. diff --git a/examples/galileo/README.md b/examples/galileo/README.md index b87170db..ad3d8aac 100644 --- a/examples/galileo/README.md +++ b/examples/galileo/README.md @@ -84,4 +84,4 @@ Testing toxicity detection with Central Stage... - [Galileo Protect Overview](https://v2docs.galileo.ai/concepts/protect/overview) - [Luna-2 Python API Reference](https://v2docs.galileo.ai/sdk-api/python/reference/protect) -- [Agent Control Luna-2 Evaluator](../../evaluators/extra/galileo/) +- [Agent Control Luna-2 Evaluator](../../evaluators/contrib/galileo/) diff --git a/pyproject.toml b/pyproject.toml index e66ef1bf..61550e85 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,7 @@ members = [ "sdks/python", "engine", "evaluators/builtin", - # NOTE: evaluators/extra/* excluded - install separately when needed + # NOTE: evaluators/contrib/* excluded - install separately when needed ] [tool.uv] @@ -69,7 +69,7 @@ version_toml = [ "sdks/python/pyproject.toml:project.version", "server/pyproject.toml:project.version", "evaluators/builtin/pyproject.toml:project.version", - "evaluators/extra/galileo/pyproject.toml:project.version", + "evaluators/contrib/galileo/pyproject.toml:project.version", ] version_source = "tag" commit_message = "chore(release): v{version}" diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index c8449fcf..95748ecc 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -79,4 +79,4 @@ agent-control-models = { workspace = true } agent-control-engine = { workspace = true } agent-control-evaluators = { workspace = true } # For local dev: use local galileo package instead of PyPI -agent-control-evaluator-galileo = { path = "../../evaluators/extra/galileo", editable = true } +agent-control-evaluator-galileo = { path = "../../evaluators/contrib/galileo", editable = true } diff --git a/server/pyproject.toml b/server/pyproject.toml index 2d214746..dd69535b 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -88,4 +88,4 @@ agent-control-models = { workspace = true } agent-control-engine = { workspace = true } agent-control-evaluators = { workspace = true } # For local dev: use local galileo package instead of PyPI -agent-control-evaluator-galileo = { path = "../evaluators/extra/galileo", editable = true } +agent-control-evaluator-galileo = { path = "../evaluators/contrib/galileo", editable = true }