This guide explains how to create benchmarks for agents that interact with ERP systems (Odoo, SAP, Oracle EBS, Microsoft Dynamics, etc.).
Unlike simple answer-checking benchmarks, ERP workflows require:
- External System State Management - Set up and verify state in a live system
- API/Database Integration - Connect to real ERP instances
- Side Effect Verification - Check that changes were actually made
- Cleanup/Reset - Return system to known state between tests
┌─────────────────┐
│ Agent in │ Reads config ┌──────────────────┐
│ Docker │─────────────────>│ erp_config.json │
│ Container │ │ (credentials) │
└────────┬────────┘ └──────────────────┘
│
│ Makes API calls
↓
┌─────────────────────────────────────────────┐
│ ERP System │
│ (Odoo / SAP / Oracle / Dynamics / etc.) │
│ │
│ ┌──────────┐ ┌─────────┐ ┌──────────┐ │
│ │ Orders │ │Invoices │ │Customers │ │
│ └──────────┘ └─────────┘ └──────────┘ │
└────────┬────────────────────────────────────┘
│
│ Verifies changes
↓
┌─────────────────┐
│ Benchmark │
│ Scoring │
└─────────────────┘
import asyncio

import requests


class ERPClient:
    """Minimal async REST client for a generic ERP HTTP API.

    Authenticates with a bearer token and fetches records from
    GET <url>/api/<entity_type>/<entity_id>.
    """

    def __init__(self, url, api_key):
        self.url = url          # base URL, e.g. "https://erp.example.com"
        self.api_key = api_key  # bearer token sent in the Authorization header

    async def get_entity(self, entity_type, entity_id):
        """Fetch one record and return its JSON body as a dict.

        `requests` is a synchronous library, so the blocking call is pushed
        onto a worker thread with asyncio.to_thread; awaiting
        `requests.get(...)` directly would raise TypeError because a
        Response object is not awaitable.
        """
        response = await asyncio.to_thread(
            requests.get,
            f"{self.url}/api/{entity_type}/{entity_id}",
            headers={"Authorization": f"Bearer {self.api_key}"},
        )
        # Fail loudly on HTTP errors instead of json-decoding an error page.
        response.raise_for_status()
        return response.json()


import asyncpg
class ERPDatabase:
    # Thin asyncpg wrapper for direct database-level state verification,
    # an alternative to going through the ERP's HTTP API.
    async def connect(self, dsn):
        # dsn example: "postgresql://user:pass@host:5432/erpdb"
        self.conn = await asyncpg.connect(dsn)
    async def query_entity(self, table, id):
        # Fetch a single row by primary key, or None if absent.
        # NOTE(review): `table` is interpolated into the SQL string (identifiers
        # cannot be bound parameters in Postgres) -- only ever pass trusted,
        # hard-coded table names here, never agent-supplied input.
        return await self.conn.fetchrow(
            f"SELECT * FROM {table} WHERE id = $1", id
        )import xmlrpc.client
class OdooClient:
    """XML-RPC client for Odoo's external API (/xmlrpc/2 endpoints)."""
    def __init__(self, url, db, username, password):
        # 'common' handles authentication; 'object' executes model methods.
        self.common = xmlrpc.client.ServerProxy(f'{url}/xmlrpc/2/common')
        self.models = xmlrpc.client.ServerProxy(f'{url}/xmlrpc/2/object')
        # NOTE(review): authenticate() returns False (not an exception) on bad
        # credentials -- self.uid should be checked before first use.
        self.uid = self.common.authenticate(db, username, password, {})
        self.db = db
        self.password = password
    def read_record(self, model, record_id):
        # Odoo's 'read' takes a list of ids and returns a list of record dicts.
        return self.models.execute_kw(
            self.db, self.uid, self.password,
            model, 'read', [[record_id]]
        )from .base import BaseBenchmark, Problem
from dataclasses import dataclass
from typing import Optional
@dataclass
class ERPProblem(Problem):
    """Extended Problem with ERP-specific fields.

    All extra fields default to None so that non-ERP construction paths of
    the base Problem keep working unchanged.
    """
    entity_type: Optional[str] = None       # ERP model name, e.g. 'sale.order'
    entity_id: Optional[str] = None         # specific record ID to operate on
    expected_changes: Optional[dict] = None  # field -> expected value after the task
class MyERPBenchmark(BaseBenchmark):
    # Benchmark that scores agents by inspecting live state in an ERP system.
    name = "my_erp_workflow"
    def __init__(self, erp_url, api_key, **kwargs):
        super().__init__(**kwargs)
        # One shared client, reused for every problem (connection reuse).
        self.erp_client = ERPClient(erp_url, api_key)
        self._problems = self._load_problems()Provide the agent with:
- ERP connection credentials
- Entity IDs to work with
- Supporting documents/data
async def setup_problem(self, problem, problem_data_dir, container_name):
    """Prepare one problem: write credentials, reset ERP state, add files.

    `problem_data_dir` is mounted into the agent's container, so anything
    written here becomes visible to the agent.
    """
    # 1. Create config file with credentials
    # NOTE(review): requires `import json` at module level.
    config = {
        "erp_url": self.erp_client.url,
        "api_key": self.erp_client.api_key,
        "entity_id": problem.entity_id,
        "entity_type": problem.entity_type
    }
    (problem_data_dir / "erp_config.json").write_text(
        json.dumps(config, indent=2)
    )
    # 2. Reset entity to known initial state
    if problem.initial_state:
        await self.erp_client.update_entity(
            problem.entity_type,
            problem.entity_id,
            problem.initial_state
        )
    # 3. Create supporting files if needed
    if problem.needs_approval_doc:
        # Placeholder document content; a .pdf extension on plain text is
        # intentional here -- the agent only needs the file to exist.
        (problem_data_dir / "approval.pdf").write_text(
            "Approved: Credit increase"
        )Verify changes in the actual ERP system:
async def score_problem(self, problem, agent_workdir,
                        agent_answer_dir, container_name):
    """Score by comparing live ERP state against problem.expected_changes.

    Returns (score, error_msg, discussion) where score is the fraction of
    expected fields whose live value matches.
    """
    # Read agent's submission
    answer_path = Path(agent_answer_dir) / "answer.txt"
    submitted = answer_path.read_text().strip()
    # Query ERP to get actual current state
    current_state = await self.erp_client.get_entity(
        problem.entity_type,
        problem.entity_id
    )
    # Verify expected changes were made
    score = 0.0
    errors = []
    # NOTE(review): assumes expected_changes is non-empty; an empty dict
    # would make the division below impossible (loop never runs, score 0).
    for field, expected_value in problem.expected_changes.items():
        actual_value = current_state.get(field)
        if actual_value == expected_value:
            # Each expected field contributes an equal share of the score.
            score += 1.0 / len(problem.expected_changes)
        else:
            errors.append(
                f"{field}: expected {expected_value}, got {actual_value}"
            )
    error_msg = "; ".join(errors) if errors else None
    # NOTE(review): `submitted` is read but never compared against
    # problem.answer here -- confirm whether that check was intended.
    return score, error_msg, problem.answer_discussionReset state between problems:
async def cleanup_problem(self, problem):
    """Reset ERP state after test."""
    # Restore the same initial_state that setup_problem applied, so the
    # next run (or the next problem) starts from a known baseline.
    if problem.initial_state:
        await self.erp_client.update_entity(
            problem.entity_type,
            problem.entity_id,
            problem.initial_state
        )ERPProblem(
problem_id="discount_001",
statement="""
Access the ERP and apply a 10% discount to sale order SO12345.
Ensure the order remains in 'draft' state.
Submit 'DONE' when complete.
""",
answer="DONE",
entity_type="sale.order",
entity_id="SO12345",
initial_state={"discount": 0.0, "state": "draft"},
expected_changes={"discount": 10.0, "state": "draft"}
)ERPProblem(
problem_id="invoice_002",
statement="""
Verify invoice INV-2026-001 totals are correct.
If correct, change state from 'draft' to 'posted'.
Submit the final total amount.
""",
answer="1500.00",
entity_type="account.invoice",
entity_id="INV-2026-001",
expected_changes={"state": "posted"}
)ERPProblem(
problem_id="batch_003",
statement="""
Find all orders with status 'confirmed' and
delivery_date before 2026-02-01.
Update their delivery dates to 2026-03-15.
Submit the count of updated orders.
""",
answer="23", # Expected count
verification_query="count_delivery_updates"
)- Set up a dedicated test ERP instance
- Pre-populate with test data
- Reset after each benchmark run
- Wrap each problem in a database transaction
- Rollback after verification
- Requires database-level access
- Capture state before problem
- Verify changes
- Restore from snapshot
class MockERPClient:
    """In-memory stand-in for ERPClient, for running benchmarks offline.

    Records live in a flat dict keyed by "<entity_type>:<entity_id>".
    """

    def __init__(self):
        self.entities = {}

    @staticmethod
    def _key(entity_type, entity_id):
        # Single place that defines the composite storage key.
        return f"{entity_type}:{entity_id}"

    async def get_entity(self, entity_type, entity_id):
        """Return the stored record dict, or None if it was never written."""
        return self.entities.get(self._key(entity_type, entity_id))

    async def update_entity(self, entity_type, entity_id, values):
        """Merge `values` into the stored record, creating it if needed."""
        slot = self._key(entity_type, entity_id)
        record = dict(self.entities.get(slot, {}))
        record.update(values)
        self.entities[slot] = record
# Create
problem = ERPProblem(
statement="Create customer with name 'Acme Corp'",
answer="CUST_NEW", # Will be the new ID
entity_type="res.partner",
expected_changes={"name": "Acme Corp", "customer": True}
)
# Read - verify data retrieval
# Update - modify existing record
# Delete - remove record and verifyERPProblem(
statement="Move order through workflow: draft -> confirmed -> done",
entity_type="sale.order",
initial_state={"state": "draft"},
expected_changes={"state": "done"}
)ERPProblem(
statement="Only post invoice if total matches sale order",
verification_query="validate_invoice_total"
)
# In score_problem:
async def _verify_validation(self, problem):
    # Cross-document check: the invoice must both match its sale order's
    # total AND have been moved to the 'posted' state by the agent.
    invoice = await self.get_invoice(problem.entity_id)
    order = await self.get_order(invoice.order_id)
    if invoice.total == order.total and invoice.state == "posted":
        return 1.0, None
    else:
        return 0.0, "Validation failed or incorrect state"ERPProblem(
statement="""
Apply early payment discount if:
- Payment received within 10 days
- Order amount > 1000
- Customer has good credit
Calculate and apply 2% discount.
"""
)-
Credentials Management
- Never commit real credentials
- Use environment variables
- Consider secrets management (Vault, etc.)
-
Access Control
- Use read-only API keys when possible
- Limit permissions to test data only
- Monitor agent actions
-
Data Isolation
- Separate test environment
- Namespaced test data (e.g., all IDs start with "TEST_")
- Automated cleanup
-
Rate Limiting
- Be aware of API rate limits
- Implement backoff strategies
- Cache frequently accessed data
-
Connection Pooling
# Reuse connections across problems
self.erp_client = ERPClient()  # Initialize once
-
Parallel Verification
# Verify multiple fields concurrently
results = await asyncio.gather(
    self.verify_field_1(),
    self.verify_field_2(),
    self.verify_field_3()
)
-
Caching
# Cache reference data that doesn't change
self.product_catalog = await self.erp_client.get_products()
-
Log All API Calls
logger.info(f"Querying {entity_type} {entity_id}")
logger.debug(f"Response: {response}")
-
Save State Snapshots
before_state = await self.get_entity_state()
# ... agent works ...
after_state = await self.get_entity_state()
snapshot_file.write_text(json.dumps({
    "before": before_state,
    "after": after_state
}))
-
Verification Reports Generate detailed reports showing:
- What was expected
- What was found
- All checked fields
- Execution logs
See erp_workflow_example.py for a complete implementation.
For Odoo-specific integration:
import xmlrpc.client
class OdooERPBenchmark(BaseBenchmark):
    """Benchmark variant that verifies agent work over Odoo's XML-RPC API."""
    def __init__(self, url, db, username, password, **kwargs):
        super().__init__(**kwargs)
        # The 'common' endpoint is only needed once, for authentication.
        common = xmlrpc.client.ServerProxy(f'{url}/xmlrpc/2/common')
        self.models = xmlrpc.client.ServerProxy(f'{url}/xmlrpc/2/object')
        # NOTE(review): authenticate() returns False on bad credentials --
        # self.uid should be checked before first use.
        self.uid = common.authenticate(db, username, password, {})
        self.db = db
        self.password = password
    def execute(self, model, method, *args):
        # Generic passthrough to Odoo's execute_kw RPC call.
        return self.models.execute_kw(
            self.db, self.uid, self.password,
            model, method, args
        )
    # NOTE: the '...' in this signature is illustrative pseudocode -- see
    # score_problem in MyERPBenchmark above for the full parameter list.
    async def score_problem(self, problem, ...):
        # Read current state from Odoo
        record = self.execute(
            'sale.order', 'read',
            [problem.entity_id],
            {'fields': ['discount', 'state']}
        )[0]
        # Verify changes
        # ...- Choose integration method (API/Database/SDK)
- Create ERP client wrapper
- Extend Problem class with ERP fields
- Implement setup_problem() to provide credentials
- Implement score_problem() to verify ERP state
- Add cleanup/reset logic
- Set up test environment
- Add logging and debugging
- Test with sample problems
- Document ERP-specific setup requirements
For more examples, see erp_workflow_example.py