From 96e55acfd8f6918fa77489d908f0f19c0857da4a Mon Sep 17 00:00:00 2001 From: Armaan Agrawal Date: Thu, 2 Apr 2026 16:29:09 -0400 Subject: [PATCH 1/3] docs: add AgentOpt skill for offline evaluation workflows --- .codex/skills/agentopt/SKILL.md | 220 ++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 .codex/skills/agentopt/SKILL.md diff --git a/.codex/skills/agentopt/SKILL.md b/.codex/skills/agentopt/SKILL.md new file mode 100644 index 0000000..899a7b9 --- /dev/null +++ b/.codex/skills/agentopt/SKILL.md @@ -0,0 +1,220 @@ +--- +name: agentopt +description: Use this skill when optimizing LLM model combinations for an existing agent workflow with AgentOpt, especially when creating an offline evaluation dataset, choosing selector/concurrency settings, and exporting benchmark-ready artifacts. +metadata: + short-description: Optimize agent model combos with offline eval +--- + +# AgentOpt Skill + +Use this skill when the user wants to: +- run model selection for an agent pipeline (single-step or multi-step), +- create or clean an offline evaluation dataset, +- tune `method` / `parallel` / `max_concurrent`, +- produce shareable benchmark outputs (CSV + best config + run metadata). + +## Fast Workflow + +1. **Wrap agent with AgentOpt contract** +2. **Create/validate offline dataset** +3. **Define candidate model space** +4. **Run selector with explicit concurrency budget** +5. **Export artifacts for review/submission** + +--- + +## 1) Agent Contract (required) + +AgentOpt expects: +- `__init__(self, models)` where `models` is a dict like `{"planner": "gpt-4o-mini", "solver": "gpt-4o"}` +- `run(self, input_data)` returning output for one datapoint + +```python +class MyAgent: + def __init__(self, models): + self.planner_model = models["planner"] + self.solver_model = models["solver"] + + def run(self, input_data): + # call your framework here (OpenAI / LangChain / CrewAI / etc.) 
+ return {"answer": "..."} +``` + +No inheritance/base class is required (duck typing). + +--- + +## 2) Offline Dataset Creation (required) + +### Dataset shape AgentOpt enforces + +Dataset must: +- support `len(dataset)` and `dataset[i]`, +- be non-empty, +- contain elements that unpack as `(input_data, expected_answer)`. + +Canonical form: + +```python +dataset = [ + ({"question": "What is 2+2?"}, "4"), + ({"question": "Capital of France?"}, "Paris"), +] +``` + +### Recommended JSONL schema for offline evaluation + +Use JSONL with one object per line: + +```json +{"input": {"question": "What is 2+2?"}, "expected": "4", "id": "sample-0001"} +{"input": {"question": "Capital of France?"}, "expected": "Paris", "id": "sample-0002"} +``` + +### Build dataset from traces/logs (example) + +```python +import json +from pathlib import Path + +def load_offline_dataset(path: str, limit: int | None = None): + rows = [] + with Path(path).open("r", encoding="utf-8") as f: + for line in f: + obj = json.loads(line) + # Keep only labeled rows + if "input" not in obj or "expected" not in obj: + continue + rows.append((obj["input"], obj["expected"])) + if limit and len(rows) >= limit: + break + if not rows: + raise ValueError("No usable labeled rows found.") + return rows +``` + +If your production traces do not contain ground truth, create labels first (human or rubric-based) before running AgentOpt. + +--- + +## 3) Model Space Definition + +`models` is a dict mapping node name -> candidate list. + +```python +models = { + "planner": ["gpt-4o-mini", "gpt-4.1"], + "solver": ["gpt-4o-mini", "gpt-4.1"], +} +``` + +Candidate entries can be: +- model name strings, or +- prebuilt LLM/model instances (framework-dependent), as long as your agent wrapper consumes them correctly. + +Keep node keys in `models` aligned with what your agent reads in `__init__(self, models)`. 
+ +--- + +## 4) Run Selection + +Use `ModelSelector(...)` for method dispatch: + +```python +from agentopt import ModelSelector + +def eval_fn(expected, actual): + text = actual.get("answer", str(actual)) if isinstance(actual, dict) else str(actual) + return 1.0 if str(expected).lower() in text.lower() else 0.0 + +selector = ModelSelector( + agent=MyAgent, + models=models, + eval_fn=eval_fn, + dataset=dataset, + method="auto", # auto -> arm_elimination +) + +results = selector.select_best(parallel=True, max_concurrent=40) +results.print_summary() +``` + +### Method selection guidance + +- `auto` / `arm_elimination`: default for most users. +- `brute_force`: exhaustive baseline. +- `random`: cheap exploratory baseline (`sample_fraction`). +- `hill_climbing`: local search with restarts (`num_restarts`). +- `epsilon_lucb`: near-best identification (`epsilon`, `n_initial`). +- `threshold`: classify combos above/below target score (`threshold`, `n_initial`). +- `lm_proposal`: proposer LLM picks one combo first (then evaluates that combo). + +### Concurrency semantics (important) + +`max_concurrent` is the **total API call budget** across all combos + datapoints. + +Internally AgentOpt splits it into: +- datapoint-level concurrency per combo (`dp_concurrent`), +- combo-level concurrency (`n_combo`), + +such that: +- `n_combo * dp_concurrent <= max_concurrent` + +So increasing `max_concurrent` raises total throughput, not just per-combo throughput. + +--- + +## 5) Export Benchmark-Ready Artifacts + +Always export at least: +- ranked results table (`CSV`), +- best combo config (`YAML`), +- run metadata (`JSON`) containing method + concurrency + dataset size. 
+ +```python +from datetime import datetime, timezone +import json + +results.to_csv("artifacts/results.csv") +results.export_config("artifacts/best_config.yaml") + +meta = { + "timestamp_utc": datetime.now(timezone.utc).isoformat(), + "method": "auto", + "parallel": True, + "max_concurrent": 40, + "dataset_size": len(dataset), + "selection_wall_time_seconds": results.selection_wall_time_seconds, + "best_combo": results.get_best_combo(), +} +with open("artifacts/run_metadata.json", "w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) +``` + +For external benchmark submission/repro checks, include: +- commit SHA, +- environment (package versions), +- random seed (if applicable), +- exact dataset split ID/version. + +--- + +## 6) Troubleshooting Checklist + +- **`TypeError` on dataset**: ensure `(input, expected)` tuple elements and non-empty sequence. +- **No token/cost tracking**: ensure LLM calls happen through supported HTTP stack (AgentOpt tracker uses transport interception). +- **Too slow**: increase `max_concurrent`, reduce candidate space, or switch from `brute_force` to `auto`. +- **Unstable rankings**: increase dataset size and/or enforce deterministic prompts where possible. +- **High rerun cost**: use `LLMTracker(cache_dir=...)` for disk cache reuse. 
+ +--- + +## Repo Pointers + +- Core API: `src/agentopt/__init__.py` +- Selector internals + concurrency split: `src/agentopt/model_selection/base.py` +- Brute force reference: `src/agentopt/model_selection/brute_force.py` +- Quickstart docs: `docs/getting-started/quickstart.md` +- Selector docs: `docs/api/selectors.md` +- End-to-end examples: `examples/*.py` + From 8b969dd937dfdeece602b5e14e1530bdf84b1fe2 Mon Sep 17 00:00:00 2001 From: Armaan Agrawal Date: Thu, 2 Apr 2026 16:30:19 -0400 Subject: [PATCH 2/3] docs: clarify AgentOpt skill purpose and execution model --- .codex/skills/agentopt/SKILL.md | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/.codex/skills/agentopt/SKILL.md b/.codex/skills/agentopt/SKILL.md index 899a7b9..733e16d 100644 --- a/.codex/skills/agentopt/SKILL.md +++ b/.codex/skills/agentopt/SKILL.md @@ -7,6 +7,25 @@ metadata: # AgentOpt Skill +## What this skill does + +This skill tells an agent exactly how to run AgentOpt end-to-end for offline model selection: +- define an agent wrapper (`__init__` + `run`), +- build/clean a labeled offline dataset, +- choose a selector strategy and concurrency budget, +- run evaluation and export reproducible artifacts. + +## How it works (mental model) + +AgentOpt evaluates model **combinations** over a labeled dataset: +1. Create one model combo (e.g., planner=`gpt-4o-mini`, solver=`gpt-4.1`). +2. Instantiate the user agent with that combo. +3. Run all dataset samples, score with `eval_fn`, and track latency/tokens/cost. +4. Repeat for other combos (or a searched subset, depending on `method`). +5. Rank combinations by quality first, then latency/cost tie-breakers. + +`parallel=True` with `max_concurrent=N` controls the total in-flight API budget across all combo/datapoint evaluations. 
+ Use this skill when the user wants to:
 - run model selection for an agent pipeline (single-step or multi-step),
 - create or clean an offline evaluation dataset,
@@ -217,4 +236,3 @@ For external benchmark submission/repro checks, include:
 - Quickstart docs: `docs/getting-started/quickstart.md`
 - Selector docs: `docs/api/selectors.md`
 - End-to-end examples: `examples/*.py`
-

From 1c3c778e77d4764126f6a9fee3afbf4d9d7c0316 Mon Sep 17 00:00:00 2001
From: Armaan Agrawal
Date: Thu, 2 Apr 2026 17:56:49 -0400
Subject: [PATCH 3/3] docs: expand AgentOpt skill with full parameter and
 submission guidance

---
 .codex/skills/agentopt/SKILL.md | 87 +++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/.codex/skills/agentopt/SKILL.md b/.codex/skills/agentopt/SKILL.md
index 733e16d..593b6dd 100644
--- a/.codex/skills/agentopt/SKILL.md
+++ b/.codex/skills/agentopt/SKILL.md
@@ -42,6 +42,56 @@ Use this skill when the user wants to:

 ---

+## Parameter Reference (use this exact mapping)
+
+### `ModelSelector(...)` parameters
+
+| Parameter | Required | Description |
+|---|---:|---|
+| `agent` | yes | Agent class/factory that accepts one combo dict in `__init__` and exposes `run(input_data)` |
+| `models` | yes | Dict of node -> candidate list (Cartesian product defines combo space) |
+| `eval_fn` | yes | Scoring function `(expected, actual) -> bool\|float` (higher is better) |
+| `dataset` | yes | Sequence of `(input_data, expected_answer)` pairs |
+| `method` | no | Selector algorithm (`auto`, `brute_force`, `random`, `hill_climbing`, `arm_elimination`, `epsilon_lucb`, `threshold`, `lm_proposal`, `bayesian`) |
+| `model_prices` | no | Custom token pricing overrides for accurate cost reporting |
+| `tracker` | no | Custom `LLMTracker` (e.g., enable persistent cache dir) |
+| `node_descriptions` | no | Node role descriptions, used by `lm_proposal` prompting |
+| `**kwargs` | depends | Method-specific knobs listed below |
+
+### `select_best(...)` parameters
+
+| 
Parameter | Default | Description | +|---|---:|---| +| `parallel` | `False` | Run async combo/datapoint evaluation when selector supports it | +| `max_concurrent` | `20` | **Global** max in-flight API calls across all combos + datapoints | + +### Concurrency semantics (exact) + +When `parallel=True`, AgentOpt splits global budget into: +- `dp_concurrent = min(max_concurrent, current_batch_size)` +- `n_combo = max_concurrent // dp_concurrent` + +Guarantee: +- `n_combo * dp_concurrent <= max_concurrent` + +Interpretation: +- Raise `max_concurrent` to increase total throughput. +- Keep it bounded by provider rate limits and local runtime capacity. + +### Method-specific `**kwargs` + +| Method | Extra params | +|---|---| +| `random` | `sample_fraction`, `seed` | +| `hill_climbing` | `max_iterations`, `num_restarts`, `patience`, `seed`, `batch_size` | +| `arm_elimination` | `n_initial`, `growth_factor`, `confidence` | +| `epsilon_lucb` | `epsilon`, `n_initial`, `confidence` | +| `threshold` | `threshold`, `n_initial`, `confidence` | +| `lm_proposal` | `proposer_model`, `proposer_client`, `objective`, `dataset_preview_size`, `node_descriptions` | +| `bayesian` | `batch_size`, `sample_fraction` | + +--- + ## 1) Agent Contract (required) AgentOpt expects: @@ -216,6 +266,43 @@ For external benchmark submission/repro checks, include: - random seed (if applicable), - exact dataset split ID/version. 
+## 5.1) Benchmark submission packet (recommended)
+
+Create one directory per benchmark run:
+
+```text
+artifacts/
+  <benchmark_name>/<run_id>/
+    results.csv
+    best_config.yaml
+    run_metadata.json
+    dataset_manifest.json
+    README_submission.md
+```
+
+`dataset_manifest.json` should include dataset source + labeling policy:
+
+```json
+{
+  "name": "my_offline_eval_v1",
+  "num_samples": 120,
+  "format": "jsonl(input, expected)",
+  "split": "offline_eval",
+  "label_policy": "human_verified",
+  "created_at_utc": "2026-04-02T00:00:00Z"
+}
+```
+
+`README_submission.md` should state:
+- benchmark/task name,
+- method and all key params (`parallel`, `max_concurrent`, method kwargs),
+- best combo and headline metrics,
+- reproducibility commands.
+
+Submission channel depends on project policy:
+- if using GitHub: upload packet in PR/issue comment and link commit SHA,
+- if using external benchmark portal: upload the same packet unchanged.
+
 ---

 ## 6) Troubleshooting Checklist