From ef7744e213fd6ed148d7378b3cf47cd6032ef80e Mon Sep 17 00:00:00 2001 From: "Ryan E. Hamilton" Date: Thu, 19 Mar 2026 10:44:15 -0400 Subject: [PATCH 1/4] fix: restore GitHub Actions workflow for build badge The workflow was removed during the CircleCI migration but the README badge still references it, causing a broken badge on the repo. Made-with: Cursor --- .github/workflows/go.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/go.yml diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml new file mode 100644 index 0000000..9209b34 --- /dev/null +++ b/.github/workflows/go.yml @@ -0,0 +1,25 @@ +name: Go + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: 'stable' + + - name: Build + run: go build -v ./... + + - name: Test + run: go test -tags=test -race -v ./... From 625baa335648b4da4d8c41ccae05e3fe2ce5a214 Mon Sep 17 00:00:00 2001 From: "Ryan E. Hamilton" Date: Thu, 19 Mar 2026 10:54:31 -0400 Subject: [PATCH 2/4] docs: add AGENTS.md with repo guidance and remote enforcement Co-Authored-By: Claude Sonnet 4.6 --- AGENTS.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..02cffc8 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,52 @@ +# AGENTS.md + +This file provides guidance to Claude Code (claude.ai/code) or any AI agent when working with code in this repository. + +## Remote Repository + +The canonical remote for this repo is `github.com/CircleCI-Research/evalbench`. Never target or push to `petmal/MindTrial` (the upstream fork origin). Always confirm PRs, pushes, and branch operations target `CircleCI-Research/evalbench`. + +## Commands + +```bash +# Build +go build -v ./... 
+go build -o evalbench ./cmd/evalbench/ + +# Test (all tests require the `test` build tag) +go test -tags=test -race -v ./... + +# Run a single test package +go test -tags=test -race -v ./runners/... + +# Run a specific test +go test -tags=test -race -v -run TestName ./runners/... +``` + +## Architecture + +EvalBench is a Go CLI that runs AI model evaluations across 9+ providers (OpenAI, Anthropic, Google Gemini, DeepSeek, Mistral, xAI, Alibaba, Moonshot, OpenRouter) in parallel and compares results. + +**Execution flow:** `cmd/evalbench/main.go` → parses CLI flags and config files → `runners/default_runner.go` orchestrates task execution → each task is sent to each provider via `providers/provider.go` → responses are validated by `validators/` → results are written by `formatters/`. + +**Key packages:** +- `config/` — YAML-driven config structs. `config.go` holds `AppConfig` (providers, judges, tools); `tasks.go` holds task definitions with validation rules and response schemas. +- `providers/` — One file per AI provider, all implementing a shared `Provider` interface. Provider-specific quirks (streaming, reasoning effort, tool use) are encapsulated here. `providers/tools/` runs sandboxed Docker tool execution. +- `runners/` — `default_runner.go` is the main orchestrator: parallel across providers, sequential within a provider. +- `validators/` — Pluggable validation: `value_validator.go` for exact/fuzzy matching, `judge_validator.go` for LLM-based semantic evaluation. +- `formatters/` — CSV, HTML, JSONL, and log output formats. +- `cmd/evalbench/tui/` — Bubble Tea terminal UI for interactive config selection and real-time task monitoring. +- `pkg/mistralai/` and `pkg/xai/` — Auto-generated OpenAPI clients (do not edit manually). + +**Configuration:** Two YAML files drive everything: +- `config.yaml` — providers (API keys, models, rate limits, parameters), judge configs, tool definitions, output paths. 
+- `tasks.yaml` — task prompts, expected outputs, response format (plain text or JSON schema), validation rules, file attachments. + +**Claude Code slash commands** in `.claude/commands/` implement a "model race" UX: `/run-model-comparison`, `/simulate-model-comparison`, `/announce-model-comparison` (live Ken Squier-style voice commentary via Kokoro TTS), and `/stop-model-comparison`. + +## Testing Notes + +- All tests use build tag `test` — always pass `-tags=test`. +- Tests use `stretchr/testify` for assertions. +- `pkg/testutils/` contains shared test helpers. +- Provider tests (`providers/provider_test.go`, ~35KB) and validator tests (`validators/validator_test.go`, ~60KB) are the heaviest test files. From 9a263d0487b9867f614754612e963e7044385cbe Mon Sep 17 00:00:00 2001 From: "Ryan E. Hamilton" Date: Thu, 19 Mar 2026 11:01:09 -0400 Subject: [PATCH 3/4] fix: make runs sequential within a provider and update log formatter golden files Runs within a single provider were incorrectly launched in parallel goroutines, causing non-deterministic result ordering that violated the documented contract ("individual runs on a single provider are executed sequentially") and broke TestRunnerRun assertions. Also updates log formatter golden files to include the Score column added to the log output format. 
Co-Authored-By: Claude Sonnet 4.6 --- formatters/testdata/empty.log | 2 +- formatters/testdata/results.log | 22 ++++++------- runners/default_runner.go | 55 +++++++++++++++------------------ 3 files changed, 37 insertions(+), 42 deletions(-) diff --git a/formatters/testdata/empty.log b/formatters/testdata/empty.log index 33f53b5..08d9a2a 100644 --- a/formatters/testdata/empty.log +++ b/formatters/testdata/empty.log @@ -1 +1 @@ -TraceID |Provider |Run |Task |Status |Duration |Answer | +TraceID |Provider |Run |Task |Status |Score |Duration |Answer | diff --git a/formatters/testdata/results.log b/formatters/testdata/results.log index 283d4ac..91dfb42 100644 --- a/formatters/testdata/results.log +++ b/formatters/testdata/results.log @@ -1,12 +1,12 @@ -TraceID |Provider |Run |Task |Status |Duration |Answer | -01JEDE7Z8X0000000000000001 |provider-name |run-success |task-name |Passed |1m35s |Quos aut rerum quaerat qui ad culpa. | -01JEDE7Z8X0000000000000002 |provider-name |run-failure |task-name |Failed |10s |@@ -1,67 +1,36 @@ +TraceID |Provider |Run |Task |Status |Score |Duration |Answer | +01JEDE7Z8X0000000000000001 |provider-name |run-success |task-name |Passed |100 |1m35s |Quos aut rerum quaerat qui ad culpa. | +01JEDE7Z8X0000000000000002 |provider-name |run-failure |task-name |Failed |1 |10s |@@ -1,67 +1,36 @@ -Nihil reprehenderit enim voluptatum dolore nisi neque quia aut qui +Ipsam ea et optio explicabo eius et . | -01JEDE7Z8X0000000000000003 |provider-name |run-success-multiple-answers |task-name |Passed |17s |Quos aut rerum quaerat qui ad culpa. | -01JEDE7Z8X0000000000000004 |provider-name |run-failure-multiple-answers |task-name |Failed |3m0.8s |[ +01JEDE7Z8X0000000000000003 |provider-name |run-success-multiple-answers |task-name |Passed |100 |17s |Quos aut rerum quaerat qui ad culpa. 
| +01JEDE7Z8X0000000000000004 |provider-name |run-failure-multiple-answers |task-name |Failed |8 |3m0.8s |[ @@ -1,48 +1,36 @@ -Dolores saepe ad sed rerum autem iure minima +Ipsam ea et optio explicabo eius @@ -19,11 +19,11 @@ TraceID |Provider |Run |Task |Status |Durat . ] | -01JEDE7Z8X0000000000000005 |provider-name |run-error |task-name |Error |0s |error message | -01JEDE7Z8X0000000000000006 |provider-name |run-not-supported |task-name |Skipped |500ms |Sequi molestiae iusto sit sit dolorum aut. | -01JEDE7Z8X0000000000000007 |provider-name |run-validation-error |task-name |Error |2s |Adipiscing elit sed do eiusmod tempor. | -01JEDE7Z8X0000000000000008 |provider-name |run-parsing-error |task-name |Error |5m14.159s |Invalid JSON: {broken | -01JEDE7Z8X0000000000000009 |provider-name |run-structured-success |task-name |Passed |42s |[ +01JEDE7Z8X0000000000000005 |provider-name |run-error |task-name |Error |0 |0s |error message | +01JEDE7Z8X0000000000000006 |provider-name |run-not-supported |task-name |Skipped |- |500ms |Sequi molestiae iusto sit sit dolorum aut. | +01JEDE7Z8X0000000000000007 |provider-name |run-validation-error |task-name |Error |0 |2s |Adipiscing elit sed do eiusmod tempor. 
| +01JEDE7Z8X0000000000000008 |provider-name |run-parsing-error |task-name |Error |0 |5m14.159s |Invalid JSON: {broken | +01JEDE7Z8X0000000000000009 |provider-name |run-structured-success |task-name |Passed |100 |42s |[ { "level": "INFO", "message": "User 'admin' logged in successfully.", @@ -36,7 +36,7 @@ TraceID |Provider |Run |Task |Status |Durat "timestamp": "2025-09-14T10:31:15Z" } ] | -01JEDE7Z8X0000000000000010 |provider-name |run-structured-failure |task-name |Failed |38s |[ +01JEDE7Z8X0000000000000010 |provider-name |run-structured-failure |task-name |Failed |69 |38s |[ @@ -11,12 +11,13 @@ %22: %22 -INFO diff --git a/runners/default_runner.go b/runners/default_runner.go index a9a2b24..320f819 100644 --- a/runners/default_runner.go +++ b/runners/default_runner.go @@ -269,41 +269,36 @@ func (r *defaultRunner) run(ctx context.Context, tasks []config.Task, rs resultC func (r *defaultRunner) runTasks(ctx context.Context, logger logging.Logger, provider providers.Provider, runs []config.RunConfig, tasks []config.Task, rs resultCollector) { logger.Message(ctx, logging.LevelInfo, "%s: starting %d task%s on this provider in %d configuration%s...", pluralize(provider.Name(), countable(len(tasks)), countable(len(runs)))...) 
providerStart := time.Now() - var wg sync.WaitGroup for _, run := range runs { - wg.Add(1) - go func(rc config.RunConfig) { - defer wg.Done() - if rc.MaxRequestsPerMinute > 0 { - logger.Message(ctx, logging.LevelInfo, "%s: %s: request rate limited to %d requests/min.", provider.Name(), rc.Name, rc.MaxRequestsPerMinute) - } - skipTasksWithSchemaResultFormat := rc.DisableStructuredOutput - if skipTasksWithSchemaResultFormat { - logger.Message(ctx, logging.LevelInfo, "%s: %s: structured output disabled for this configuration.", provider.Name(), rc.Name) - } - skipTasksWithFiles := rc.TextOnly - if skipTasksWithFiles { - logger.Message(ctx, logging.LevelInfo, "%s: %s: text-only mode enabled for this configuration.", provider.Name(), rc.Name) - } - executor := execution.NewExecutor(provider, rc) + rc := run + if rc.MaxRequestsPerMinute > 0 { + logger.Message(ctx, logging.LevelInfo, "%s: %s: request rate limited to %d requests/min.", provider.Name(), rc.Name, rc.MaxRequestsPerMinute) + } + skipTasksWithSchemaResultFormat := rc.DisableStructuredOutput + if skipTasksWithSchemaResultFormat { + logger.Message(ctx, logging.LevelInfo, "%s: %s: structured output disabled for this configuration.", provider.Name(), rc.Name) + } + skipTasksWithFiles := rc.TextOnly + if skipTasksWithFiles { + logger.Message(ctx, logging.LevelInfo, "%s: %s: text-only mode enabled for this configuration.", provider.Name(), rc.Name) + } + executor := execution.NewExecutor(provider, rc) - for _, task := range tasks { - runResult := RunResult{TraceID: ulid.Make().String()} + for _, task := range tasks { + runResult := RunResult{TraceID: ulid.Make().String()} - // Create prefixed logger for this specific task. - taskLogger := logger.WithContext(fmt.Sprintf("[%s] %s: %s: %s: ", runResult.TraceID, provider.Name(), rc.Name, task.Name)) + // Create prefixed logger for this specific task. 
+ taskLogger := logger.WithContext(fmt.Sprintf("[%s] %s: %s: %s: ", runResult.TraceID, provider.Name(), rc.Name, task.Name)) - taskLogger.Message(ctx, logging.LevelInfo, "starting task...") - runStart := time.Now() - r.runTask(ctx, taskLogger, executor, task, skipTasksWithSchemaResultFormat, skipTasksWithFiles, &runResult) - taskLogger.Message(ctx, logging.LevelInfo, "task has finished in %s.", time.Since(runStart)) - taskLogger.Message(ctx, logging.LevelDebug, "result: status=%s score=%s", toStatus(runResult.Kind), runResult.Score()) - rs.appendResult(runResult) - rs.emitProgressEvent() - } - }(run) + taskLogger.Message(ctx, logging.LevelInfo, "starting task...") + runStart := time.Now() + r.runTask(ctx, taskLogger, executor, task, skipTasksWithSchemaResultFormat, skipTasksWithFiles, &runResult) + taskLogger.Message(ctx, logging.LevelInfo, "task has finished in %s.", time.Since(runStart)) + taskLogger.Message(ctx, logging.LevelDebug, "result: status=%s score=%s", toStatus(runResult.Kind), runResult.Score()) + rs.appendResult(runResult) + rs.emitProgressEvent() + } } - wg.Wait() logger.Message(ctx, logging.LevelInfo, "%s: all tasks in all configurations have finished on this provider in %s.", provider.Name(), time.Since(providerStart)) } From d0db7a898f6f628cb467accc7850547be5c042a4 Mon Sep 17 00:00:00 2001 From: "Ryan E. Hamilton" Date: Thu, 19 Mar 2026 11:06:08 -0400 Subject: [PATCH 4/4] fix: restore parallel run execution and make runner tests order-insensitive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts the sequential run change — runs within a provider intentionally execute in parallel for throughput. Updates the doc comment to match. Fixes TestRunnerRun by sorting results by (Run, Task, Got, Want) before comparing, since parallel execution produces non-deterministic ordering. 
Co-Authored-By: Claude Sonnet 4.6 --- runners/default_runner.go | 58 +++++++++++++++++++++------------------ runners/runner_test.go | 26 ++++++++++++++++++ 2 files changed, 58 insertions(+), 26 deletions(-) diff --git a/runners/default_runner.go b/runners/default_runner.go index 320f819..e3f9ace 100644 --- a/runners/default_runner.go +++ b/runners/default_runner.go @@ -113,7 +113,8 @@ func (r *asyncResultSet) emitMessageEvent(message string) { } // NewDefaultRunner creates a new Runner that executes tasks on all configured providers -// in parallel. The individual runs on a single provider are executed sequentially. +// in parallel. The runs within a single provider also execute in parallel; tasks within +// each run are executed sequentially. // It returns an error if any provider initialization fails. func NewDefaultRunner(ctx context.Context, cfg []config.ProviderConfig, judges []config.JudgeConfig, tools []config.ToolConfig, logger zerolog.Logger) (Runner, error) { toolValidator, err := providertools.NewDockerToolExecutor(ctx) @@ -269,36 +270,41 @@ func (r *defaultRunner) run(ctx context.Context, tasks []config.Task, rs resultC func (r *defaultRunner) runTasks(ctx context.Context, logger logging.Logger, provider providers.Provider, runs []config.RunConfig, tasks []config.Task, rs resultCollector) { logger.Message(ctx, logging.LevelInfo, "%s: starting %d task%s on this provider in %d configuration%s...", pluralize(provider.Name(), countable(len(tasks)), countable(len(runs)))...) 
providerStart := time.Now() + var wg sync.WaitGroup for _, run := range runs { - rc := run - if rc.MaxRequestsPerMinute > 0 { - logger.Message(ctx, logging.LevelInfo, "%s: %s: request rate limited to %d requests/min.", provider.Name(), rc.Name, rc.MaxRequestsPerMinute) - } - skipTasksWithSchemaResultFormat := rc.DisableStructuredOutput - if skipTasksWithSchemaResultFormat { - logger.Message(ctx, logging.LevelInfo, "%s: %s: structured output disabled for this configuration.", provider.Name(), rc.Name) - } - skipTasksWithFiles := rc.TextOnly - if skipTasksWithFiles { - logger.Message(ctx, logging.LevelInfo, "%s: %s: text-only mode enabled for this configuration.", provider.Name(), rc.Name) - } - executor := execution.NewExecutor(provider, rc) + wg.Add(1) + go func(rc config.RunConfig) { + defer wg.Done() + if rc.MaxRequestsPerMinute > 0 { + logger.Message(ctx, logging.LevelInfo, "%s: %s: request rate limited to %d requests/min.", provider.Name(), rc.Name, rc.MaxRequestsPerMinute) + } + skipTasksWithSchemaResultFormat := rc.DisableStructuredOutput + if skipTasksWithSchemaResultFormat { + logger.Message(ctx, logging.LevelInfo, "%s: %s: structured output disabled for this configuration.", provider.Name(), rc.Name) + } + skipTasksWithFiles := rc.TextOnly + if skipTasksWithFiles { + logger.Message(ctx, logging.LevelInfo, "%s: %s: text-only mode enabled for this configuration.", provider.Name(), rc.Name) + } + executor := execution.NewExecutor(provider, rc) - for _, task := range tasks { - runResult := RunResult{TraceID: ulid.Make().String()} + for _, task := range tasks { + runResult := RunResult{TraceID: ulid.Make().String()} - // Create prefixed logger for this specific task. - taskLogger := logger.WithContext(fmt.Sprintf("[%s] %s: %s: %s: ", runResult.TraceID, provider.Name(), rc.Name, task.Name)) + // Create prefixed logger for this specific task. 
+ taskLogger := logger.WithContext(fmt.Sprintf("[%s] %s: %s: %s: ", runResult.TraceID, provider.Name(), rc.Name, task.Name)) - taskLogger.Message(ctx, logging.LevelInfo, "starting task...") - runStart := time.Now() - r.runTask(ctx, taskLogger, executor, task, skipTasksWithSchemaResultFormat, skipTasksWithFiles, &runResult) - taskLogger.Message(ctx, logging.LevelInfo, "task has finished in %s.", time.Since(runStart)) - taskLogger.Message(ctx, logging.LevelDebug, "result: status=%s score=%s", toStatus(runResult.Kind), runResult.Score()) - rs.appendResult(runResult) - rs.emitProgressEvent() - } + taskLogger.Message(ctx, logging.LevelInfo, "starting task...") + runStart := time.Now() + r.runTask(ctx, taskLogger, executor, task, skipTasksWithSchemaResultFormat, skipTasksWithFiles, &runResult) + taskLogger.Message(ctx, logging.LevelInfo, "task has finished in %s.", time.Since(runStart)) + taskLogger.Message(ctx, logging.LevelDebug, "result: status=%s score=%s", toStatus(runResult.Kind), runResult.Score()) + rs.appendResult(runResult) + rs.emitProgressEvent() + } + }(run) } + wg.Wait() logger.Message(ctx, logging.LevelInfo, "%s: all tasks in all configurations have finished on this provider in %s.", provider.Name(), time.Since(providerStart)) } diff --git a/runners/runner_test.go b/runners/runner_test.go index 866f3ec..643d41d 100644 --- a/runners/runner_test.go +++ b/runners/runner_test.go @@ -9,6 +9,8 @@ package runners import ( "context" "errors" + "fmt" + "sort" "testing" "time" @@ -1201,6 +1203,12 @@ func TestRunnerRun(t *testing.T) { assert.NotEmpty(t, results[provider][i].TraceID, "TraceID should not be empty") results[provider][i].TraceID = "" } + // Runs within a provider execute in parallel so result order is + // non-deterministic. Sort by (Run, Task, Got) before comparing. 
+			sortResults(results[provider])
+		}
+		for provider := range tt.want {
+			sortResults(tt.want[provider])
 		}
 
 		assert.Equal(t, tt.want, results)
@@ -2182,3 +2190,21 @@ func TestToLines(t *testing.T) {
 		})
 	}
 }
+
+// sortResults sorts a RunResult slice by (Run, Task, Got, Want) so that order-insensitive
+// comparisons work correctly even when runs execute in parallel.
+func sortResults(results []RunResult) {
+	sort.Slice(results, func(i, j int) bool {
+		a, b := results[i], results[j]
+		if a.Run != b.Run {
+			return a.Run < b.Run
+		}
+		if a.Task != b.Task {
+			return a.Task < b.Task
+		}
+		if fmt.Sprint(a.Got) != fmt.Sprint(b.Got) {
+			return fmt.Sprint(a.Got) < fmt.Sprint(b.Got)
+		}
+		return fmt.Sprint(a.Want) < fmt.Sprint(b.Want)
+	})
+}