From ef7744e213fd6ed148d7378b3cf47cd6032ef80e Mon Sep 17 00:00:00 2001 From: "Ryan E. Hamilton" Date: Thu, 19 Mar 2026 10:44:15 -0400 Subject: [PATCH 1/4] fix: restore GitHub Actions workflow for build badge The workflow was removed during the CircleCI migration but the README badge still references it, causing a broken badge on the repo. Made-with: Cursor --- .github/workflows/go.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/go.yml diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml new file mode 100644 index 0000000..9209b34 --- /dev/null +++ b/.github/workflows/go.yml @@ -0,0 +1,25 @@ +name: Go + +on: + push: + branches: [ "main" ] + pull_request: + branches: [ "main" ] + +jobs: + + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: 'stable' + + - name: Build + run: go build -v ./... + + - name: Test + run: go test -tags=test -race -v ./... From 625baa335648b4da4d8c41ccae05e3fe2ce5a214 Mon Sep 17 00:00:00 2001 From: "Ryan E. Hamilton" Date: Thu, 19 Mar 2026 10:54:31 -0400 Subject: [PATCH 2/4] docs: add AGENTS.md with repo guidance and remote enforcement Co-Authored-By: Claude Sonnet 4.6 --- AGENTS.md | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..02cffc8 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,52 @@ +# AGENTS.md + +This file provides guidance to Claude Code (claude.ai/code) or any AI agent when working with code in this repository. + +## Remote Repository + +The canonical remote for this repo is `github.com/CircleCI-Research/evalbench`. Never target or push to `petmal/MindTrial` (the upstream fork origin). Always confirm PRs, pushes, and branch operations target `CircleCI-Research/evalbench`. + +## Commands + +```bash +# Build +go build -v ./... 
+go build -o evalbench ./cmd/evalbench/ + +# Test (all tests require the `test` build tag) +go test -tags=test -race -v ./... + +# Run a single test package +go test -tags=test -race -v ./runners/... + +# Run a specific test +go test -tags=test -race -v -run TestName ./runners/... +``` + +## Architecture + +EvalBench is a Go CLI that runs AI model evaluations across 9+ providers (OpenAI, Anthropic, Google Gemini, DeepSeek, Mistral, xAI, Alibaba, Moonshot, OpenRouter) in parallel and compares results. + +**Execution flow:** `cmd/evalbench/main.go` → parses CLI flags and config files → `runners/default_runner.go` orchestrates task execution → each task is sent to each provider via `providers/provider.go` → responses are validated by `validators/` → results are written by `formatters/`. + +**Key packages:** +- `config/` — YAML-driven config structs. `config.go` holds `AppConfig` (providers, judges, tools); `tasks.go` holds task definitions with validation rules and response schemas. +- `providers/` — One file per AI provider, all implementing a shared `Provider` interface. Provider-specific quirks (streaming, reasoning effort, tool use) are encapsulated here. `providers/tools/` runs sandboxed Docker tool execution. +- `runners/` — `default_runner.go` is the main orchestrator: parallel across providers, sequential within a provider. +- `validators/` — Pluggable validation: `value_validator.go` for exact/fuzzy matching, `judge_validator.go` for LLM-based semantic evaluation. +- `formatters/` — CSV, HTML, JSONL, and log output formats. +- `cmd/evalbench/tui/` — Bubble Tea terminal UI for interactive config selection and real-time task monitoring. +- `pkg/mistralai/` and `pkg/xai/` — Auto-generated OpenAPI clients (do not edit manually). + +**Configuration:** Two YAML files drive everything: +- `config.yaml` — providers (API keys, models, rate limits, parameters), judge configs, tool definitions, output paths. 
+- `tasks.yaml` — task prompts, expected outputs, response format (plain text or JSON schema), validation rules, file attachments. + +**Claude Code slash commands** in `.claude/commands/` implement a "model race" UX: `/run-model-comparison`, `/simulate-model-comparison`, `/announce-model-comparison` (live Ken Squier-style voice commentary via Kokoro TTS), and `/stop-model-comparison`. + +## Testing Notes + +- All tests use build tag `test` — always pass `-tags=test`. +- Tests use `stretchr/testify` for assertions. +- `pkg/testutils/` contains shared test helpers. +- Provider tests (`providers/provider_test.go`, ~35KB) and validator tests (`validators/validator_test.go`, ~60KB) are the heaviest test files. From 9a263d0487b9867f614754612e963e7044385cbe Mon Sep 17 00:00:00 2001 From: "Ryan E. Hamilton" Date: Thu, 19 Mar 2026 11:01:09 -0400 Subject: [PATCH 3/4] fix: make runs sequential within a provider and update log formatter golden files Runs within a single provider were incorrectly launched in parallel goroutines, causing non-deterministic result ordering that violated the documented contract ("individual runs on a single provider are executed sequentially") and broke TestRunnerRun assertions. Also updates log formatter golden files to include the Score column added to the log output format. 
Co-Authored-By: Claude Sonnet 4.6 --- formatters/testdata/empty.log | 2 +- formatters/testdata/results.log | 22 ++++++------- runners/default_runner.go | 55 +++++++++++++++------------------ 3 files changed, 37 insertions(+), 42 deletions(-) diff --git a/formatters/testdata/empty.log b/formatters/testdata/empty.log index 33f53b5..08d9a2a 100644 --- a/formatters/testdata/empty.log +++ b/formatters/testdata/empty.log @@ -1 +1 @@ -TraceID |Provider |Run |Task |Status |Duration |Answer | +TraceID |Provider |Run |Task |Status |Score |Duration |Answer | diff --git a/formatters/testdata/results.log b/formatters/testdata/results.log index 283d4ac..91dfb42 100644 --- a/formatters/testdata/results.log +++ b/formatters/testdata/results.log @@ -1,12 +1,12 @@ -TraceID |Provider |Run |Task |Status |Duration |Answer | -01JEDE7Z8X0000000000000001 |provider-name |run-success |task-name |Passed |1m35s |Quos aut rerum quaerat qui ad culpa. | -01JEDE7Z8X0000000000000002 |provider-name |run-failure |task-name |Failed |10s |@@ -1,67 +1,36 @@ +TraceID |Provider |Run |Task |Status |Score |Duration |Answer | +01JEDE7Z8X0000000000000001 |provider-name |run-success |task-name |Passed |100 |1m35s |Quos aut rerum quaerat qui ad culpa. | +01JEDE7Z8X0000000000000002 |provider-name |run-failure |task-name |Failed |1 |10s |@@ -1,67 +1,36 @@ -Nihil reprehenderit enim voluptatum dolore nisi neque quia aut qui +Ipsam ea et optio explicabo eius et . | -01JEDE7Z8X0000000000000003 |provider-name |run-success-multiple-answers |task-name |Passed |17s |Quos aut rerum quaerat qui ad culpa. | -01JEDE7Z8X0000000000000004 |provider-name |run-failure-multiple-answers |task-name |Failed |3m0.8s |[ +01JEDE7Z8X0000000000000003 |provider-name |run-success-multiple-answers |task-name |Passed |100 |17s |Quos aut rerum quaerat qui ad culpa. 
| +01JEDE7Z8X0000000000000004 |provider-name |run-failure-multiple-answers |task-name |Failed |8 |3m0.8s |[ @@ -1,48 +1,36 @@ -Dolores saepe ad sed rerum autem iure minima +Ipsam ea et optio explicabo eius @@ -19,11 +19,11 @@ TraceID |Provider |Run |Task |Status |Durat . ] | -01JEDE7Z8X0000000000000005 |provider-name |run-error |task-name |Error |0s |error message | -01JEDE7Z8X0000000000000006 |provider-name |run-not-supported |task-name |Skipped |500ms |Sequi molestiae iusto sit sit dolorum aut. | -01JEDE7Z8X0000000000000007 |provider-name |run-validation-error |task-name |Error |2s |Adipiscing elit sed do eiusmod tempor. | -01JEDE7Z8X0000000000000008 |provider-name |run-parsing-error |task-name |Error |5m14.159s |Invalid JSON: {broken | -01JEDE7Z8X0000000000000009 |provider-name |run-structured-success |task-name |Passed |42s |[ +01JEDE7Z8X0000000000000005 |provider-name |run-error |task-name |Error |0 |0s |error message | +01JEDE7Z8X0000000000000006 |provider-name |run-not-supported |task-name |Skipped |- |500ms |Sequi molestiae iusto sit sit dolorum aut. | +01JEDE7Z8X0000000000000007 |provider-name |run-validation-error |task-name |Error |0 |2s |Adipiscing elit sed do eiusmod tempor. 
| +01JEDE7Z8X0000000000000008 |provider-name |run-parsing-error |task-name |Error |0 |5m14.159s |Invalid JSON: {broken | +01JEDE7Z8X0000000000000009 |provider-name |run-structured-success |task-name |Passed |100 |42s |[ { "level": "INFO", "message": "User 'admin' logged in successfully.", @@ -36,7 +36,7 @@ TraceID |Provider |Run |Task |Status |Durat "timestamp": "2025-09-14T10:31:15Z" } ] | -01JEDE7Z8X0000000000000010 |provider-name |run-structured-failure |task-name |Failed |38s |[ +01JEDE7Z8X0000000000000010 |provider-name |run-structured-failure |task-name |Failed |69 |38s |[ @@ -11,12 +11,13 @@ %22: %22 -INFO diff --git a/runners/default_runner.go b/runners/default_runner.go index a9a2b24..320f819 100644 --- a/runners/default_runner.go +++ b/runners/default_runner.go @@ -269,41 +269,36 @@ func (r *defaultRunner) run(ctx context.Context, tasks []config.Task, rs resultC func (r *defaultRunner) runTasks(ctx context.Context, logger logging.Logger, provider providers.Provider, runs []config.RunConfig, tasks []config.Task, rs resultCollector) { logger.Message(ctx, logging.LevelInfo, "%s: starting %d task%s on this provider in %d configuration%s...", pluralize(provider.Name(), countable(len(tasks)), countable(len(runs)))...) 
providerStart := time.Now() - var wg sync.WaitGroup for _, run := range runs { - wg.Add(1) - go func(rc config.RunConfig) { - defer wg.Done() - if rc.MaxRequestsPerMinute > 0 { - logger.Message(ctx, logging.LevelInfo, "%s: %s: request rate limited to %d requests/min.", provider.Name(), rc.Name, rc.MaxRequestsPerMinute) - } - skipTasksWithSchemaResultFormat := rc.DisableStructuredOutput - if skipTasksWithSchemaResultFormat { - logger.Message(ctx, logging.LevelInfo, "%s: %s: structured output disabled for this configuration.", provider.Name(), rc.Name) - } - skipTasksWithFiles := rc.TextOnly - if skipTasksWithFiles { - logger.Message(ctx, logging.LevelInfo, "%s: %s: text-only mode enabled for this configuration.", provider.Name(), rc.Name) - } - executor := execution.NewExecutor(provider, rc) + rc := run + if rc.MaxRequestsPerMinute > 0 { + logger.Message(ctx, logging.LevelInfo, "%s: %s: request rate limited to %d requests/min.", provider.Name(), rc.Name, rc.MaxRequestsPerMinute) + } + skipTasksWithSchemaResultFormat := rc.DisableStructuredOutput + if skipTasksWithSchemaResultFormat { + logger.Message(ctx, logging.LevelInfo, "%s: %s: structured output disabled for this configuration.", provider.Name(), rc.Name) + } + skipTasksWithFiles := rc.TextOnly + if skipTasksWithFiles { + logger.Message(ctx, logging.LevelInfo, "%s: %s: text-only mode enabled for this configuration.", provider.Name(), rc.Name) + } + executor := execution.NewExecutor(provider, rc) - for _, task := range tasks { - runResult := RunResult{TraceID: ulid.Make().String()} + for _, task := range tasks { + runResult := RunResult{TraceID: ulid.Make().String()} - // Create prefixed logger for this specific task. - taskLogger := logger.WithContext(fmt.Sprintf("[%s] %s: %s: %s: ", runResult.TraceID, provider.Name(), rc.Name, task.Name)) + // Create prefixed logger for this specific task. 
+ taskLogger := logger.WithContext(fmt.Sprintf("[%s] %s: %s: %s: ", runResult.TraceID, provider.Name(), rc.Name, task.Name)) - taskLogger.Message(ctx, logging.LevelInfo, "starting task...") - runStart := time.Now() - r.runTask(ctx, taskLogger, executor, task, skipTasksWithSchemaResultFormat, skipTasksWithFiles, &runResult) - taskLogger.Message(ctx, logging.LevelInfo, "task has finished in %s.", time.Since(runStart)) - taskLogger.Message(ctx, logging.LevelDebug, "result: status=%s score=%s", toStatus(runResult.Kind), runResult.Score()) - rs.appendResult(runResult) - rs.emitProgressEvent() - } - }(run) + taskLogger.Message(ctx, logging.LevelInfo, "starting task...") + runStart := time.Now() + r.runTask(ctx, taskLogger, executor, task, skipTasksWithSchemaResultFormat, skipTasksWithFiles, &runResult) + taskLogger.Message(ctx, logging.LevelInfo, "task has finished in %s.", time.Since(runStart)) + taskLogger.Message(ctx, logging.LevelDebug, "result: status=%s score=%s", toStatus(runResult.Kind), runResult.Score()) + rs.appendResult(runResult) + rs.emitProgressEvent() + } } - wg.Wait() logger.Message(ctx, logging.LevelInfo, "%s: all tasks in all configurations have finished on this provider in %s.", provider.Name(), time.Since(providerStart)) } From d0db7a898f6f628cb467accc7850547be5c042a4 Mon Sep 17 00:00:00 2001 From: "Ryan E. Hamilton" Date: Thu, 19 Mar 2026 11:06:08 -0400 Subject: [PATCH 4/4] fix: restore parallel run execution and make runner tests order-insensitive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts the sequential run change — runs within a provider intentionally execute in parallel for throughput. Updates the doc comment to match. Fixes TestRunnerRun by sorting results by (Run, Task, Got, Want) before comparing, since parallel execution produces non-deterministic ordering. 
Co-Authored-By: Claude Sonnet 4.6 --- runners/default_runner.go | 58 +++++++++++++++++++++------------------ runners/runner_test.go | 26 ++++++++++++++++++ 2 files changed, 58 insertions(+), 26 deletions(-) diff --git a/runners/default_runner.go b/runners/default_runner.go index 320f819..e3f9ace 100644 --- a/runners/default_runner.go +++ b/runners/default_runner.go @@ -113,7 +113,8 @@ func (r *asyncResultSet) emitMessageEvent(message string) { } // NewDefaultRunner creates a new Runner that executes tasks on all configured providers -// in parallel. The individual runs on a single provider are executed sequentially. +// in parallel. The runs within a single provider also execute in parallel; tasks within +// each run are executed sequentially. // It returns an error if any provider initialization fails. func NewDefaultRunner(ctx context.Context, cfg []config.ProviderConfig, judges []config.JudgeConfig, tools []config.ToolConfig, logger zerolog.Logger) (Runner, error) { toolValidator, err := providertools.NewDockerToolExecutor(ctx) @@ -269,36 +270,41 @@ func (r *defaultRunner) run(ctx context.Context, tasks []config.Task, rs resultC func (r *defaultRunner) runTasks(ctx context.Context, logger logging.Logger, provider providers.Provider, runs []config.RunConfig, tasks []config.Task, rs resultCollector) { logger.Message(ctx, logging.LevelInfo, "%s: starting %d task%s on this provider in %d configuration%s...", pluralize(provider.Name(), countable(len(tasks)), countable(len(runs)))...) 
providerStart := time.Now() + var wg sync.WaitGroup for _, run := range runs { - rc := run - if rc.MaxRequestsPerMinute > 0 { - logger.Message(ctx, logging.LevelInfo, "%s: %s: request rate limited to %d requests/min.", provider.Name(), rc.Name, rc.MaxRequestsPerMinute) - } - skipTasksWithSchemaResultFormat := rc.DisableStructuredOutput - if skipTasksWithSchemaResultFormat { - logger.Message(ctx, logging.LevelInfo, "%s: %s: structured output disabled for this configuration.", provider.Name(), rc.Name) - } - skipTasksWithFiles := rc.TextOnly - if skipTasksWithFiles { - logger.Message(ctx, logging.LevelInfo, "%s: %s: text-only mode enabled for this configuration.", provider.Name(), rc.Name) - } - executor := execution.NewExecutor(provider, rc) + wg.Add(1) + go func(rc config.RunConfig) { + defer wg.Done() + if rc.MaxRequestsPerMinute > 0 { + logger.Message(ctx, logging.LevelInfo, "%s: %s: request rate limited to %d requests/min.", provider.Name(), rc.Name, rc.MaxRequestsPerMinute) + } + skipTasksWithSchemaResultFormat := rc.DisableStructuredOutput + if skipTasksWithSchemaResultFormat { + logger.Message(ctx, logging.LevelInfo, "%s: %s: structured output disabled for this configuration.", provider.Name(), rc.Name) + } + skipTasksWithFiles := rc.TextOnly + if skipTasksWithFiles { + logger.Message(ctx, logging.LevelInfo, "%s: %s: text-only mode enabled for this configuration.", provider.Name(), rc.Name) + } + executor := execution.NewExecutor(provider, rc) - for _, task := range tasks { - runResult := RunResult{TraceID: ulid.Make().String()} + for _, task := range tasks { + runResult := RunResult{TraceID: ulid.Make().String()} - // Create prefixed logger for this specific task. - taskLogger := logger.WithContext(fmt.Sprintf("[%s] %s: %s: %s: ", runResult.TraceID, provider.Name(), rc.Name, task.Name)) + // Create prefixed logger for this specific task. 
+ taskLogger := logger.WithContext(fmt.Sprintf("[%s] %s: %s: %s: ", runResult.TraceID, provider.Name(), rc.Name, task.Name)) - taskLogger.Message(ctx, logging.LevelInfo, "starting task...") - runStart := time.Now() - r.runTask(ctx, taskLogger, executor, task, skipTasksWithSchemaResultFormat, skipTasksWithFiles, &runResult) - taskLogger.Message(ctx, logging.LevelInfo, "task has finished in %s.", time.Since(runStart)) - taskLogger.Message(ctx, logging.LevelDebug, "result: status=%s score=%s", toStatus(runResult.Kind), runResult.Score()) - rs.appendResult(runResult) - rs.emitProgressEvent() - } + taskLogger.Message(ctx, logging.LevelInfo, "starting task...") + runStart := time.Now() + r.runTask(ctx, taskLogger, executor, task, skipTasksWithSchemaResultFormat, skipTasksWithFiles, &runResult) + taskLogger.Message(ctx, logging.LevelInfo, "task has finished in %s.", time.Since(runStart)) + taskLogger.Message(ctx, logging.LevelDebug, "result: status=%s score=%s", toStatus(runResult.Kind), runResult.Score()) + rs.appendResult(runResult) + rs.emitProgressEvent() + } + }(run) } + wg.Wait() logger.Message(ctx, logging.LevelInfo, "%s: all tasks in all configurations have finished on this provider in %s.", provider.Name(), time.Since(providerStart)) } diff --git a/runners/runner_test.go b/runners/runner_test.go index 866f3ec..643d41d 100644 --- a/runners/runner_test.go +++ b/runners/runner_test.go @@ -9,6 +9,8 @@ package runners import ( "context" "errors" + "fmt" + "sort" "testing" "time" @@ -1201,6 +1203,12 @@ func TestRunnerRun(t *testing.T) { assert.NotEmpty(t, results[provider][i].TraceID, "TraceID should not be empty") results[provider][i].TraceID = "" } + // Runs within a provider execute in parallel so result order is + // non-deterministic. Sort by (Run, Task, Got) before comparing. 
+			sortResults(results[provider])
+		}
+		for provider := range tt.want {
+			sortResults(tt.want[provider])
 		}
 
 		assert.Equal(t, tt.want, results)
@@ -2182,3 +2190,21 @@ func TestToLines(t *testing.T) {
 		})
 	}
 }
+
+// sortResults sorts a RunResult slice by (Run, Task, Got, Want) so that order-insensitive
+// comparisons work correctly even when runs execute in parallel.
+func sortResults(results []RunResult) {
+	sort.Slice(results, func(i, j int) bool {
+		a, b := results[i], results[j]
+		if a.Run != b.Run {
+			return a.Run < b.Run
+		}
+		if a.Task != b.Task {
+			return a.Task < b.Task
+		}
+		if fmt.Sprint(a.Got) != fmt.Sprint(b.Got) {
+			return fmt.Sprint(a.Got) < fmt.Sprint(b.Got)
+		}
+		return fmt.Sprint(a.Want) < fmt.Sprint(b.Want)
+	})
+}