CircleCI-Research · ryan-circleci · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026 · Mar 19, 2026
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
@@ -0,0 +1,25 @@
+name: Go
+
+on:
+  push:
+    branches: [ "main" ]
+  pull_request:
+    branches: [ "main" ]
+
+jobs:
+
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Go
+      uses: actions/setup-go@v5
+      with:
+        go-version: 'stable'
+
+    - name: Build
+      run: go build -v ./...
+
+    - name: Test
+      run: go test -tags=test -race -v ./...
diff --git a/AGENTS.md b/AGENTS.md
@@ -0,0 +1,52 @@
+# AGENTS.md
+
+This file provides guidance to Claude Code (claude.ai/code) or any AI agent when working with code in this repository.
+
+## Remote Repository
+
+The canonical remote for this repo is `github.com/CircleCI-Research/evalbench`. Never target or push to `petmal/MindTrial` (the upstream fork origin). Always confirm PRs, pushes, and branch operations target `CircleCI-Research/evalbench`.
+
+## Commands
+
+```bash
+# Build
+go build -v ./...
+go build -o evalbench ./cmd/evalbench/
+
+# Test (all tests require the `test` build tag)
+go test -tags=test -race -v ./...
+
+# Run a single test package
+go test -tags=test -race -v ./runners/...
+
+# Run a specific test
+go test -tags=test -race -v -run TestName ./runners/...
+```
+
+## Architecture
+
+EvalBench is a Go CLI that runs AI model evaluations across 9+ providers (OpenAI, Anthropic, Google Gemini, DeepSeek, Mistral, xAI, Alibaba, Moonshot, OpenRouter) in parallel and compares results.
+
+**Execution flow:** `cmd/evalbench/main.go` → parses CLI flags and config files → `runners/default_runner.go` orchestrates task execution → each task is sent to each provider via `providers/provider.go` → responses are validated by `validators/` → results are written by `formatters/`.
+
+**Key packages:**
+- `config/` — YAML-driven config structs. `config.go` holds `AppConfig` (providers, judges, tools); `tasks.go` holds task definitions with validation rules and response schemas.
+- `providers/` — One file per AI provider, all implementing a shared `Provider` interface. Provider-specific quirks (streaming, reasoning effort, tool use) are encapsulated here. `providers/tools/` runs sandboxed Docker tool execution.
+- `runners/` — `default_runner.go` is the main orchestrator: providers run in parallel, the runs within a single provider also run in parallel, and tasks within each run execute sequentially.
+- `validators/` — Pluggable validation: `value_validator.go` for exact/fuzzy matching, `judge_validator.go` for LLM-based semantic evaluation.
+- `formatters/` — CSV, HTML, JSONL, and log output formats.
+- `cmd/evalbench/tui/` — Bubble Tea terminal UI for interactive config selection and real-time task monitoring.
+- `pkg/mistralai/` and `pkg/xai/` — Auto-generated OpenAPI clients (do not edit manually).
+
+**Configuration:** Two YAML files drive everything:
+- `config.yaml` — providers (API keys, models, rate limits, parameters), judge configs, tool definitions, output paths.
+- `tasks.yaml` — task prompts, expected outputs, response format (plain text or JSON schema), validation rules, file attachments.
+
+**Claude Code slash commands** in `.claude/commands/` implement a "model race" UX: `/run-model-comparison`, `/simulate-model-comparison`, `/announce-model-comparison` (live Ken Squier-style voice commentary via Kokoro TTS), and `/stop-model-comparison`.
+
+## Testing Notes
+
+- All tests use build tag `test` — always pass `-tags=test`.
+- Tests use `stretchr/testify` for assertions.
+- `pkg/testutils/` contains shared test helpers.
+- Provider tests (`providers/provider_test.go`, ~35KB) and validator tests (`validators/validator_test.go`, ~60KB) are the heaviest test files.
diff --git a/formatters/testdata/empty.log b/formatters/testdata/empty.log
@@ -1 +1 @@
-TraceID |Provider |Run |Task |Status |Duration |Answer |
+TraceID |Provider |Run |Task |Status |Score |Duration |Answer |
diff --git a/formatters/testdata/results.log b/formatters/testdata/results.log
@@ -1,12 +1,12 @@
-TraceID                    |Provider      |Run         |Task      |Status |Duration |Answer                               |
-01JEDE7Z8X0000000000000001 |provider-name |run-success |task-name |Passed |1m35s    |Quos aut rerum quaerat qui ad culpa. |
-01JEDE7Z8X0000000000000002 |provider-name |run-failure |task-name |Failed |10s      |@@ -1,67 +1,36 @@
+TraceID                    |Provider      |Run         |Task      |Status |Score |Duration |Answer                               |
+01JEDE7Z8X0000000000000001 |provider-name |run-success |task-name |Passed |100   |1m35s    |Quos aut rerum quaerat qui ad culpa. |
+01JEDE7Z8X0000000000000002 |provider-name |run-failure |task-name |Failed |1     |10s      |@@ -1,67 +1,36 @@
 -Nihil reprehenderit enim voluptatum dolore nisi neque quia aut qui
 +Ipsam ea et optio explicabo eius et
  .
                            |
-01JEDE7Z8X0000000000000003 |provider-name |run-success-multiple-answers |task-name |Passed |17s    |Quos aut rerum quaerat qui ad culpa. |
-01JEDE7Z8X0000000000000004 |provider-name |run-failure-multiple-answers |task-name |Failed |3m0.8s |[
+01JEDE7Z8X0000000000000003 |provider-name |run-success-multiple-answers |task-name |Passed |100 |17s    |Quos aut rerum quaerat qui ad culpa. |
+01JEDE7Z8X0000000000000004 |provider-name |run-failure-multiple-answers |task-name |Failed |8   |3m0.8s |[
     @@ -1,48 +1,36 @@
     -Dolores saepe ad sed rerum autem iure minima
     +Ipsam ea et optio explicabo eius
@@ -19,11 +19,11 @@ TraceID                    |Provider      |Run         |Task      |Status |Durat
      .
 
 ]                          |
-01JEDE7Z8X0000000000000005 |provider-name |run-error              |task-name |Error   |0s        |error message                              |
-01JEDE7Z8X0000000000000006 |provider-name |run-not-supported      |task-name |Skipped |500ms     |Sequi molestiae iusto sit sit dolorum aut. |
-01JEDE7Z8X0000000000000007 |provider-name |run-validation-error   |task-name |Error   |2s        |Adipiscing elit sed do eiusmod tempor.     |
-01JEDE7Z8X0000000000000008 |provider-name |run-parsing-error      |task-name |Error   |5m14.159s |Invalid JSON: {broken                      |
-01JEDE7Z8X0000000000000009 |provider-name |run-structured-success |task-name |Passed  |42s       |[
+01JEDE7Z8X0000000000000005 |provider-name |run-error              |task-name |Error   |0   |0s        |error message                              |
+01JEDE7Z8X0000000000000006 |provider-name |run-not-supported      |task-name |Skipped |-   |500ms     |Sequi molestiae iusto sit sit dolorum aut. |
+01JEDE7Z8X0000000000000007 |provider-name |run-validation-error   |task-name |Error   |0   |2s        |Adipiscing elit sed do eiusmod tempor.     |
+01JEDE7Z8X0000000000000008 |provider-name |run-parsing-error      |task-name |Error   |0   |5m14.159s |Invalid JSON: {broken                      |
+01JEDE7Z8X0000000000000009 |provider-name |run-structured-success |task-name |Passed  |100 |42s       |[
   {
     "level": "INFO",
     "message": "User 'admin' logged in successfully.",
@@ -36,7 +36,7 @@ TraceID                    |Provider      |Run         |Task      |Status |Durat
     "timestamp": "2025-09-14T10:31:15Z"
   }
 ]                          |
-01JEDE7Z8X0000000000000010 |provider-name |run-structured-failure |task-name |Failed |38s |[
+01JEDE7Z8X0000000000000010 |provider-name |run-structured-failure |task-name |Failed |69 |38s |[
     @@ -11,12 +11,13 @@
      %22: %22
     -INFO

diff --git a/runners/default_runner.go b/runners/default_runner.go
@@ -113,7 +113,8 @@ func (r *asyncResultSet) emitMessageEvent(message string) {
 }
 
 // NewDefaultRunner creates a new Runner that executes tasks on all configured providers
-// in parallel. The individual runs on a single provider are executed sequentially.
+// in parallel. The runs within a single provider also execute in parallel; tasks within
+// each run are executed sequentially.
 // It returns an error if any provider initialization fails.
 func NewDefaultRunner(ctx context.Context, cfg []config.ProviderConfig, judges []config.JudgeConfig, tools []config.ToolConfig, logger zerolog.Logger) (Runner, error) {
 	toolValidator, err := providertools.NewDockerToolExecutor(ctx)

diff --git a/runners/runner_test.go b/runners/runner_test.go
@@ -9,6 +9,8 @@ package runners
 import (
 	"context"
 	"errors"
+	"fmt"
+	"sort"
 	"testing"
 	"time"
 
@@ -1201,6 +1203,12 @@ func TestRunnerRun(t *testing.T) {
 						assert.NotEmpty(t, results[provider][i].TraceID, "TraceID should not be empty")
 						results[provider][i].TraceID = ""
 					}
+					// Runs within a provider execute in parallel so result order is
+					// non-deterministic. Sort by (Run, Task, Got, Want) before comparing.
+					sortResults(results[provider])
+				}
+				for provider := range tt.want {
+					sortResults(tt.want[provider])
 				}
 
 				assert.Equal(t, tt.want, results)
@@ -2182,3 +2190,21 @@ func TestToLines(t *testing.T) {
 		})
 	}
 }
+
+// sortResults sorts a RunResult slice in place by (Run, Task, Got, Want), with Got and
+// Want compared as fmt.Sprint strings, so parallel-run results compare order-insensitively.
+func sortResults(results []RunResult) {
+	sort.Slice(results, func(i, j int) bool {
+		a, b := results[i], results[j]
+		if a.Run != b.Run {
+			return a.Run < b.Run
+		}
+		if a.Task != b.Task {
+			return a.Task < b.Task
+		}
+		if gotA, gotB := fmt.Sprint(a.Got), fmt.Sprint(b.Got); gotA != gotB {
+			return gotA < gotB
+		}
+		return fmt.Sprint(a.Want) < fmt.Sprint(b.Want)
+	})
+}
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		TraceID \|Provider \|Run \|Task \|Status \|Duration \|Answer \|
		TraceID \|Provider \|Run \|Task \|Status \|Score \|Duration \|Answer \|