diff --git a/.gitignore b/.gitignore index d1fa6450..e92ab111 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,6 @@ docs/development/workplan-*.md # OS .DS_Store + +# Temporary files +*.tmp* \ No newline at end of file diff --git a/docs/development/agents.md b/docs/development/agents.md index 7f7e6de5..6d64b2b9 100644 --- a/docs/development/agents.md +++ b/docs/development/agents.md @@ -55,6 +55,7 @@ Main docs landing pages: - [Internals](./internals/README.md) - [Architecture](./internals/architecture.md) - [Runtime](./internals/runtime.md) +- [Memory](./internals/memory.md) - [Session Routing](./internals/session-routing.md) ## Reference diff --git a/docs/development/getting-started/authentication.md b/docs/development/getting-started/authentication.md index b89fabf5..009dcdf1 100644 --- a/docs/development/getting-started/authentication.md +++ b/docs/development/getting-started/authentication.md @@ -12,6 +12,9 @@ HybridClaw uses one provider-focused command surface: hybridclaw auth login hybridai --browser hybridclaw auth login hybridai --base-url http://localhost:5000 hybridclaw auth login codex --import +hybridclaw auth login google-workspace --client-secret /path/to/client_secret.json +hybridclaw auth login google-workspace --auth-url +hybridclaw auth login google-workspace --auth-code "http://localhost:1/?code=..." hybridclaw auth login openrouter anthropic/claude-sonnet-4 --api-key sk-or-... hybridclaw auth login mistral mistral-large-latest --api-key mistral_... hybridclaw auth login huggingface meta-llama/Llama-3.1-8B-Instruct --api-key hf_... 
@@ -20,6 +23,7 @@ hybridclaw auth login local ollama llama3.2 hybridclaw auth login msteams --app-id 00000000-0000-0000-0000-000000000000 --tenant-id 11111111-1111-1111-1111-111111111111 --app-password secret hybridclaw auth status hybridai hybridclaw auth status codex +hybridclaw auth status google-workspace hybridclaw auth status openrouter hybridclaw auth status mistral hybridclaw auth status huggingface @@ -27,6 +31,7 @@ hybridclaw auth status local hybridclaw auth status msteams hybridclaw auth logout hybridai hybridclaw auth logout codex +hybridclaw auth logout google-workspace hybridclaw auth logout openrouter hybridclaw auth logout mistral hybridclaw auth logout huggingface @@ -44,6 +49,14 @@ hybridclaw auth whatsapp reset and `--base-url` updates `hybridai.baseUrl` before login. - `hybridclaw auth login codex` prefers browser PKCE locally and device code on headless or remote shells. +- `hybridclaw auth login google-workspace` uses a Hermes-style stepwise OAuth + flow. Save a Google OAuth desktop client JSON with `--client-secret`, print a + consent URL with `--auth-url`, then exchange the pasted redirect URL or code + with `--auth-code`. In an interactive terminal, running the command with no + auth-step flags walks through the same flow and stores the client secret, + pending PKCE session, and token in the encrypted runtime secret store. The + current built-in scope bundle covers Gmail, Calendar, Drive, Docs, Sheets, + and Contacts. - `hybridclaw auth login openrouter`, `hybridclaw auth login mistral`, and `hybridclaw auth login huggingface` can take `--api-key`, otherwise they fall back to `OPENROUTER_API_KEY`, `MISTRAL_API_KEY`, or `HF_TOKEN`, or prompt @@ -67,6 +80,9 @@ hybridclaw auth whatsapp reset any saved vLLM API key. - `hybridclaw auth logout msteams` clears the stored Teams app password and disables the Teams integration in config. 
+- `hybridclaw auth logout google-workspace` clears the stored OAuth token and + pending Google auth session, but keeps the saved OAuth client secret so you + can authorize again later without re-importing the JSON file. - `hybridclaw auth whatsapp reset` clears linked WhatsApp Web auth without starting a new pairing session. - Local TUI and web sessions can store additional named secrets with @@ -80,9 +96,10 @@ hybridclaw auth whatsapp reset ## Where Credentials Live -- `~/.hybridclaw/credentials.json` stores HybridAI, OpenRouter, Mistral, - Hugging Face, Discord, email, Teams, BlueBubbles iMessage, related runtime - secrets, and named `/secret set` values in encrypted form +- `~/.hybridclaw/credentials.json` stores HybridAI, Google Workspace OAuth + client and token state, OpenRouter, Mistral, Hugging Face, Discord, email, + Teams, BlueBubbles iMessage, related runtime secrets, and named `/secret set` + values in encrypted form - `~/.hybridclaw/credentials.master.key`, `HYBRIDCLAW_MASTER_KEY`, or `/run/secrets/hybridclaw_master_key` supplies the master key used to decrypt runtime secrets diff --git a/docs/development/internals/README.md b/docs/development/internals/README.md index 815e98aa..ce453adc 100644 --- a/docs/development/internals/README.md +++ b/docs/development/internals/README.md @@ -13,5 +13,7 @@ These pages focus on how HybridClaw is built and operated under the hood. 
- [Architecture](./architecture.md) for the major runtime pieces and data flow - [Runtime Internals](./runtime.md) for sandboxing, diagnostics, audit, and operational behavior +- [Memory](./memory.md) for the built-in memory layers, prompt injection path, + and compaction/consolidation lifecycle - [Session Routing](./session-routing.md) for canonical session keys and linked-identity boundaries diff --git a/docs/development/internals/memory.md b/docs/development/internals/memory.md new file mode 100644 index 00000000..9ccd3576 --- /dev/null +++ b/docs/development/internals/memory.md @@ -0,0 +1,330 @@ +--- +title: Memory +description: Built-in HybridClaw memory layers, prompt injection paths, compaction, and default limits. +sidebar_position: 4 +--- + +# Memory + +HybridClaw's built-in memory is layered. Different stores solve different +problems: + +- `MEMORY.md` is curated long-term workspace memory +- `memory/YYYY-MM-DD.md` is today's raw memory intake +- raw session messages preserve the current conversation +- `session_summary` compresses older current-session history +- semantic memory stores query-recallable interaction summaries +- canonical memory preserves cross-session and cross-channel continuity + +On a normal turn, HybridClaw does not inject every stored artifact wholesale. +Some layers are loaded through the bootstrap prompt, some are injected through +the memory hook, and some remain storage-only until a later consolidation or +recall step. + +## At A Glance + +```text +user turn + | + +--> raw session messages ------------------------------+ + | | + | v + | recent history in prompt + | older history -> compaction + | | + | v + | session_summary + | | + | v + | memory hook + | + +--> semantic memory ("User asked ... 
I responded ...") + | | + | v + | query-matched semantic recall + | | + | v + | memory hook + | + +--> canonical cross-channel log ----------------------+ + | | + | v + | canonical summary + recent other-session recall + | | + | v + | memory hook + | + +--> memory tool / pre-compaction memory flush ------> memory/YYYY-MM-DD.md + (today only) + | + v + bootstrap hook loads today's file + | + nightly dream / /dream rewrites + | + v + MEMORY.md + | + v + bootstrap hook loads MEMORY.md +``` + +## Prompt-Time View + +On a standard built-in-memory turn, HybridClaw assembles memory from three main +places: + +1. Bootstrap files + This includes `MEMORY.md` and today's `memory/YYYY-MM-DD.md` note when it + exists. +2. Memory hook + This includes canonical cross-channel context, the current session summary, + and relevant semantic recall. +3. Recent raw history + Recent session messages are passed as normal chat history messages, not as a + summary block. + +That means `MEMORY.md` and today's daily note are not the same thing as the +semantic DB or the current session summary. They enter the prompt through +different paths and follow different update rules. + +## The Memory Layers + +### `MEMORY.md` + +`MEMORY.md` is the curated long-term memory file in the agent workspace. +HybridClaw loads it as part of the bootstrap prompt on every normal turn. + +Important properties: + +- it is prompt-time context, not a database row +- it is meant to stay curated and relatively stable +- normal `memory` tool writes should not append to it directly +- `/dream` and the scheduled consolidation pass rewrite it from older daily + notes + +Use `MEMORY.md` for durable, cleaned-up context that should persist across +sessions without carrying all raw intake forever. + +### `memory/YYYY-MM-DD.md` + +This is the raw daily memory intake file. 
+ +Important properties: + +- the `memory` tool appends here +- the pre-compaction memory flush writes here before older history is + summarized away +- only today's daily note is injected into the prompt +- older daily notes are not loaded directly once they are no longer "today" +- older daily notes are later folded into `MEMORY.md` during dream + consolidation + +This is the staging area between "the model noticed something worth keeping" +and "the workspace has a cleaned-up long-term memory file." + +### Raw Session History + +Raw session history lives in the SQLite messages table. + +Important properties: + +- recent raw turns are passed directly in the conversation history +- HybridClaw keeps only a bounded recent slice for prompt assembly +- that slice is further compressed by character limits before sending +- older turns are eventually compacted out of raw history + +This is the highest-fidelity memory for the current session, but it is the +least durable because it is the first thing that gets compacted. + +### `session_summary` + +When the current session gets large enough, HybridClaw compacts older messages +into `session_summary` and deletes those older raw rows from the active session +history. + +Important properties: + +- it belongs to the current session only +- it summarizes older turns that were compacted away +- it is injected through the memory hook under `## Session Summary` +- it decays by age and can eventually be dropped when confidence falls too low + +This is the bridge between short-term raw history and long-lived workspace +memory files. + +### Semantic Memory DB + +Semantic memory is stored in SQLite separately from raw messages. + +On each normal built-in-memory turn, HybridClaw stores one episodic semantic +memory for the completed interaction, roughly: + +```text +User asked: ... +I responded: ... +``` + +Compaction can also store semantic summary memories. 
+ +Important properties: + +- semantic memories are query-matched, not always injected +- prompt assembly recalls only a small top-N set +- recalled rows update their access metadata, so recall is stateful +- stale rows decay during dream consolidation + +This is the main built-in retrieval layer for "older but still relevant" +context. + +### Canonical Cross-Channel Memory + +Canonical memory is the cross-session and cross-channel continuity layer keyed +by `(agentId, userId)`. + +After each turn, HybridClaw appends the exchange to the canonical log for that +user. At prompt time, it can inject: + +- a compacted canonical summary +- a recent window of messages from other sessions or channels for the same user + +Important properties: + +- it is separate from the current session's raw history +- it is meant for continuity across channels, not only within one thread +- prompt assembly excludes the current session from this recall so it does not + duplicate the main history window + +If a user talks to the same agent in Discord, web chat, and TUI, this is the +layer that helps those sessions remember each other. + +## Normal Turn Lifecycle + +The built-in path for a successful turn is roughly: + +1. Store the user and assistant messages in the session message log. +2. Store one semantic memory for the completed interaction. +3. Append the same exchange to canonical cross-channel memory. +4. On the next turn, load bootstrap files including `MEMORY.md` and today's + daily note. +5. Build the memory hook from canonical context, `session_summary`, and + semantic recall. +6. Include recent raw session history as chat messages. +7. When thresholds are exceeded, run pre-compaction memory flush, write a new + `session_summary`, and delete older raw messages. +8. On `/dream` or the scheduled consolidation run, fold older daily notes into + `MEMORY.md` and decay stale semantic memories. 
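The trigger math in step 7 can be sketched as follows. This is an illustrative model, not HybridClaw's actual implementation: the helper names and the rough four-characters-per-token estimate are assumptions, while the numbers (message-count trigger of `200`, token budget of `100,000`, trigger ratio of `0.7`) are the built-in defaults documented on this page.

```typescript
// Hypothetical sketch of the session-compaction trigger check. Names are
// illustrative; only the default numbers come from the documented defaults.
interface CompactionConfig {
  threshold: number; // message-count trigger
  tokenBudget: number; // base estimated-token budget
  budgetRatio: number; // fraction of the budget that triggers compaction
}

const COMPACTION_DEFAULTS: CompactionConfig = {
  threshold: 200,
  tokenBudget: 100_000,
  budgetRatio: 0.7,
};

// Assumed rough token estimate: about 4 characters per token.
function estimateTokens(messages: string[]): number {
  const chars = messages.reduce((sum, message) => sum + message.length, 0);
  return Math.ceil(chars / 4);
}

// Compaction can trigger on message volume alone, or on estimated tokens
// crossing budget * ratio (about 70,000 with the defaults).
function shouldCompact(
  messages: string[],
  config: CompactionConfig = COMPACTION_DEFAULTS,
): boolean {
  if (messages.length >= config.threshold) return true;
  return estimateTokens(messages) >= config.tokenBudget * config.budgetRatio;
}
```

With the defaults, a 200-message session compacts regardless of size, and a shorter session compacts once its estimated token count crosses roughly `70,000`.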
+ +## Defaults, Caps, And Thresholds + +The values below describe the built-in defaults in the current codebase. + +### Bootstrap File And Daily Note Caps + +| Limit | Default | Meaning | +| --- | ---: | --- | +| bootstrap file read cap | `20,000` chars per file | `MEMORY.md` and other bootstrap files are trimmed before prompt assembly | +| daily note prompt load | today only | only `memory/YYYY-MM-DD.md` for the current date is injected | + +### Recent Session History + +| Limit | Default | Meaning | +| --- | ---: | --- | +| recent history fetch window | `40` messages | max recent non-silent messages loaded into prompt history | +| history char budget | `24,000` chars | total char budget after history optimization | +| per-message history cap | `1,200` chars | each history message is bounded before budgeting | +| protected head messages | `4` | oldest messages kept during middle compression | +| protected tail messages | `8` | newest messages kept during middle compression | + +### Session Compaction + +| Limit | Default | Meaning | +| --- | ---: | --- | +| compaction enabled | `true` | current-session compaction runs automatically | +| message-count trigger | `200` messages | compaction can trigger on message volume alone | +| token budget | `100,000` estimated tokens | base compaction budget | +| trigger ratio | `0.7` | compaction triggers at about `70,000` estimated tokens | +| keep recent after compaction | `40` messages | recent raw turns retained after older rows are summarized | +| summary max size | `8,000` chars | `session_summary` is truncated to this size | +| compaction source transcript | `240` messages / `80,000` chars | max older-history excerpt sent into the compaction summary prompt | + +### Pre-Compaction Memory Flush + +| Limit | Default | Meaning | +| --- | ---: | --- | +| memory flush enabled | `true` | run a memory-writing pass before compaction | +| flush source window | `80` messages | max older messages shown to the memory flush turn | +| 
flush source cap | `24,000` chars | max transcript chars shown to the memory flush turn | + +### Semantic Memory + +| Limit | Default | Meaning | +| --- | ---: | --- | +| semantic write cap | `1,200` chars | stored semantic memory content is normalized and truncated to this size | +| prompt recall default | `5` memories | normal prompt assembly asks for up to five semantic matches | +| prompt recall hard cap | `12` memories | `buildPromptMemoryContext()` clamps prompt injection to at most twelve memories | +| low-level recall hard cap | `50` memories | lower-level semantic recall API maximum for non-prompt callers | +| semantic min confidence | `0.2` | rows below this are not recalled by default | +| semantic decay rate | `0.1` | nightly decay multiplies stale confidence by `0.9` | +| semantic stale threshold | `7` days | only memories not accessed for at least seven days are decayed | +| semantic decay floor | `0.1` | nightly decay never pushes confidence below this floor | + +### Session Summary Decay + +| Limit | Default | Meaning | +| --- | ---: | --- | +| summary decay rate | `0.04` | summary confidence decays by age before prompt injection | +| summary min confidence | `0.1` | minimum decayed summary confidence | +| summary discard threshold | `0.22` | summaries below this confidence are omitted from the prompt | + +### Canonical Cross-Channel Memory + +| Limit | Default | Meaning | +| --- | ---: | --- | +| canonical storage window | `50` messages | recent canonical messages kept before another compaction cycle | +| canonical compaction threshold | `100` messages | canonical summary refresh can trigger after this many stored messages | +| canonical summary cap | `4,000` chars | max size of compacted canonical summary | +| canonical message cap | `220` chars | each canonical message is shortened before summary building | +| prompt fetch window | `12` messages | prompt-time canonical recall window before excluding current session | +| prompt display window | 
`6` messages | only the last six canonical messages are rendered into the prompt section | + +### Dream Consolidation + +| Limit | Default | Meaning | +| --- | ---: | --- | +| scheduled interval | `24` hours | default nightly consolidation cadence | +| disable scheduled runs | `0` hours | disables automatic consolidation | +| consolidation language | `en` | language hint for model-backed cleanup passes | +| semantic decay rate source | `memory.decayRate` | scheduled consolidation reuses this runtime config value | + +## What Operators Can Tune + +These areas are explicitly configurable: + +- `sessionCompaction.tokenBudget` +- `sessionCompaction.budgetRatio` +- `sessionCompaction.threshold` +- `sessionCompaction.keepRecent` +- `sessionCompaction.summaryMaxChars` +- `sessionCompaction.preCompactionMemoryFlush.*` +- `memory.decayRate` +- `memory.consolidationIntervalHours` +- `memory.consolidationLanguage` + +These commonly noticed caps are currently code defaults rather than normal +operator config: + +- bootstrap file cap of `20,000` chars +- recent prompt history fetch window of `40` messages +- prompt-time semantic recall hard cap of `12` +- prompt-time canonical fetch window of `12` + +## Plugin Caveat + +This page describes HybridClaw's built-in memory layer. + +If a plugin declares that it replaces built-in memory behavior, the gateway can +skip parts of the native memory injection and compaction flow. Plugins that +layer on top of native memory, such as additive external memory providers, do +not change the built-in behavior described above unless they explicitly replace +it. 
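The decay rules in the defaults tables above can be sketched numerically. A minimal sketch, under two assumptions: the semantic rule follows the table directly (rows untouched for seven or more days are multiplied by `1 - decayRate` with a floor of `0.1`), while the session-summary formula assumes simple linear per-day decay, a shape this page does not spell out.

```typescript
// Illustrative decay math for the documented defaults. The semantic rule is
// taken from the defaults table; the linear summary formula is an assumption.
const SEMANTIC_DECAY_RATE = 0.1;
const SEMANTIC_DECAY_FLOOR = 0.1;
const SEMANTIC_STALE_DAYS = 7;

const SUMMARY_DECAY_RATE = 0.04;
const SUMMARY_MIN_CONFIDENCE = 0.1;
const SUMMARY_DISCARD_THRESHOLD = 0.22;

// Nightly semantic decay: only rows not accessed for at least 7 days change.
function decaySemanticConfidence(
  confidence: number,
  daysSinceAccess: number,
): number {
  if (daysSinceAccess < SEMANTIC_STALE_DAYS) return confidence;
  return Math.max(
    SEMANTIC_DECAY_FLOOR,
    confidence * (1 - SEMANTIC_DECAY_RATE),
  );
}

// Prompt-time summary confidence, assuming linear decay by age in days.
function decayedSummaryConfidence(
  confidence: number,
  ageDays: number,
): number {
  return Math.max(
    SUMMARY_MIN_CONFIDENCE,
    confidence - SUMMARY_DECAY_RATE * ageDays,
  );
}

// Summaries whose decayed confidence falls below the discard threshold are
// omitted from the prompt entirely.
function summaryIncludedInPrompt(confidence: number, ageDays: number): boolean {
  return decayedSummaryConfidence(confidence, ageDays) >= SUMMARY_DISCARD_THRESHOLD;
}
```

Under this model, a fresh semantic memory at confidence `0.5` is untouched until it goes stale, then drops to `0.45` on the first stale night, while an aging session summary falls out of the prompt once its decayed confidence slips under `0.22`.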
diff --git a/docs/development/internals/session-routing.md b/docs/development/internals/session-routing.md index 6d73b822..1a2dcbea 100644 --- a/docs/development/internals/session-routing.md +++ b/docs/development/internals/session-routing.md @@ -1,7 +1,7 @@ --- title: Session Routing description: Canonical session keys, routing boundaries, and identity isolation across HybridClaw channels. -sidebar_position: 4 +sidebar_position: 5 --- # Session Routing diff --git a/docs/development/reference/commands.md b/docs/development/reference/commands.md index aa22a9d8..7b63b25f 100644 --- a/docs/development/reference/commands.md +++ b/docs/development/reference/commands.md @@ -18,6 +18,7 @@ hybridclaw gateway hybridclaw gateway compact hybridclaw gateway reset [yes|no] hybridclaw eval [list|env|] [--current-agent|--fresh-agent] [--ablate-system] [--include-prompt=] [--omit-prompt=] +hybridclaw eval locomo [setup|run|status|stop|results|logs] hybridclaw eval terminal-bench-2.0 [setup|run|status|stop|results|logs] hybridclaw eval tau2 [setup|run|status|stop|results] hybridclaw eval [--current-agent|--fresh-agent] [--ablate-system] [--include-prompt=] [--omit-prompt=] @@ -55,6 +56,9 @@ HybridClaw's loopback OpenAI-compatible API. 
```bash hybridclaw eval list hybridclaw eval env +hybridclaw eval locomo setup +hybridclaw eval locomo run --budget 4000 --max-questions 20 +hybridclaw eval locomo run --mode retrieval --budget 4000 --max-questions 20 hybridclaw eval tau2 setup hybridclaw eval tau2 run --domain telecom --num-trials 1 --num-tasks 10 hybridclaw eval terminal-bench-2.0 setup @@ -64,11 +68,22 @@ hybridclaw eval --fresh-agent --omit-prompt=bootstrap inspect eval inspect_evals - local-only surface from CLI, TUI, or embedded web chat; it is not intended for Discord, Teams, WhatsApp, email, or other remote chat channels -- managed suites today: `tau2` and `terminal-bench-2.0` +- managed suites today: `locomo`, `tau2`, and `terminal-bench-2.0` +- `locomo --mode qa` runs a native HybridClaw QA harness against the official + LoCoMo conversations, generates answers through the local OpenAI-compatible + gateway, and scores those answers with LoCoMo-style question metrics +- `locomo --mode retrieval` skips model generation, ingests each conversation + into an isolated native memory session, and scores evidence hit-rate from + recalled semantic memories +- `locomo --num-samples` limits conversation records; use `--max-questions` + for quick smoke tests over a small question slice +- by default, `locomo --mode qa` creates one fresh template-seeded agent + workspace per conversation sample; use `--current-agent` to reuse the current + agent workspace - `swebench-verified`, `agentbench`, and `gaia` currently print starter recipes and setup guidance rather than a native managed runner -- the default eval mode keeps the current agent workspace but opens a fresh - OpenAI-compatible session per request +- outside suite-specific overrides, the default eval mode keeps the current + agent workspace but opens a fresh OpenAI-compatible session per request - `--fresh-agent` uses a temporary template-seeded agent workspace for each eval request - detached run logs and summaries are stored under @@ -99,6 +114,7 
@@ hybridclaw auth login [provider] ... hybridclaw auth status hybridclaw auth logout hybridclaw auth whatsapp reset +hybridclaw auth login google-workspace [--client-secret <path>] [--auth-url|--auth-code <url-or-code>] hybridclaw auth login msteams [--app-id <id>] [--app-password <secret>] [--tenant-id <id>] hybridclaw local status hybridclaw local configure [model-id] [--base-url <url>] [--api-key <key>] [--no-default] @@ -108,8 +124,8 @@ hybridclaw help mistral hybridclaw help huggingface ``` -`auth status` supports `hybridai`, `codex`, `openrouter`, `mistral`, -`huggingface`, `local`, and `msteams`. +`auth status` supports `hybridai`, `codex`, `google-workspace`, +`openrouter`, `mistral`, `huggingface`, `local`, and `msteams`. Legacy aliases such as `hybridclaw hybridai ...`, `hybridclaw codex ...`, and `hybridclaw local ...` still work, but `auth` is the primary surface. diff --git a/docs/development/reference/configuration.md b/docs/development/reference/configuration.md index cdf62e4c..265230af 100644 --- a/docs/development/reference/configuration.md +++ b/docs/development/reference/configuration.md @@ -23,7 +23,8 @@ or restore tracked config snapshots. ## Runtime Files - `~/.hybridclaw/config.json` for typed runtime config -- `~/.hybridclaw/credentials.json` for encrypted runtime secrets +- `~/.hybridclaw/credentials.json` for encrypted runtime secrets, including + Google Workspace OAuth client/token state - `~/.hybridclaw/credentials.master.key` for the local owner-only fallback master key when no external key source is configured - `~/.hybridclaw/codex-auth.json` for Codex OAuth state @@ -168,7 +169,8 @@ Keep runtime secrets in the encrypted `~/.hybridclaw/credentials.json` store. Common built-in entries include `HYBRIDAI_API_KEY`, `OPENROUTER_API_KEY`, `HF_TOKEN`, `OPENAI_API_KEY`, `GROQ_API_KEY`, `DEEPGRAM_API_KEY`, `GEMINI_API_KEY`, `GOOGLE_API_KEY`, `DISCORD_TOKEN`, `EMAIL_PASSWORD`, -`IMESSAGE_PASSWORD`, and `MSTEAMS_APP_PASSWORD`.
+`IMESSAGE_PASSWORD`, `MSTEAMS_APP_PASSWORD`, and the stored Google Workspace +OAuth client/token payloads used by `auth login google-workspace`. Codex OAuth sessions are stored separately in `~/.hybridclaw/codex-auth.json`. Trust-model acceptance is persisted in `config.json` under `security.*` and is diff --git a/docs/static/docs.js b/docs/static/docs.js index d2fd62af..f45fe0d6 100644 --- a/docs/static/docs.js +++ b/docs/static/docs.js @@ -62,6 +62,7 @@ export const DEVELOPMENT_DOCS_SECTIONS = [ { title: 'Internals', path: 'internals/README.md' }, { title: 'Architecture', path: 'internals/architecture.md' }, { title: 'Runtime', path: 'internals/runtime.md' }, + { title: 'Memory', path: 'internals/memory.md' }, { title: 'Session Routing', path: 'internals/session-routing.md' }, ], }, diff --git a/package-lock.json b/package-lock.json index 7220732c..6497ef30 100644 --- a/package-lock.json +++ b/package-lock.json @@ -34,6 +34,7 @@ "pptxgenjs": "4.0.1", "qrcode-terminal": "0.12.0", "sanitize-html": "2.17.1", + "stemmer": "2.0.1", "ws": "8.20.0", "xlsx-populate": "1.21.0", "yaml": "2.8.3", @@ -86,7 +87,7 @@ }, "container": { "name": "hybridclaw-agent", - "version": "0.12.0", + "version": "0.12.2", "dependencies": { "@modelcontextprotocol/sdk": "1.27.1", "@mozilla/readability": "0.6.0", @@ -10880,6 +10881,19 @@ "dev": true, "license": "MIT" }, + "node_modules/stemmer": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/stemmer/-/stemmer-2.0.1.tgz", + "integrity": "sha512-bkWvSX2JR4nSZFfs113kd4C6X13bBBrg4fBKv2pVdzpdQI2LA5pZcWzTFNdkYsiUNl13E4EzymSRjZ0D55jBYg==", + "license": "MIT", + "bin": { + "stemmer": "cli.js" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/streamx": { "version": "2.25.0", "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.25.0.tgz", diff --git a/package.json b/package.json index 4e8ab199..5d55b3f0 100644 --- a/package.json +++ b/package.json @@ -100,6 +100,7 @@ 
"pptxgenjs": "4.0.1", "qrcode-terminal": "0.12.0", "sanitize-html": "2.17.1", + "stemmer": "2.0.1", "ws": "8.20.0", "xlsx-populate": "1.21.0", "yaml": "2.8.3", diff --git a/skills/google-workspace/SKILL.md b/skills/google-workspace/SKILL.md index 96da207b..a30e6ebb 100644 --- a/skills/google-workspace/SKILL.md +++ b/skills/google-workspace/SKILL.md @@ -33,6 +33,7 @@ Use this skill for Google Workspace workflows that go beyond HybridClaw's built- 2. For Calendar, Drive, Docs, or Sheets tasks, **default to browser automation** using the persistent browser profile. Do not ask which method to use — just try the browser first. 3. If the browser hits a login page, tell the user to run `hybridclaw browser login` to sign in once, then retry. Do not ask for credentials in chat. 4. Only fall back to API-based access if the user explicitly requests it or browser automation is unavailable. +5. When the user wants an API path, prefer HybridClaw's built-in Google Workspace OAuth flow before inventing ad hoc token files. ## Proactive Behavior @@ -59,8 +60,12 @@ If the user explicitly requests API automation and does not have credentials: 1. Tell them to create a Google Cloud project. 2. Enable the APIs they need: Gmail, Calendar, Drive, Docs, Sheets, People. 3. Create an OAuth desktop client or service account, depending on their environment. -4. Keep credential files outside the repo and outside version control. -5. Confirm which scopes are actually needed before proceeding. +4. Keep the downloaded client secret JSON outside the repo and outside version control. +5. Use HybridClaw's auth flow: + `hybridclaw auth login google-workspace --client-secret /path/to/client_secret.json` + `hybridclaw auth login google-workspace --auth-url` + `hybridclaw auth login google-workspace --auth-code ""` +6. 
Tell the user that the current built-in auth flow requests HybridClaw's standard Google Workspace scope bundle (Gmail, Calendar, Drive, Docs, Sheets, Contacts) rather than per-service scopes. ## Rules diff --git a/src/auth/google-workspace-auth.ts b/src/auth/google-workspace-auth.ts new file mode 100644 index 00000000..ed277ef1 --- /dev/null +++ b/src/auth/google-workspace-auth.ts @@ -0,0 +1,710 @@ +import { createHash, randomBytes } from 'node:crypto'; +import fs from 'node:fs'; + +import { + readStoredRuntimeSecret, + runtimeSecretsPath, + saveNamedRuntimeSecrets, +} from '../security/runtime-secrets.js'; + +export const GOOGLE_WORKSPACE_PROVIDER = 'google-workspace'; +export const GOOGLE_WORKSPACE_REDIRECT_URI = 'http://localhost:1'; +export const GOOGLE_WORKSPACE_AUTH_URL = + 'https://accounts.google.com/o/oauth2/v2/auth'; +export const GOOGLE_WORKSPACE_TOKEN_URL = 'https://oauth2.googleapis.com/token'; +export const GOOGLE_WORKSPACE_REFRESH_SKEW_MS = 2 * 60_000; +export const GOOGLE_WORKSPACE_CLIENT_SECRET_KEY = + 'GOOGLE_WORKSPACE_CLIENT_SECRET_JSON'; +export const GOOGLE_WORKSPACE_TOKEN_KEY = 'GOOGLE_WORKSPACE_TOKEN_JSON'; +export const GOOGLE_WORKSPACE_PENDING_AUTH_KEY = + 'GOOGLE_WORKSPACE_PENDING_AUTH_JSON'; +export const GOOGLE_WORKSPACE_SCOPES = [ + 'https://www.googleapis.com/auth/gmail.readonly', + 'https://www.googleapis.com/auth/gmail.send', + 'https://www.googleapis.com/auth/gmail.modify', + 'https://www.googleapis.com/auth/calendar', + 'https://www.googleapis.com/auth/drive.readonly', + 'https://www.googleapis.com/auth/contacts.readonly', + 'https://www.googleapis.com/auth/spreadsheets', + 'https://www.googleapis.com/auth/documents.readonly', +] as const; + +type GoogleWorkspaceAuthErrorCode = + | 'google_workspace_client_secret_missing' + | 'google_workspace_client_secret_invalid' + | 'google_workspace_pending_auth_missing' + | 'google_workspace_pending_auth_invalid' + | 'google_workspace_state_mismatch' + | 'google_workspace_token_missing' + | 
'google_workspace_token_invalid' + | 'google_workspace_token_exchange_failed' + | 'google_workspace_refresh_failed'; + +interface StoredGoogleWorkspaceClientSecret { + clientId: string; + clientSecret: string; + authUri: string; + tokenUri: string; +} + +interface StoredGoogleWorkspaceToken { + accessToken: string; + refreshToken: string; + tokenType: string; + scopes: string[]; + expiresAt: number; + updatedAt: string; +} + +interface StoredGoogleWorkspacePendingAuth { + state: string; + codeVerifier: string; + redirectUri: string; + scopes: string[]; + createdAt: string; +} + +interface RawGoogleClientSecretSection { + client_id?: unknown; + client_secret?: unknown; + auth_uri?: unknown; + token_uri?: unknown; +} + +interface RawGoogleClientSecretFile { + installed?: RawGoogleClientSecretSection; + web?: RawGoogleClientSecretSection; +} + +export interface GoogleWorkspaceAuthStatus { + authenticated: boolean; + path: string; + clientConfigured: boolean; + pendingAuthorization: boolean; + refreshTokenConfigured: boolean; + reloginRequired: boolean; + expiresAt: number | null; + scopes: string[]; +} + +export interface SaveGoogleWorkspaceClientSecretResult { + path: string; + clientId: string; +} + +export interface StartGoogleWorkspaceAuthResult { + path: string; + authUrl: string; + redirectUri: string; +} + +export interface ExchangeGoogleWorkspaceAuthCodeResult { + path: string; + expiresAt: number; + scopes: string[]; +} + +export interface EnsureFreshGoogleWorkspaceAccessTokenResult { + accessToken: string; + expiresAt: number; + scopes: string[]; + refreshed: boolean; +} + +export class GoogleWorkspaceAuthError extends Error { + code: GoogleWorkspaceAuthErrorCode; + reloginRequired: boolean; + + constructor( + code: GoogleWorkspaceAuthErrorCode, + message: string, + options?: { + cause?: unknown; + reloginRequired?: boolean; + }, + ) { + super(message, { cause: options?.cause }); + this.name = 'GoogleWorkspaceAuthError'; + this.code = code; + 
this.reloginRequired = options?.reloginRequired === true; + } +} + +function nowIso(): string { + return new Date().toISOString(); +} + +function normalizeString(value: unknown): string { + return typeof value === 'string' ? value.trim() : ''; +} + +function normalizeScopes(value: unknown): string[] { + if (Array.isArray(value)) { + return value + .map((entry) => normalizeString(entry)) + .filter((entry) => entry.length > 0); + } + const raw = normalizeString(value); + if (!raw) return []; + return raw + .split(/\s+/) + .map((entry) => entry.trim()) + .filter((entry) => entry.length > 0); +} + +function normalizeTimestamp(value: unknown): number { + if (typeof value === 'number' && Number.isFinite(value)) return value; + if (typeof value === 'string' && value.trim()) { + const parsed = Date.parse(value); + if (Number.isFinite(parsed)) return parsed; + const numeric = Number(value); + if (Number.isFinite(numeric)) return numeric; + } + return 0; +} + +function toBase64Url(buffer: Buffer): string { + return buffer + .toString('base64') + .replace(/\+/g, '-') + .replace(/\//g, '_') + .replace(/=+$/g, ''); +} + +function generatePkcePair(): { + verifier: string; + challenge: string; +} { + const verifier = toBase64Url(randomBytes(32)); + const challenge = toBase64Url(createHash('sha256').update(verifier).digest()); + return { verifier, challenge }; +} + +function generateState(): string { + return toBase64Url(randomBytes(32)); +} + +function parseJsonSecret( + key: string, + raw: string | null, +): Record<string, unknown> | null { + const normalized = raw?.trim() || ''; + if (!normalized) return null; + try { + const parsed = JSON.parse(normalized) as unknown; + if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { + return parsed as Record<string, unknown>; + } + } catch (cause) { + throw new GoogleWorkspaceAuthError( + key === GOOGLE_WORKSPACE_CLIENT_SECRET_KEY + ? 'google_workspace_client_secret_invalid' + : key === GOOGLE_WORKSPACE_PENDING_AUTH_KEY + ?
'google_workspace_pending_auth_invalid' + : 'google_workspace_token_invalid', + `Stored Google Workspace auth data for ${key} is not valid JSON.`, + { + cause, + reloginRequired: key !== GOOGLE_WORKSPACE_CLIENT_SECRET_KEY, + }, + ); + } + + throw new GoogleWorkspaceAuthError( + key === GOOGLE_WORKSPACE_CLIENT_SECRET_KEY + ? 'google_workspace_client_secret_invalid' + : key === GOOGLE_WORKSPACE_PENDING_AUTH_KEY + ? 'google_workspace_pending_auth_invalid' + : 'google_workspace_token_invalid', + `Stored Google Workspace auth data for ${key} has an invalid structure.`, + { + reloginRequired: key !== GOOGLE_WORKSPACE_CLIENT_SECRET_KEY, + }, + ); +} + +function readStoredClientSecret(): StoredGoogleWorkspaceClientSecret | null { + const parsed = parseJsonSecret( + GOOGLE_WORKSPACE_CLIENT_SECRET_KEY, + readStoredRuntimeSecret(GOOGLE_WORKSPACE_CLIENT_SECRET_KEY), + ); + if (!parsed) return null; + + const clientId = normalizeString(parsed.clientId); + const clientSecret = normalizeString(parsed.clientSecret); + if (!clientId || !clientSecret) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_client_secret_invalid', + 'Stored Google Workspace client secret is missing required fields.', + ); + } + + return { + clientId, + clientSecret, + authUri: normalizeString(parsed.authUri) || GOOGLE_WORKSPACE_AUTH_URL, + tokenUri: normalizeString(parsed.tokenUri) || GOOGLE_WORKSPACE_TOKEN_URL, + }; +} + +function requireStoredClientSecret(): StoredGoogleWorkspaceClientSecret { + const clientSecret = readStoredClientSecret(); + if (clientSecret) return clientSecret; + throw new GoogleWorkspaceAuthError( + 'google_workspace_client_secret_missing', + 'Google Workspace client secret is not configured. 
Run `hybridclaw auth login google-workspace --client-secret ` first.', + ); +} + +function readStoredToken(): StoredGoogleWorkspaceToken | null { + const parsed = parseJsonSecret( + GOOGLE_WORKSPACE_TOKEN_KEY, + readStoredRuntimeSecret(GOOGLE_WORKSPACE_TOKEN_KEY), + ); + if (!parsed) return null; + + const accessToken = normalizeString(parsed.accessToken); + const refreshToken = normalizeString(parsed.refreshToken); + const tokenType = normalizeString(parsed.tokenType) || 'Bearer'; + const expiresAt = normalizeTimestamp(parsed.expiresAt); + const scopes = normalizeScopes(parsed.scopes); + if (!accessToken && !refreshToken) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_token_invalid', + 'Stored Google Workspace token is missing both access and refresh tokens.', + { reloginRequired: true }, + ); + } + + return { + accessToken, + refreshToken, + tokenType, + expiresAt, + scopes, + updatedAt: normalizeString(parsed.updatedAt) || nowIso(), + }; +} + +function requireStoredPendingAuth(): StoredGoogleWorkspacePendingAuth { + const parsed = parseJsonSecret( + GOOGLE_WORKSPACE_PENDING_AUTH_KEY, + readStoredRuntimeSecret(GOOGLE_WORKSPACE_PENDING_AUTH_KEY), + ); + if (!parsed) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_pending_auth_missing', + 'No pending Google Workspace OAuth session was found. Run `hybridclaw auth login google-workspace --auth-url` first.', + ); + } + + const state = normalizeString(parsed.state); + const codeVerifier = normalizeString(parsed.codeVerifier); + const redirectUri = + normalizeString(parsed.redirectUri) || GOOGLE_WORKSPACE_REDIRECT_URI; + const scopes = normalizeScopes(parsed.scopes); + if (!state || !codeVerifier) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_pending_auth_invalid', + 'Stored Google Workspace OAuth session is missing PKCE state.', + ); + } + + return { + state, + codeVerifier, + redirectUri, + scopes: scopes.length > 0 ? 
scopes : [...GOOGLE_WORKSPACE_SCOPES], + createdAt: normalizeString(parsed.createdAt) || nowIso(), + }; +} + +function buildTokenExpiresAt(expiresIn: unknown): number { + const numeric = + typeof expiresIn === 'number' + ? expiresIn + : typeof expiresIn === 'string' + ? Number(expiresIn) + : 0; + if (!Number.isFinite(numeric) || numeric <= 0) { + return Date.now() + 60 * 60_000; + } + return Date.now() + numeric * 1_000; +} + +function extractClientSecretSection( + raw: RawGoogleClientSecretFile, +): RawGoogleClientSecretSection { + return raw.installed || raw.web || {}; +} + +function parseGoogleClientSecretJson( + raw: string, +): StoredGoogleWorkspaceClientSecret { + let parsed: RawGoogleClientSecretFile; + try { + parsed = JSON.parse(raw) as RawGoogleClientSecretFile; + } catch (cause) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_client_secret_invalid', + 'Google Workspace client secret file is not valid JSON.', + { cause }, + ); + } + + const section = extractClientSecretSection(parsed); + const clientId = normalizeString(section.client_id); + const clientSecret = normalizeString(section.client_secret); + if (!clientId || !clientSecret) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_client_secret_invalid', + 'Google Workspace client secret file is missing `client_id` or `client_secret`.', + ); + } + + return { + clientId, + clientSecret, + authUri: normalizeString(section.auth_uri) || GOOGLE_WORKSPACE_AUTH_URL, + tokenUri: normalizeString(section.token_uri) || GOOGLE_WORKSPACE_TOKEN_URL, + }; +} + +function parseTokenResponsePayload( + payload: unknown, + fallbackMessage: string, +): string { + if (!payload || typeof payload !== 'object' || Array.isArray(payload)) { + return fallbackMessage; + } + + const record = payload as Record<string, unknown>; + const error = normalizeString(record.error); + const description = + normalizeString(record.error_description) || + normalizeString(record.errorDescription) || + normalizeString(record.message); + if 
(error && description) return `${error}: ${description}`; + if (description) return description; + if (error) return error; + return fallbackMessage; +} + +function buildAuthUrl(params: { + clientId: string; + authUri: string; + redirectUri: string; + state: string; + challenge: string; + scopes: string[]; +}): string { + const query = new URLSearchParams({ + client_id: params.clientId, + redirect_uri: params.redirectUri, + response_type: 'code', + scope: params.scopes.join(' '), + access_type: 'offline', + prompt: 'consent', + code_challenge: params.challenge, + code_challenge_method: 'S256', + include_granted_scopes: 'true', + state: params.state, + }); + return `${params.authUri}?${query.toString()}`; +} + +function extractCodeAndState(codeOrUrl: string): { + code: string; + state: string | null; +} { + const trimmed = codeOrUrl.trim(); + if (!trimmed) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_pending_auth_missing', + 'Google Workspace authorization code cannot be empty.', + ); + } + if (!trimmed.startsWith('http://') && !trimmed.startsWith('https://')) { + return { + code: trimmed, + state: null, + }; + } + + const parsed = new URL(trimmed); + const code = normalizeString(parsed.searchParams.get('code')); + if (!code) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_pending_auth_invalid', + 'Google Workspace redirect URL does not contain a `code` parameter.', + ); + } + + return { + code, + state: normalizeString(parsed.searchParams.get('state')) || null, + }; +} + +async function exchangeTokenRequest( + tokenUri: string, + body: URLSearchParams, + errorCode: + | 'google_workspace_token_exchange_failed' + | 'google_workspace_refresh_failed', +): Promise<Record<string, unknown>> { + let response: Response; + try { + response = await fetch(tokenUri, { + method: 'POST', + headers: { + 'content-type': 'application/x-www-form-urlencoded', + }, + body, + }); + } catch (cause) { + throw new GoogleWorkspaceAuthError( + errorCode, + `Google Workspace token request 
failed: ${cause instanceof Error ? cause.message : String(cause)}.`, + { + cause, + reloginRequired: errorCode === 'google_workspace_refresh_failed', + }, + ); + } + + let payload: unknown = null; + try { + payload = (await response.json()) as unknown; + } catch { + payload = null; + } + if (!response.ok || !payload || typeof payload !== 'object') { + throw new GoogleWorkspaceAuthError( + errorCode, + parseTokenResponsePayload( + payload, + `Google Workspace token request failed with HTTP ${response.status}.`, + ), + { + reloginRequired: errorCode === 'google_workspace_refresh_failed', + }, + ); + } + + return payload as Record<string, unknown>; +} + +export function saveGoogleWorkspaceClientSecretFile( + filePath: string, +): SaveGoogleWorkspaceClientSecretResult { + const resolved = filePath.trim(); + if (!resolved) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_client_secret_missing', + 'Google Workspace client secret path cannot be empty.', + ); + } + const raw = fs.readFileSync(resolved, 'utf-8'); + const clientSecret = parseGoogleClientSecretJson(raw); + const path = saveNamedRuntimeSecrets({ + [GOOGLE_WORKSPACE_CLIENT_SECRET_KEY]: JSON.stringify(clientSecret), + }); + return { + path, + clientId: clientSecret.clientId, + }; +} + +export function startGoogleWorkspaceAuth(): StartGoogleWorkspaceAuthResult { + const clientSecret = requireStoredClientSecret(); + const pkce = generatePkcePair(); + const state = generateState(); + const pending: StoredGoogleWorkspacePendingAuth = { + state, + codeVerifier: pkce.verifier, + redirectUri: GOOGLE_WORKSPACE_REDIRECT_URI, + scopes: [...GOOGLE_WORKSPACE_SCOPES], + createdAt: nowIso(), + }; + const path = saveNamedRuntimeSecrets({ + [GOOGLE_WORKSPACE_PENDING_AUTH_KEY]: JSON.stringify(pending), + }); + + return { + path, + authUrl: buildAuthUrl({ + clientId: clientSecret.clientId, + authUri: clientSecret.authUri, + redirectUri: pending.redirectUri, + state, + challenge: pkce.challenge, + scopes: pending.scopes, + }), + 
redirectUri: pending.redirectUri, + }; +} + +export async function exchangeGoogleWorkspaceAuthCode( + codeOrUrl: string, +): Promise { + const clientSecret = requireStoredClientSecret(); + const pending = requireStoredPendingAuth(); + const existingToken = readStoredToken(); + const { code, state } = extractCodeAndState(codeOrUrl); + if (state && state !== pending.state) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_state_mismatch', + 'Google Workspace authorization response state mismatch. Run `hybridclaw auth login google-workspace --auth-url` again.', + ); + } + + const payload = await exchangeTokenRequest( + clientSecret.tokenUri, + new URLSearchParams({ + client_id: clientSecret.clientId, + client_secret: clientSecret.clientSecret, + code, + code_verifier: pending.codeVerifier, + redirect_uri: pending.redirectUri, + grant_type: 'authorization_code', + }), + 'google_workspace_token_exchange_failed', + ); + + const accessToken = normalizeString(payload.access_token); + const refreshToken = + normalizeString(payload.refresh_token) || existingToken?.refreshToken || ''; + if (!accessToken || !refreshToken) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_token_exchange_failed', + 'Google Workspace token exchange did not return a usable access and refresh token.', + { reloginRequired: true }, + ); + } + + const scopes = normalizeScopes(payload.scope); + const nextToken: StoredGoogleWorkspaceToken = { + accessToken, + refreshToken, + tokenType: normalizeString(payload.token_type) || 'Bearer', + scopes: scopes.length > 0 ? 
scopes : pending.scopes, + expiresAt: buildTokenExpiresAt(payload.expires_in), + updatedAt: nowIso(), + }; + const path = saveNamedRuntimeSecrets({ + [GOOGLE_WORKSPACE_TOKEN_KEY]: JSON.stringify(nextToken), + [GOOGLE_WORKSPACE_PENDING_AUTH_KEY]: null, + }); + + return { + path, + expiresAt: nextToken.expiresAt, + scopes: nextToken.scopes, + }; +} + +export async function ensureFreshGoogleWorkspaceAccessToken(): Promise { + const token = readStoredToken(); + if (!token || (!token.accessToken && !token.refreshToken)) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_token_missing', + 'Google Workspace OAuth token is not configured. Run `hybridclaw auth login google-workspace` first.', + { reloginRequired: true }, + ); + } + + if ( + token.accessToken && + token.expiresAt > Date.now() + GOOGLE_WORKSPACE_REFRESH_SKEW_MS + ) { + return { + accessToken: token.accessToken, + expiresAt: token.expiresAt, + scopes: token.scopes, + refreshed: false, + }; + } + + if (!token.refreshToken) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_refresh_failed', + 'Google Workspace refresh token is missing. 
Re-run `hybridclaw auth login google-workspace`.', + { reloginRequired: true }, + ); + } + + const clientSecret = requireStoredClientSecret(); + const payload = await exchangeTokenRequest( + clientSecret.tokenUri, + new URLSearchParams({ + client_id: clientSecret.clientId, + client_secret: clientSecret.clientSecret, + refresh_token: token.refreshToken, + grant_type: 'refresh_token', + }), + 'google_workspace_refresh_failed', + ); + + const accessToken = normalizeString(payload.access_token); + if (!accessToken) { + throw new GoogleWorkspaceAuthError( + 'google_workspace_refresh_failed', + 'Google Workspace refresh did not return a new access token.', + { reloginRequired: true }, + ); + } + + const refreshedToken: StoredGoogleWorkspaceToken = { + accessToken, + refreshToken: normalizeString(payload.refresh_token) || token.refreshToken, + tokenType: + normalizeString(payload.token_type) || token.tokenType || 'Bearer', + scopes: normalizeScopes(payload.scope).length + ? normalizeScopes(payload.scope) + : token.scopes, + expiresAt: buildTokenExpiresAt(payload.expires_in), + updatedAt: nowIso(), + }; + saveNamedRuntimeSecrets({ + [GOOGLE_WORKSPACE_TOKEN_KEY]: JSON.stringify(refreshedToken), + }); + + return { + accessToken: refreshedToken.accessToken, + expiresAt: refreshedToken.expiresAt, + scopes: refreshedToken.scopes, + refreshed: true, + }; +} + +export function getGoogleWorkspaceAuthStatus(): GoogleWorkspaceAuthStatus { + const clientConfigured = readStoredClientSecret() != null; + const token = readStoredToken(); + const pendingAuthorization = + readStoredRuntimeSecret(GOOGLE_WORKSPACE_PENDING_AUTH_KEY) != null; + const expiresAt = token?.expiresAt || 0; + const expiresSoon = + expiresAt > 0 && expiresAt <= Date.now() + GOOGLE_WORKSPACE_REFRESH_SKEW_MS; + + return { + authenticated: Boolean(token?.accessToken || token?.refreshToken), + path: runtimeSecretsPath(), + clientConfigured, + pendingAuthorization, + refreshTokenConfigured: Boolean(token?.refreshToken), + 
reloginRequired: + Boolean(token) && + expiresSoon && + (!token?.refreshToken || !clientConfigured), + expiresAt: expiresAt || null, + scopes: token?.scopes || [], + }; +} + +export function clearGoogleWorkspaceCredentials(): string { + return saveNamedRuntimeSecrets({ + [GOOGLE_WORKSPACE_TOKEN_KEY]: null, + [GOOGLE_WORKSPACE_PENDING_AUTH_KEY]: null, + }); +} diff --git a/src/cli.ts b/src/cli.ts index 3929069f..b128bbf9 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -1511,6 +1511,11 @@ export async function main( await runTerminalBenchNativeCli(subargs); break; } + case '__eval-locomo-native': { + const { runLocomoNativeCli } = await import('./evals/locomo-native.js'); + await runLocomoNativeCli(subargs); + break; + } case 'tui': await launchTui(subargs); break; diff --git a/src/cli/auth-command.ts b/src/cli/auth-command.ts index edbc5de2..11ce0c0e 100644 --- a/src/cli/auth-command.ts +++ b/src/cli/auth-command.ts @@ -26,6 +26,7 @@ import { isHelpRequest, printAuthUsage, printCodexUsage, + printGoogleWorkspaceUsage, printHuggingFaceUsage, printHybridAIUsage, printLocalUsage, @@ -39,6 +40,7 @@ import { ensureWhatsAppAuthApi, getWhatsAppAuthApi } from './whatsapp-api.js'; type HybridAIAuthApi = typeof import('../auth/hybridai-auth.js'); type CodexAuthApi = typeof import('../auth/codex-auth.js'); +type GoogleWorkspaceAuthApi = typeof import('../auth/google-workspace-auth.js'); const hybridAIAuthApiState = makeLazyApi( () => import('../auth/hybridai-auth.js'), @@ -48,6 +50,10 @@ const codexAuthApiState = makeLazyApi( () => import('../auth/codex-auth.js'), 'Codex auth API accessed before it was initialized. Call ensureCodexAuthApi() first.', ); +const googleWorkspaceAuthApiState = makeLazyApi( + () => import('../auth/google-workspace-auth.js'), + 'Google Workspace auth API accessed before it was initialized. 
Call ensureGoogleWorkspaceAuthApi() first.', +); const CONFIGURED_SECRET_STATUS = 'configured'; async function ensureHybridAIAuthApi(): Promise { @@ -66,6 +72,14 @@ function getCodexAuthApi(): CodexAuthApi { return codexAuthApiState.get(); } +async function ensureGoogleWorkspaceAuthApi(): Promise { + return googleWorkspaceAuthApiState.ensure(); +} + +function getGoogleWorkspaceAuthApi(): GoogleWorkspaceAuthApi { + return googleWorkspaceAuthApiState.get(); +} + function parseExclusiveLoginMethodFlag( args: string[], params: { @@ -123,6 +137,12 @@ interface ParsedHybridAILoginArgs { baseUrl?: string; } +interface ParsedGoogleWorkspaceLoginArgs { + clientSecretPath: string | null; + authCode: string | null; + printAuthUrl: boolean; +} + function extractBaseUrlArg(args: string[]): { baseUrl?: string; remaining: string[]; @@ -169,6 +189,64 @@ function parseHybridAILoginArgs(args: string[]): ParsedHybridAILoginArgs { }; } +function parseGoogleWorkspaceLoginArgs( + args: string[], +): ParsedGoogleWorkspaceLoginArgs { + let clientSecretPath: string | null = null; + let authCode: string | null = null; + let printAuthUrl = false; + + for (let index = 0; index < args.length; index += 1) { + const arg = args[index] || ''; + const clientSecretFlag = parseValueFlag({ + arg, + args, + index, + name: '--client-secret', + placeholder: '', + allowEmptyEquals: true, + }); + if (clientSecretFlag) { + clientSecretPath = clientSecretFlag.value || null; + index = clientSecretFlag.nextIndex; + continue; + } + const authCodeFlag = parseValueFlag({ + arg, + args, + index, + name: '--auth-code', + placeholder: '', + allowEmptyEquals: true, + }); + if (authCodeFlag) { + authCode = authCodeFlag.value || null; + index = authCodeFlag.nextIndex; + continue; + } + if (arg === '--auth-url') { + printAuthUrl = true; + continue; + } + if (arg.startsWith('-')) { + throw new Error(`Unknown flag: ${arg}`); + } + throw new Error( + `Unexpected argument: ${arg}. 
Use \`hybridclaw auth login google-workspace [--client-secret ] [--auth-url|--auth-code ]\`.`, + ); + } + + if (printAuthUrl && authCode) { + throw new Error('Use only one of `--auth-url` or `--auth-code `.'); + } + + return { + clientSecretPath, + authCode, + printAuthUrl, + }; +} + interface ParsedOpenRouterLoginArgs { modelId?: string; baseUrl?: string; @@ -492,6 +570,7 @@ async function configureHuggingFace(args: string[]): Promise { type UnifiedProvider = | 'hybridai' | 'codex' + | 'google-workspace' | 'openrouter' | 'mistral' | 'huggingface' @@ -515,6 +594,15 @@ function normalizeUnifiedProvider( if (normalized === 'codex' || normalized === 'openai-codex') { return 'codex'; } + if ( + normalized === 'google-workspace' || + normalized === 'googleworkspace' || + normalized === 'google_workspace' || + normalized === 'gworkspace' || + normalized === 'gws' + ) { + return 'google-workspace'; + } if (normalized === 'openrouter' || normalized === 'or') { return 'openrouter'; } @@ -562,7 +650,7 @@ function parseUnifiedProviderArgs(args: string[]): { const provider = normalizeUnifiedProvider(rawProvider); if (!provider) { throw new Error( - `Unknown provider "${rawProvider}". Use \`hybridai\`, \`codex\`, \`openrouter\`, \`mistral\`, \`huggingface\`, \`local\`, or \`msteams\`.`, + `Unknown provider "${rawProvider}". Use \`hybridai\`, \`codex\`, \`google-workspace\`, \`openrouter\`, \`mistral\`, \`huggingface\`, \`local\`, or \`msteams\`.`, ); } return { @@ -576,7 +664,7 @@ function parseUnifiedProviderArgs(args: string[]): { const provider = normalizeUnifiedProvider(rawProvider); if (!provider) { throw new Error( - `Unknown provider "${rawProvider}". Use \`hybridai\`, \`codex\`, \`openrouter\`, \`mistral\`, \`huggingface\`, \`local\`, or \`msteams\`.`, + `Unknown provider "${rawProvider}". 
Use \`hybridai\`, \`codex\`, \`google-workspace\`, \`openrouter\`, \`mistral\`, \`huggingface\`, \`local\`, or \`msteams\`.`, ); } return { @@ -627,6 +715,33 @@ function printOpenRouterStatus(): void { console.log('Catalog: auto-discovered'); } +function printGoogleWorkspaceStatus(): void { + const status = getGoogleWorkspaceAuthApi().getGoogleWorkspaceAuthStatus(); + + console.log(`Path: ${status.path}`); + console.log(`Authenticated: ${status.authenticated ? 'yes' : 'no'}`); + console.log( + `Client secret: ${status.clientConfigured ? CONFIGURED_SECRET_STATUS : 'not set'}`, + ); + console.log( + `Pending auth session: ${status.pendingAuthorization ? 'yes' : 'no'}`, + ); + console.log( + `Refresh token: ${status.refreshTokenConfigured ? CONFIGURED_SECRET_STATUS : 'not set'}`, + ); + console.log(`Relogin required: ${status.reloginRequired ? 'yes' : 'no'}`); + if (status.authenticated) { + console.log('Source: runtime-secrets'); + console.log(`Access token: ${CONFIGURED_SECRET_STATUS}`); + if (status.expiresAt) { + console.log(`Expires: ${new Date(status.expiresAt).toISOString()}`); + } + if (status.scopes.length > 0) { + console.log(`Granted scopes: ${status.scopes.length}`); + } + } +} + function printMistralStatus(): void { ensureRuntimeConfigFile(); const config = getRuntimeConfig(); @@ -716,6 +831,15 @@ function clearHuggingFaceCredentials(): void { ); } +function clearGoogleWorkspaceCredentials(): void { + const filePath = + getGoogleWorkspaceAuthApi().clearGoogleWorkspaceCredentials(); + console.log(`Cleared Google Workspace OAuth token in ${filePath}.`); + console.log( + 'Stored Google Workspace client secret was kept. 
Re-run `hybridclaw auth login google-workspace` to authorize again.', + ); +} + function normalizeHybridAIBaseUrl(rawBaseUrl: string): string { const trimmed = rawBaseUrl.trim().replace(/\/+$/g, ''); if (!trimmed) return 'https://hybridai.one'; @@ -860,6 +984,10 @@ function printUnifiedProviderUsage(provider: UnifiedProvider): void { printCodexUsage(); return; } + if (provider === 'google-workspace') { + printGoogleWorkspaceUsage(); + return; + } if (provider === 'openrouter') { printOpenRouterUsage(); return; @@ -1116,7 +1244,7 @@ async function handleAuthLoginCommand(normalizedArgs: string[]): Promise { const parsed = parseUnifiedProviderArgs(normalizedArgs); if (!parsed.provider) { throw new Error( - `Unknown auth login provider "${normalizedArgs[0]}". Use \`hybridai\`, \`codex\`, \`openrouter\`, \`mistral\`, \`huggingface\`, \`local\`, or \`msteams\`.`, + `Unknown auth login provider "${normalizedArgs[0]}". Use \`hybridai\`, \`codex\`, \`google-workspace\`, \`openrouter\`, \`mistral\`, \`huggingface\`, \`local\`, or \`msteams\`.`, ); } if (isHelpRequest(parsed.remaining)) { @@ -1132,6 +1260,10 @@ async function handleAuthLoginCommand(normalizedArgs: string[]): Promise { await handleCodexCommand(['login', ...parsed.remaining]); return; } + if (parsed.provider === 'google-workspace') { + await configureGoogleWorkspaceAuth(parsed.remaining); + return; + } if (parsed.provider === 'openrouter') { await configureOpenRouter(parsed.remaining); return; @@ -1247,6 +1379,15 @@ async function dispatchProviderAction( await handleCodexCommand([action]); return; } + if (provider === 'google-workspace') { + await ensureGoogleWorkspaceAuthApi(); + if (action === 'status') { + printGoogleWorkspaceStatus(); + return; + } + clearGoogleWorkspaceCredentials(); + return; + } if (provider === 'openrouter') { if (action === 'status') { printOpenRouterStatus(); @@ -1299,7 +1440,7 @@ async function handleProviderActionCommand( const parsed = parseUnifiedProviderArgs(normalizedArgs); if 
(!parsed.provider) { throw new Error( - `Unknown ${action} provider "${normalizedArgs[0]}". Use \`hybridai\`, \`codex\`, \`openrouter\`, \`mistral\`, \`huggingface\`, \`local\`, or \`msteams\`.`, + `Unknown ${action} provider "${normalizedArgs[0]}". Use \`hybridai\`, \`codex\`, \`google-workspace\`, \`openrouter\`, \`mistral\`, \`huggingface\`, \`local\`, or \`msteams\`.`, ); } if (parsed.remaining.length > 0) { @@ -1543,6 +1684,111 @@ async function configureMSTeamsAuth(args: string[]): Promise { ); } +async function configureGoogleWorkspaceAuth(args: string[]): Promise { + await ensureGoogleWorkspaceAuthApi(); + const parsed = parseGoogleWorkspaceLoginArgs(args); + let clientConfigured = + getGoogleWorkspaceAuthApi().getGoogleWorkspaceAuthStatus().clientConfigured; + + const saveClientSecretFromPath = (secretPath: string): void => { + const result = + getGoogleWorkspaceAuthApi().saveGoogleWorkspaceClientSecretFile( + secretPath, + ); + clientConfigured = true; + console.log(`Saved Google Workspace client secret to ${result.path}.`); + console.log(`Client ID: ${result.clientId}`); + }; + + if (parsed.clientSecretPath) { + saveClientSecretFromPath(parsed.clientSecretPath); + } + + if (!clientConfigured) { + if (!process.stdin.isTTY || !process.stdout.isTTY) { + throw new Error( + 'Google Workspace client secret is not configured. 
Pass `--client-secret ` first, then use `--auth-url` and `--auth-code ` to finish login.', + ); + } + const rl = readline.createInterface({ + input: process.stdin, + output: process.stdout, + }); + try { + const secretPath = await promptWithDefault({ + rl, + question: 'Path to Google OAuth client secret JSON', + required: true, + }); + saveClientSecretFromPath(secretPath); + } finally { + rl.close(); + } + } + + if (parsed.authCode) { + const result = + await getGoogleWorkspaceAuthApi().exchangeGoogleWorkspaceAuthCode( + parsed.authCode, + ); + console.log(`Saved Google Workspace OAuth token to ${result.path}.`); + console.log(`Expires: ${new Date(result.expiresAt).toISOString()}`); + console.log(`Granted scopes: ${result.scopes.length}`); + return; + } + + if (parsed.printAuthUrl) { + const result = getGoogleWorkspaceAuthApi().startGoogleWorkspaceAuth(); + console.log('Google Workspace OAuth prepared.'); + console.log(`Path: ${result.path}`); + console.log(`Auth URL: ${result.authUrl}`); + console.log(`Redirect URI: ${result.redirectUri}`); + console.log('Next:'); + console.log( + ' hybridclaw auth login google-workspace --auth-code ""', + ); + return; + } + + if (!process.stdin.isTTY || !process.stdout.isTTY) { + console.log('Next:'); + console.log(' hybridclaw auth login google-workspace --auth-url'); + console.log( + ' hybridclaw auth login google-workspace --auth-code ""', + ); + return; + } + + const startResult = getGoogleWorkspaceAuthApi().startGoogleWorkspaceAuth(); + console.log('Google Workspace OAuth'); + console.log(`Auth URL: ${startResult.authUrl}`); + console.log(`Redirect URI: ${startResult.redirectUri}`); + console.log( + 'Open the URL in your browser, authorize HybridClaw, then paste the final redirect URL or authorization code here.', + ); + + const rl = readline.createInterface({ + input: process.stdin, + output: process.stdout, + }); + try { + const authCode = await promptWithDefault({ + rl, + question: 'Authorization code or redirect URL', + 
required: true, + }); + const result = + await getGoogleWorkspaceAuthApi().exchangeGoogleWorkspaceAuthCode( + authCode, + ); + console.log(`Saved Google Workspace OAuth token to ${result.path}.`); + console.log(`Expires: ${new Date(result.expiresAt).toISOString()}`); + console.log(`Granted scopes: ${result.scopes.length}`); + } finally { + rl.close(); + } +} + export async function handleHybridAICommand(args: string[]): Promise { const normalized = normalizeArgs(args); if (normalized.length === 0 || isHelpRequest(normalized)) { diff --git a/src/cli/help.ts b/src/cli/help.ts index f8bf9056..17edb29e 100644 --- a/src/cli/help.ts +++ b/src/cli/help.ts @@ -47,6 +47,7 @@ Commands: export function printEvalUsage(): void { console.log(`Usage: hybridclaw eval [list|env|] [--current-agent|--fresh-agent] [--ablate-system] [--include-prompt=] [--omit-prompt=] + hybridclaw eval locomo [setup|run|status|stop|results|logs] hybridclaw eval terminal-bench-2.0 [setup|run|status|stop|results|logs] hybridclaw eval tau2 [setup|run|status|stop|results] hybridclaw eval [--current-agent|--fresh-agent] [--ablate-system] [--include-prompt=] [--omit-prompt=] @@ -57,6 +58,10 @@ Examples: hybridclaw eval list hybridclaw eval env hybridclaw eval env --fresh-agent + hybridclaw eval locomo + hybridclaw eval locomo setup + hybridclaw eval locomo run --budget 4000 --max-questions 20 + hybridclaw eval locomo run --mode retrieval --budget 4000 --max-questions 20 hybridclaw eval tau2 hybridclaw eval tau2 setup hybridclaw eval terminal-bench-2.0 setup @@ -72,14 +77,19 @@ Examples: Notes: - This is a local-only command. It is not intended for remote chat channels. - Detached benchmark commands are launched directly with \`hybridclaw eval \`. - - Only \`terminal-bench-2.0\` and \`tau2\` have active HybridClaw implementations today. + - Only \`locomo\`, \`terminal-bench-2.0\`, and \`tau2\` have active HybridClaw implementations today. 
- \`swebench-verified\`, \`agentbench\`, and \`gaia\` are stub entries that return \`not implemented yet\`. + - \`locomo\` downloads the official \`locomo10.json\` dataset during \`setup\`. + - \`locomo --mode qa\` sends evaluate_gpts-style QA prompts through HybridClaw's local OpenAI-compatible gateway and scores the generated answers. + - \`locomo --mode retrieval\` skips model generation, ingests each conversation into an isolated native memory session, and scores evidence hit-rate from recalled semantic memories. + - \`locomo --num-samples\` limits conversation records; use \`--max-questions\` for fast smoke runs over a small QA slice. + - By default, \`locomo --mode qa\` creates one fresh template-seeded agent workspace per conversation sample. Use \`--current-agent\` to reuse the current agent workspace. - \`terminal-bench-2.0 run --num-tasks 10\` runs the native HybridClaw Terminal-Bench harness against local task containers. - \`tau2\` has managed subcommands: \`setup\`, \`run\`, \`status\`, \`stop\`, and \`results\`. - \`tau2 setup\` prefers a uv-managed Python 3.12 virtual environment when \`uv\` is available, then smoke-tests the installed \`tau2\` CLI. - For \`tau2 run\`, omitted \`--agent-llm\` and \`--user-llm\` flags default to \`$HYBRIDCLAW_EVAL_MODEL\`. - TUI and web sessions receive proactive ASCII progress bars for supported evals like \`tau2 run --num-tasks ...\`. - - The default eval mode uses the current agent workspace but a fresh transient OpenAI-compatible session per request. + - Outside suite-specific overrides, the default eval mode uses the current agent workspace but a fresh transient OpenAI-compatible session per request. - \`--fresh-agent\` uses a temporary template-seeded agent workspace for each eval request. - \`--ablate-system\` removes HybridClaw's injected system prompt for the eval request. 
- Prompt parts include hook names like \`memory\`, \`runtime\`, \`safety\`, \`bootstrap\` and bootstrap subparts like \`soul\`, \`identity\`, \`user\`, \`tools\`, \`memory-file\`, and \`skills\`. @@ -173,9 +183,9 @@ export function printAuthUsage(): void { Commands: hybridclaw auth login - hybridclaw auth login ... - hybridclaw auth status - hybridclaw auth logout + hybridclaw auth login ... + hybridclaw auth status + hybridclaw auth logout hybridclaw auth whatsapp reset Examples: @@ -183,6 +193,9 @@ Examples: hybridclaw auth login hybridai --browser hybridclaw auth login hybridai --base-url http://localhost:5000 hybridclaw auth login codex --import + hybridclaw auth login google-workspace --client-secret /path/to/client_secret.json + hybridclaw auth login google-workspace --auth-url + hybridclaw auth login google-workspace --auth-code "http://localhost:1/?code=..." hybridclaw auth login openrouter anthropic/claude-sonnet-4 --api-key sk-or-... hybridclaw auth login mistral mistral-large-latest --api-key mistral_... hybridclaw auth login huggingface meta-llama/Llama-3.1-8B-Instruct --api-key hf_... @@ -194,8 +207,10 @@ Examples: hybridclaw auth status openrouter hybridclaw auth status mistral hybridclaw auth status huggingface + hybridclaw auth status google-workspace hybridclaw auth status msteams hybridclaw auth logout codex + hybridclaw auth logout google-workspace hybridclaw auth logout mistral hybridclaw auth logout huggingface hybridclaw auth logout msteams @@ -204,6 +219,7 @@ Notes: - \`auth login\` without a provider runs the normal interactive onboarding flow. - \`local logout\` disables configured local backends and clears any saved vLLM API key. - \`auth login msteams\` enables Microsoft Teams and stores \`MSTEAMS_APP_PASSWORD\` in ${runtimeSecretsPath()}. + - \`auth login google-workspace\` stores the OAuth client secret, pending PKCE state, and refreshable token in ${runtimeSecretsPath()}. 
- \`auth whatsapp reset\` clears linked WhatsApp Web auth so you can re-pair cleanly. - \`auth login openrouter\` prompts for the API key when \`--api-key\` and \`OPENROUTER_API_KEY\` are both absent. - \`auth login mistral\` prompts for the API key when \`--api-key\` and \`MISTRAL_API_KEY\` are both absent. @@ -341,6 +357,21 @@ Notes: - If \`--app-id\` or \`--app-password\` is missing and the terminal is interactive, HybridClaw prompts for them and also offers an optional tenant id prompt.`); } +export function printGoogleWorkspaceUsage(): void { + console.log(`Usage: + hybridclaw auth login google-workspace [--client-secret <path>] [--auth-url|--auth-code <code-or-url>] + hybridclaw auth status google-workspace + hybridclaw auth logout google-workspace + +Notes: + - \`--client-secret <path>\` stores a Google OAuth desktop client JSON in ${runtimeSecretsPath()}. + - \`--auth-url\` creates a pending PKCE session and prints the Google consent URL. + - \`--auth-code <code-or-url>\` exchanges a pasted authorization code or final redirect URL for a refreshable token. + - The current built-in scope bundle covers Gmail, Calendar, Drive, Docs, Sheets, and Contacts. + - With no flags in an interactive terminal, HybridClaw prompts for the client secret path if needed, prints the auth URL, and then prompts for the pasted redirect URL or code.
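Since `--auth-code` accepts either a bare authorization code or the full pasted redirect URL, the normalization step could look roughly like this (hypothetical helper name; the real parser may differ):

```typescript
// Hypothetical sketch: accept either a bare authorization code or the full
// localhost redirect URL pasted from the browser, and extract the `code`
// query parameter in the URL case.
function extractAuthCode(input: string): string {
  const trimmed = input.trim();
  if (/^https?:\/\//i.test(trimmed)) {
    const code = new URL(trimmed).searchParams.get('code');
    if (!code) throw new Error('Redirect URL has no `code` query parameter.');
    return code;
  }
  return trimmed;
}
```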
+ - \`auth logout google-workspace\` clears the stored OAuth token and pending session but keeps the saved client secret for future re-login.`); +} + export function printCodexUsage(): void { console.log(`Usage: hybridclaw codex (deprecated) @@ -616,6 +647,7 @@ Topics: config Help for local runtime config commands plugin Help for plugin management msteams Help for Microsoft Teams auth/setup commands + google-workspace Help for Google Workspace OAuth setup/status/logout commands openrouter Help for OpenRouter setup/status/logout commands mistral Help for Mistral setup/status/logout commands huggingface Help for Hugging Face setup/status/logout commands @@ -699,6 +731,11 @@ export async function printHelpTopic(topic: string): Promise { case 'teams': printMSTeamsUsage(); return true; + case 'google-workspace': + case 'googleworkspace': + case 'gws': + printGoogleWorkspaceUsage(); + return true; case 'local': printLocalUsage(); return true; diff --git a/src/command-registry.ts b/src/command-registry.ts index 674d4d5a..7c3e9895 100644 --- a/src/command-registry.ts +++ b/src/command-registry.ts @@ -1707,6 +1707,32 @@ function buildSlashCommandCatalogDefinitions( insertText: '/eval swebench-verified', description: 'Stub entry for a planned SWE-bench Verified runner', }, + { + id: 'eval.locomo', + label: '/eval locomo', + insertText: '/eval locomo', + description: 'Show the native LOCOMO memory benchmark commands', + }, + { + id: 'eval.locomo.setup', + label: '/eval locomo setup', + insertText: '/eval locomo setup', + description: + 'Download the official LOCOMO dataset into the local eval workspace', + }, + { + id: 'eval.locomo.run', + label: '/eval locomo run --budget 4000 --num-samples 2', + insertText: '/eval locomo run --budget 4000 --num-samples 2', + description: + 'Run a small native LOCOMO memory benchmark sample with recent-tail and semantic-recall modes', + }, + { + id: 'eval.locomo.results', + label: '/eval locomo results', + insertText: '/eval locomo results', 
+ description: 'Show the latest LOCOMO summary and comparison metrics', + }, { id: 'eval.terminal-bench-2.0', label: '/eval terminal-bench-2.0', diff --git a/src/evals/eval-command.ts b/src/evals/eval-command.ts index a179d467..3ca94755 100644 --- a/src/evals/eval-command.ts +++ b/src/evals/eval-command.ts @@ -5,6 +5,8 @@ import { fileURLToPath } from 'node:url'; import { MAX_CONCURRENT_CONTAINERS } from '../config/config.js'; import { isContainerMaxConcurrentExplicit } from '../config/runtime-config.js'; import type { GatewayCommandResult } from '../gateway/gateway-types.js'; +import { resolveInstallRoot } from '../infra/install-root.js'; +import { logger } from '../logger.js'; import { enqueueProactiveMessage, isDatabaseInitialized, @@ -16,9 +18,19 @@ import { encodeEvalProfileModel, isKnownEvalPromptPart, } from './eval-profile.js'; +import type { + LocomoAgentMode, + LocomoCategoryAggregate as LocomoNativeCategoryAggregate, + LocomoTokenUsage as LocomoNativeTokenUsage, +} from './locomo-types.js'; +import { + LOCOMO_DATASET_FILENAME, + LOCOMO_SETUP_MARKER, +} from './locomo-types.js'; type EvalSuiteId = | 'swebench-verified' + | 'locomo' | 'terminal-bench-2.0' | 'agentbench' | 'gaia'; @@ -77,9 +89,17 @@ interface EvalRunPreparation { progress: EvalProgressSpec | null; } +interface ParsedEvalAction { + action: string; + profile: EvalProfile; + commandArgs: string[]; + workspaceModeExplicit: boolean; + error?: string; +} + interface EvalSetupCommand { command: string; - strategy: 'uv' | 'system-python'; + strategy: 'native' | 'uv' | 'system-python'; } interface EvalRunMeta { @@ -171,6 +191,44 @@ interface TerminalBenchNativeTokenUsage { apiUsageAvailable: boolean; } +interface LocomoNativeSummary { + jobDir: string; + resultPath: string; + predictionsPath: string | null; + mode: 'qa' | 'retrieval'; + dataset: string | null; + model: string | null; + budgetTokens: number | null; + sampleCount: number | null; + questionCount: number | null; + overallScore: number | 
null; + contextF1: number | null; + categories: Record<string, LocomoNativeCategoryAggregate>; + tokenUsage: LocomoNativeTokenUsage | null; +} + +interface LocomoNativeProgress { + jobDir: string; + progressPath: string; + resultPath: string; + predictionsPath: string | null; + mode: 'qa' | 'retrieval'; + dataset: string | null; + model: string | null; + budgetTokens: number | null; + sampleCount: number | null; + completedSampleCount: number | null; + questionCount: number | null; + completedQuestionCount: number | null; + overallScore: number | null; + contextF1: number | null; + currentSampleId: string | null; + currentSampleQuestionCount: number | null; + currentSampleQuestionTotal: number | null; + categories: Record<string, LocomoNativeCategoryAggregate>; + tokenUsage: LocomoNativeTokenUsage | null; +} + const MAX_QUEUED_EVAL_MESSAGES = 200; const EVAL_PROGRESS_BAR_WIDTH = 20; const EVAL_PROGRESS_POLL_INTERVAL_MS = 1000; @@ -207,6 +265,30 @@ const EVAL_SUITES: EvalSuiteDefinition[] = [ '`/eval ...` injects the OpenAI-compatible HybridClaw endpoint for any predictor step you run through this helper.', ], }, + { + id: 'locomo', + title: 'LOCOMO', + summary: + 'Native HybridClaw LoCoMo QA benchmark over the official long-conversation dataset.', + aliases: ['lo-co-mo', 'locomo-memory'], + prereqs: [ + 'Node.js 22', + 'network access during `setup` to download `locomo10.json`', + ], + starter: [ + '/eval locomo setup', + '/eval locomo run --budget 4000 --max-questions 20', + '/eval locomo run --mode retrieval --budget 4000 --max-questions 20', + ], + notes: [ + 'The default `qa` mode generates LoCoMo answers through HybridClaw’s local OpenAI-compatible gateway and scores the model outputs directly.', + '`--mode retrieval` skips model generation, ingests each conversation into an isolated native memory session, and scores evidence hit-rate from recalled semantic memories.', + 'The `qa` prompt shape follows the upstream `evaluate_gpts` flow: truncated conversation context plus a short-answer QA prompt for each LoCoMo question.', +
'`--num-samples` limits conversation records. Use `--max-questions` for quick smoke runs over a small number of LoCoMo questions.', + 'By default, LOCOMO creates one fresh template-seeded agent per conversation sample. Use `--current-agent` to reuse the current agent workspace.', + 'Prompt/profile eval flags flow through `HYBRIDCLAW_EVAL_MODEL`, so agent/workspace mode and prompt ablations affect the benchmarked run.', + ], + }, { id: 'terminal-bench-2.0', title: 'Terminal-Bench 2.0', @@ -330,7 +412,7 @@ function renderSuiteList(): string[] { } function isImplementedManagedSuite(suite: EvalSuiteDefinition): boolean { - return suite.id === 'terminal-bench-2.0'; + return suite.id === 'terminal-bench-2.0' || suite.id === 'locomo'; } function renderUnimplementedSuite( @@ -351,6 +433,7 @@ function renderUnimplementedSuite( ...describeEvalProfile(env.profile).map((entry) => `- ${entry}`), '', 'Implemented suites today:', + '- `/eval locomo ...`', '- `/eval terminal-bench-2.0 ...`', '- `/eval tau2 ...`', ].join('\n'); @@ -363,6 +446,7 @@ function renderUsage(env: EvalEnvironment): string { 'Usage:', '- `/eval list`', '- `/eval env [--current-agent|--fresh-agent] [--ablate-system] [--include-prompt=] [--omit-prompt=]`', + '- `/eval locomo [setup|run|status|stop|results|logs]`', '- `/eval terminal-bench-2.0 [setup|run|status|stop|results|logs]`', '- `/eval tau2 [setup|run|status|stop|results]`', '- `/eval [--current-agent|--fresh-agent] [--ablate-system] [--include-prompt=] [--omit-prompt=]`', @@ -379,7 +463,7 @@ function renderUsage(env: EvalEnvironment): string { 'Suites:', ...renderSuiteList(), '', - 'Only `terminal-bench-2.0` and `tau2` are implemented today.', + 'Only `locomo`, `terminal-bench-2.0`, and `tau2` are implemented today.', ].join('\n'); } @@ -523,7 +607,7 @@ function renderRecipe( '', 'Managed commands:', `- \`/eval ${suite.id} setup\``, - `- \`/eval ${suite.id} run --num-tasks 10\``, + `- \`${getManagedSuiteRunExample(suite)}\``, `- \`/eval ${suite.id} 
status\``, `- \`/eval ${suite.id} stop\``, `- \`/eval ${suite.id} results\``, @@ -533,6 +617,17 @@ function renderRecipe( ].join('\n'); } +function getManagedSuiteRunExample(suite: EvalSuiteDefinition): string { + switch (suite.id) { + case 'locomo': + return '/eval locomo run --budget 4000 --max-questions 20'; + case 'terminal-bench-2.0': + return '/eval terminal-bench-2.0 run --num-tasks 10'; + default: + return `/eval ${suite.id} run`; + } +} + function renderTau2Usage(env: EvalEnvironment, dataDir: string): string { const installDir = getTau2InstallDir(dataDir); return [ @@ -602,12 +697,22 @@ function getTau2InstallDir(dataDir: string): string { function getManagedSuiteSetup( suite: EvalSuiteDefinition, ): ManagedEvalSuiteSetup | null { - if (suite.id !== 'terminal-bench-2.0') return null; - return { - installDirName: 'terminal-bench-2.0', - strategyDescription: - 'uv-managed Python 3.12 venv with Hugging Face datasets install and native Terminal-Bench helper smoke test', - }; + switch (suite.id) { + case 'locomo': + return { + installDirName: 'locomo', + strategyDescription: + 'native HybridClaw LOCOMO harness with official locomo10 dataset download', + }; + case 'terminal-bench-2.0': + return { + installDirName: 'terminal-bench-2.0', + strategyDescription: + 'uv-managed Python 3.12 venv with Hugging Face datasets install and native Terminal-Bench helper smoke test', + }; + default: + return null; + } } function getManagedSuiteInstallDir( @@ -627,7 +732,16 @@ function getManagedSuiteMarkerPath( ): string { return path.join( getManagedSuiteInstallDir(suite, dataDir), - '.hybridclaw-setup-ok', + LOCOMO_SETUP_MARKER, + ); +} + +function getLocomoDatasetPath(dataDir: string): string { + return path.join( + getEvalBaseDir(dataDir), + 'locomo', + 'data', + LOCOMO_DATASET_FILENAME, ); } @@ -654,6 +768,12 @@ function isManagedSuiteInstalled( suite: EvalSuiteDefinition, dataDir: string, ): boolean { + if (suite.id === 'locomo') { + return ( + 
fs.existsSync(getManagedSuiteMarkerPath(suite, dataDir)) && + fs.existsSync(getLocomoDatasetPath(dataDir)) + ); + } return ( fs.existsSync(getManagedSuiteMarkerPath(suite, dataDir)) && fs.existsSync(getManagedSuitePythonPath(suite, dataDir)) @@ -684,8 +804,19 @@ function getManagedSuiteSetupCommand( } const installDir = getManagedSuiteInstallDir(suite, dataDir); + if (suite.id === 'locomo') { + return { + strategy: 'native', + command: buildInternalEvalCommand('__eval-locomo-native', [ + 'setup', + '--install-dir', + installDir, + ]), + }; + } + const installDirQuoted = quoteShellArg(installDir); - const markerFile = '.hybridclaw-setup-ok'; + const markerFile = LOCOMO_SETUP_MARKER; const venvPython = process.platform === 'win32' ? '.venv\\Scripts\\python.exe' @@ -761,6 +892,9 @@ function getManagedSuiteNextStep( _dataDir: string, ): string { switch (suite.id) { + case 'locomo': { + return '/eval locomo run --budget 4000 --max-questions 20'; + } case 'terminal-bench-2.0': { return `/eval terminal-bench-2.0 run --num-tasks 10`; } @@ -769,6 +903,45 @@ function getManagedSuiteNextStep( } } +function buildInternalEvalCommand(commandName: string, args: string[]): string { + const commandArgs = + resolveInternalEvalLauncherCommandArgs().map(quoteShellArg); + + return buildCommandString([ + ...commandArgs, + commandName, + ...args.map((entry) => + entry.startsWith('--') ? 
entry : quoteShellArg(entry), + ), + ]); +} + +function resolveInternalEvalLauncherCommandArgs(): string[] { + const installRoot = resolveInstallRoot(); + const sourceCliPath = path.join(installRoot, 'src', 'cli.ts'); + const tsxCliPath = path.join( + installRoot, + 'node_modules', + 'tsx', + 'dist', + 'cli.mjs', + ); + const distCliPath = path.join(installRoot, 'dist', 'cli.js'); + + if (fs.existsSync(sourceCliPath) && fs.existsSync(tsxCliPath)) { + return [process.execPath, tsxCliPath, sourceCliPath]; + } + if (fs.existsSync(distCliPath)) { + return [process.execPath, distCliPath]; + } + + const cliEntry = process.argv[1]?.trim(); + if (!cliEntry) { + throw new Error('Unable to resolve the HybridClaw CLI entry point.'); + } + return [process.execPath, path.resolve(cliEntry)]; +} + function getTerminalBenchDatasetHelperPath(dataDir: string): string { return path.join( getEvalBaseDir(dataDir), @@ -830,7 +1003,24 @@ function prepareManagedSuiteRun( env: EvalEnvironment, effectiveAgentId: string, args: string[], + workspaceModeExplicit: boolean, ): ManagedSuiteRunPreparation | null { + if (suite.id === 'locomo') { + const agentMode = resolveLocomoAgentMode(workspaceModeExplicit); + return { + commandArgs: ['locomo', 'run', ...args], + command: buildInternalEvalCommand('__eval-locomo-native', [ + 'run', + '--install-dir', + getManagedSuiteInstallDir(suite, dataDir), + '--agent-mode', + agentMode, + ...args, + ]), + displayCommand: buildCommandString([suite.id, 'run', ...args]), + cwd: dataDir, + }; + } if (suite.id !== 'terminal-bench-2.0') return null; ensureTerminalBenchDatasetHelper(dataDir); @@ -862,12 +1052,9 @@ function prepareManagedSuiteRun( translatedArgs.push('--n-concurrent', String(defaultConcurrency)); } - const cliEntry = process.argv[1]?.trim(); - if (!cliEntry) return null; const promptMode = 'none'; const internalArgs = [ - quoteShellArg(process.execPath), - quoteShellArg(path.resolve(cliEntry)), + 
...resolveInternalEvalLauncherCommandArgs().map(quoteShellArg), '__eval-terminal-bench-native', '--install-dir', quoteShellArg(getManagedSuiteInstallDir(suite, dataDir)), @@ -902,6 +1089,12 @@ function prepareManagedSuiteRun( }; } +function resolveLocomoAgentMode( + workspaceModeExplicit: boolean, +): LocomoAgentMode { + return workspaceModeExplicit ? 'current-agent' : 'conversation-fresh'; +} + function quoteShellArg(value: string): string { if (process.platform === 'win32') { return `"${value.replace(/"/g, '""')}"`; } @@ -1180,10 +1373,83 @@ function readTerminalBenchJobDir(meta: EvalRunMeta): string | null { return jobDir ? jobDir : null; } -function readFiniteTokenNumber(value: unknown, fallback = 0): number { +function readFiniteNumber( + value: unknown, + fallback: number | null = null, +): number | null { return typeof value === 'number' && Number.isFinite(value) ? value : fallback; } +function readFiniteTokenNumber(value: unknown, fallback = 0): number { + return readFiniteNumber(value, fallback) ?? fallback; +} + +function readStringValue(value: unknown): string | null { + return typeof value === 'string' ? value : null; +} + +function readLocomoCategoryAggregates( + value: unknown, +): Record<string, LocomoNativeCategoryAggregate> { + if (!value || typeof value !== 'object') { + return {}; + } + return Object.fromEntries( + Object.entries(value).map(([category, aggregate]) => { + const parsedAggregate = + aggregate && typeof aggregate === 'object' ? aggregate : {}; + return [ + category, + { + meanScore: + readFiniteNumber( + (parsedAggregate as { meanScore?: unknown }).meanScore, + 0, + ) ?? 0, + questionCount: + readFiniteNumber( + (parsedAggregate as { questionCount?: unknown }).questionCount, + 0, + ) ??
0, + contextF1: readFiniteNumber( + (parsedAggregate as { contextF1?: unknown }).contextF1, + ), + }, + ]; + }), + ) as Record; +} + +function readLocomoTokenUsage(value: unknown): LocomoNativeTokenUsage | null { + if (!value || typeof value !== 'object') { + return null; + } + const tokenUsage = value as { + promptTokens?: unknown; + completionTokens?: unknown; + totalTokens?: unknown; + responsesWithUsage?: unknown; + }; + const promptTokens = readFiniteNumber(tokenUsage.promptTokens); + const completionTokens = readFiniteNumber(tokenUsage.completionTokens); + const totalTokens = readFiniteNumber(tokenUsage.totalTokens); + const responsesWithUsage = readFiniteNumber(tokenUsage.responsesWithUsage); + if ( + promptTokens == null || + completionTokens == null || + totalTokens == null || + responsesWithUsage == null + ) { + return null; + } + return { + promptTokens, + completionTokens, + totalTokens, + responsesWithUsage, + }; +} + function readTerminalBenchJobTokenUsage( jobDir: string, taskNames?: readonly string[], @@ -1345,6 +1611,128 @@ function readTerminalBenchNativeProgress( }; } +function readLocomoJobDir(meta: EvalRunMeta): string | null { + if (meta.suiteId !== 'locomo' || meta.operation !== 'run') { + return null; + } + const stdoutText = readLogFileText(meta.stdoutPath); + const match = stdoutText.match(/^Job dir:\s*(.+)$/m); + const jobDir = String(match?.[1] || '').trim(); + return jobDir || null; +} + +function readLocomoNativeSummary( + meta: EvalRunMeta, +): LocomoNativeSummary | null { + const jobDir = readLocomoJobDir(meta); + if (!jobDir) return null; + const resultPath = path.join(jobDir, 'result.json'); + if (!fs.existsSync(resultPath)) return null; + try { + const parsed = JSON.parse(fs.readFileSync(resultPath, 'utf-8')) as Record< + string, + unknown + >; + return { + jobDir, + resultPath, + predictionsPath: readStringValue(parsed.predictionsPath), + mode: parsed.mode === 'retrieval' ? 
'retrieval' : 'qa', + dataset: readStringValue(parsed.dataset), + model: readStringValue(parsed.model), + budgetTokens: readFiniteNumber(parsed.budgetTokens), + sampleCount: readFiniteNumber(parsed.sampleCount), + questionCount: readFiniteNumber(parsed.questionCount), + overallScore: readFiniteNumber(parsed.overallScore), + contextF1: readFiniteNumber(parsed.contextF1), + categories: readLocomoCategoryAggregates(parsed.categories), + tokenUsage: readLocomoTokenUsage(parsed.tokenUsage), + }; + } catch (error) { + logger.debug( + { + err: error, + runId: meta.runId, + resultPath, + }, + 'Failed to parse LOCOMO result summary', + ); + return null; + } +} + +function readLocomoNativeProgress( + meta: EvalRunMeta, +): LocomoNativeProgress | null { + const jobDir = readLocomoJobDir(meta); + if (!jobDir) return null; + const progressPath = path.join(jobDir, 'progress.json'); + if (!fs.existsSync(progressPath)) return null; + try { + const parsed = JSON.parse(fs.readFileSync(progressPath, 'utf-8')) as Record< + string, + unknown + >; + return { + jobDir, + progressPath, + resultPath: + readStringValue(parsed.resultPath) || path.join(jobDir, 'result.json'), + predictionsPath: readStringValue(parsed.predictionsPath), + mode: parsed.mode === 'retrieval' ? 
'retrieval' : 'qa', + dataset: readStringValue(parsed.dataset), + model: readStringValue(parsed.model), + budgetTokens: readFiniteNumber(parsed.budgetTokens), + sampleCount: readFiniteNumber(parsed.sampleCount), + completedSampleCount: readFiniteNumber(parsed.completedSampleCount), + questionCount: readFiniteNumber(parsed.questionCount), + completedQuestionCount: readFiniteNumber(parsed.completedQuestionCount), + overallScore: readFiniteNumber(parsed.overallScore), + contextF1: readFiniteNumber(parsed.contextF1), + currentSampleId: readStringValue(parsed.currentSampleId), + currentSampleQuestionCount: readFiniteNumber( + parsed.currentSampleQuestionCount, + ), + currentSampleQuestionTotal: readFiniteNumber( + parsed.currentSampleQuestionTotal, + ), + categories: readLocomoCategoryAggregates(parsed.categories), + tokenUsage: readLocomoTokenUsage(parsed.tokenUsage), + }; + } catch (error) { + logger.debug( + { + err: error, + runId: meta.runId, + progressPath, + }, + 'Failed to parse LOCOMO progress summary', + ); + return null; + } +} + +function formatLocomoCategoryValue( + category: string, + value: LocomoNativeCategoryAggregate | undefined, + mode: 'qa' | 'retrieval', +): string | null { + if (!value) return null; + if (mode === 'retrieval') { + return `Category ${category} | Hit ${value.meanScore.toFixed(3)} | F1 ${( + value.contextF1 ?? 0 + ).toFixed(3)} | Q ${value.questionCount}`; + } + return `Category ${category} | Score ${value.meanScore.toFixed(3)} | Q ${value.questionCount}`; +} + +function formatLocomoTokenUsage( + value: LocomoNativeTokenUsage | null | undefined, +): string | null { + if (!value) return null; + return `${value.totalTokens} total (${value.promptTokens} prompt + ${value.completionTokens} completion)`; +} + function describeManagedSuiteRunLifecycle( meta: EvalRunMeta, summary?: TerminalBenchNativeSummary | null, @@ -2300,6 +2688,14 @@ function renderManagedSuiteStatus( suite.id === 'terminal-bench-2.0' && latestRun ? 
readTerminalBenchNativeSummary(latestRun) : null; + const latestLocomoSummary = + suite.id === 'locomo' && latestRun + ? readLocomoNativeSummary(latestRun) + : null; + const latestLocomoProgress = + suite.id === 'locomo' && latestRun + ? readLocomoNativeProgress(latestRun) + : null; const setupFailure = !installed && latestSetup ? describeRunFailureReason(latestSetup) : null; @@ -2307,6 +2703,11 @@ function renderManagedSuiteStatus( `Install dir: ${installDir}`, `Installed: ${installed ? 'yes' : 'no'}`, `Marker: ${fs.existsSync(markerPath) ? markerPath : 'missing'}`, + ...(suite.id === 'locomo' + ? [ + `Dataset: ${fs.existsSync(getLocomoDatasetPath(dataDir)) ? getLocomoDatasetPath(dataDir) : 'missing'}`, + ] + : []), ...(executablePath ? [ `Executable: ${fs.existsSync(executablePath) ? executablePath : 'missing'}`, @@ -2326,6 +2727,117 @@ function renderManagedSuiteStatus( `Command: ${latestRun.displayCommand || latestRun.command}`, `Stdout: ${latestRun.stdoutPath}`, `Stderr: ${latestRun.stderrPath}`, + ...(latestLocomoSummary + ? [ + `Mode: ${latestLocomoSummary.mode}`, + `Samples: ${latestLocomoSummary.sampleCount ?? 'unknown'}`, + `Questions: ${latestLocomoSummary.questionCount ?? 'unknown'}`, + `Budget: ${latestLocomoSummary.budgetTokens ?? 'unknown'}`, + `${ + latestLocomoSummary.mode === 'retrieval' + ? 'Hit rate' + : 'Overall score' + }: ${ + latestLocomoSummary.overallScore != null + ? latestLocomoSummary.overallScore.toFixed(3) + : 'unknown' + }`, + ...(latestLocomoSummary.mode === 'retrieval' + ? [ + `Context F1: ${ + latestLocomoSummary.contextF1 != null + ? latestLocomoSummary.contextF1.toFixed(3) + : 'unknown' + }`, + ] + : []), + ...Object.entries(latestLocomoSummary.categories) + .sort(([left], [right]) => Number(left) - Number(right)) + .map( + ([category, value]) => + `cat${category}: ${ + formatLocomoCategoryValue( + category, + value, + latestLocomoSummary.mode, + ) || 'n/a' + }`, + ), + ...(latestLocomoSummary.tokenUsage + ? 
[ + `Tokens: ${formatLocomoTokenUsage( + latestLocomoSummary.tokenUsage, + )}`, + ] + : []), + ...(latestLocomoSummary.predictionsPath + ? [`Predictions: ${latestLocomoSummary.predictionsPath}`] + : []), + ] + : []), + ...(!latestLocomoSummary && latestLocomoProgress + ? [ + `Mode: ${latestLocomoProgress.mode}`, + `Samples: ${latestLocomoProgress.sampleCount ?? 'unknown'}`, + `Completed samples: ${ + latestLocomoProgress.completedSampleCount ?? 'unknown' + }/${latestLocomoProgress.sampleCount ?? '?'}`, + `Questions: ${ + latestLocomoProgress.completedQuestionCount ?? 'unknown' + }/${latestLocomoProgress.questionCount ?? '?'}`, + `Budget: ${latestLocomoProgress.budgetTokens ?? 'unknown'}`, + `${ + latestLocomoProgress.mode === 'retrieval' + ? 'Hit rate so far' + : 'Score so far' + }: ${ + latestLocomoProgress.overallScore != null + ? latestLocomoProgress.overallScore.toFixed(3) + : 'unknown' + }`, + ...(latestLocomoProgress.mode === 'retrieval' + ? [ + `Context F1 so far: ${ + latestLocomoProgress.contextF1 != null + ? latestLocomoProgress.contextF1.toFixed(3) + : 'unknown' + }`, + ] + : []), + ...(latestLocomoProgress.currentSampleId + ? [ + `Current sample: ${latestLocomoProgress.currentSampleId}`, + ...(latestLocomoProgress.currentSampleQuestionCount != + null && + latestLocomoProgress.currentSampleQuestionTotal != null + ? [ + `Current sample questions: ${latestLocomoProgress.currentSampleQuestionCount}/${latestLocomoProgress.currentSampleQuestionTotal}`, + ] + : []), + ] + : []), + ...Object.entries(latestLocomoProgress.categories) + .sort(([left], [right]) => Number(left) - Number(right)) + .map( + ([category, value]) => + `cat${category}: ${ + formatLocomoCategoryValue( + category, + value, + latestLocomoProgress.mode, + ) || 'n/a' + }`, + ), + ...(latestLocomoProgress.tokenUsage + ? 
[ + `Tokens: ${formatLocomoTokenUsage( + latestLocomoProgress.tokenUsage, + )}`, + ] + : []), + `Progress JSON: ${latestLocomoProgress.progressPath}`, + ] + : []), ...(latestTerminalBenchSummary ? [ `Score: ${latestTerminalBenchSummary.mean.toFixed(3)}`, @@ -2369,6 +2881,14 @@ function renderManagedSuiteResults( suite.id === 'terminal-bench-2.0' && latestJob.operation === 'run' ? readTerminalBenchNativeProgress(latestJob) : null; + const locomoSummary = + suite.id === 'locomo' && latestJob.operation === 'run' + ? readLocomoNativeSummary(latestJob) + : null; + const locomoProgress = + suite.id === 'locomo' && latestJob.operation === 'run' + ? readLocomoNativeProgress(latestJob) + : null; if (suite.id === 'terminal-bench-2.0') { const overviewSection = renderKeyValueSection('Overview', [ ['Evaluated model', latestJob.baseModel || latestJob.model], @@ -2430,6 +2950,112 @@ function renderManagedSuiteResults( joinSections([overviewSection, outcomeSection, runSection, pathsSection]), ); } + if (suite.id === 'locomo') { + const locomoData = locomoSummary || locomoProgress; + const overviewSection = renderKeyValueSection('Overview', [ + [ + 'Evaluated model', + locomoData?.mode !== 'retrieval' + ? latestJob.baseModel || latestJob.model + : null, + ], + ['Harness', `HybridClaw v${resolveHarnessVersion()}`], + ['Status', describeManagedSuiteRunLifecycle(latestJob)], + ['Mode', locomoData?.mode || null], + ['Dataset', locomoData?.dataset || null], + ['Samples', locomoData?.sampleCount ?? null], + [ + 'Questions', + locomoSummary + ? locomoSummary.questionCount + : locomoProgress + ? `${locomoProgress.completedQuestionCount ?? '?'}/${locomoProgress.questionCount ?? '?'}` + : null, + ], + [ + 'Completed samples', + !locomoSummary && locomoProgress + ? `${locomoProgress.completedSampleCount ?? '?'}/${locomoProgress.sampleCount ?? '?'}` + : null, + ], + ['Budget', locomoData?.budgetTokens ?? null], + [ + locomoSummary + ? locomoData?.mode === 'retrieval' + ? 
'Hit rate' + : 'Overall score' + : locomoData?.mode === 'retrieval' + ? 'Hit rate so far' + : 'Score so far', + locomoData?.overallScore ?? null, + ], + [ + locomoSummary ? 'Context F1' : 'Context F1 so far', + locomoData?.mode === 'retrieval' + ? (locomoData?.contextF1 ?? null) + : null, + ], + [ + 'Current sample', + !locomoSummary && locomoProgress + ? locomoProgress.currentSampleId + : null, + ], + ]); + const categorySection = renderKeyValueSection( + locomoSummary ? 'Categories' : 'Categories So Far', + Object.entries(locomoData?.categories || {}) + .sort(([left], [right]) => Number(left) - Number(right)) + .map( + ([category, value]) => + [ + `cat${category}`, + formatLocomoCategoryValue( + category, + value, + locomoData?.mode || 'qa', + ), + ] as const, + ), + ); + const usageSection = renderKeyValueSection('Usage', [ + ['Tokens', formatLocomoTokenUsage(locomoData?.tokenUsage)], + [ + 'Current sample questions', + !locomoSummary && + locomoProgress && + locomoProgress.currentSampleQuestionCount != null && + locomoProgress.currentSampleQuestionTotal != null + ? `${locomoProgress.currentSampleQuestionCount}/${locomoProgress.currentSampleQuestionTotal}` + : null, + ], + ]); + const runSection = renderKeyValueSection('Run', [ + ['Run ID', latestJob.runId], + ['Command', latestJob.displayCommand || latestJob.command], + ]); + const pathsSection = renderKeyValueSection('Paths', [ + ['Job dir', locomoData?.jobDir || null], + [ + 'Progress JSON', + !locomoSummary && locomoProgress ? 
locomoProgress.progressPath : null, + ], + ['Predictions JSON', locomoData?.predictionsPath || null], + ['Result JSON', locomoData?.resultPath || null], + ['Stdout', latestJob.stdoutPath], + ['Stderr', latestJob.stderrPath], + ]); + return infoResult( + `${suite.title} Results`, + joinSections([ + overviewSection, + categorySection, + usageSection, + runSection, + pathsSection, + ]), + ); + } return infoResult( `${suite.title} Results`, [ @@ -2574,6 +3200,7 @@ async function handleManagedSuiteRun(params: { effectiveAgentId?: string; channelId?: string; args: string[]; + workspaceModeExplicit: boolean; }): Promise<GatewayCommandResult> { if (!isManagedSuiteInstalled(params.suite, params.dataDir)) { return errorResult( @@ -2581,6 +3208,15 @@ `Run \`/eval ${params.suite.id} setup\` first.`, ); } + if ( + params.suite.id === 'locomo' && + params.env.profile.workspaceMode === 'fresh-agent' + ) { + return errorResult( + `${params.suite.title} Run`, + 'Native LOCOMO does not support `--fresh-agent`.
It already uses one fresh agent per conversation by default; use `--current-agent` to reuse the current agent workspace.', + ); + } if ( params.suite.id === 'terminal-bench-2.0' && params.env.profile.workspaceMode === 'fresh-agent' @@ -2597,6 +3233,7 @@ params.env, params.effectiveAgentId || 'main', params.args, + params.workspaceModeExplicit, ); if (!prepared) { return errorResult( @@ -2684,6 +3321,7 @@ async function handleManagedSuiteCommand(params: { channelId?: string; subcommand?: string; args?: string[]; + workspaceModeExplicit: boolean; +}): Promise<GatewayCommandResult> { const subcommand = String(params.subcommand || '') .trim() @@ -2701,6 +3339,7 @@ `${params.suite.title} is not implemented yet.`, '', 'Implemented suites today:', + '- `/eval locomo ...`', '- `/eval terminal-bench-2.0 ...`', '- `/eval tau2 ...`', ].join('\n'), @@ -2723,6 +3362,7 @@ effectiveAgentId: params.effectiveAgentId, channelId: params.channelId, args: params.args || [], + workspaceModeExplicit: params.workspaceModeExplicit, }); case 'status': return infoResult( @@ -2848,6 +3488,13 @@ function parseEvalProfileFlag( } } +function isWorkspaceModeFlag(arg: string): boolean { + const normalized = String(arg || '') + .trim() + .toLowerCase(); + return normalized === '--current-agent' || normalized === '--fresh-agent'; +} + function finalizeEvalProfile(profile: EvalProfile): EvalProfile { if (profile.workspaceMode === 'fresh-agent') { delete profile.agentId; @@ -2860,13 +3507,9 @@ function parseEvalAction( args: string[], effectiveAgentId?: string, -): { - action: string; - profile: EvalProfile; - commandArgs: string[]; - error?: string; -} { +): ParsedEvalAction { const profile = buildDefaultEvalProfile(effectiveAgentId); + let workspaceModeExplicit = false; if ( args.length === 1 && ['help',
'--help', '-h'].includes( @@ -2879,13 +3522,23 @@ function parseEvalAction( action: 'help', profile: finalizeEvalProfile(profile), commandArgs: [], + workspaceModeExplicit, }; } let index = 0; while (index < args.length && args[index]?.startsWith('--')) { const error = parseEvalProfileFlag(args[index], profile); - if (error) return { action: '', profile, commandArgs: [], error }; + if (error) { + return { + action: '', + profile, + commandArgs: [], + workspaceModeExplicit, + error, + }; + } + workspaceModeExplicit ||= isWorkspaceModeFlag(args[index] || ''); index += 1; } @@ -2897,6 +3550,7 @@ function parseEvalAction( action: '', profile: finalizeEvalProfile(profile), commandArgs: [], + workspaceModeExplicit, }; } const rawAction = String(args[index] || '').trim(); @@ -2907,6 +3561,7 @@ function parseEvalAction( action: '', profile: finalizeEvalProfile(profile), commandArgs: [], + workspaceModeExplicit, error: 'Use `/eval ` instead of `/eval run `.', }; @@ -2919,6 +3574,7 @@ function parseEvalAction( action: 'run', profile: finalizeEvalProfile(profile), commandArgs: [rawAction, ...args.slice(index)], + workspaceModeExplicit, }; } const error = parseEvalProfileFlag(args[index], profile); @@ -2927,8 +3583,10 @@ function parseEvalAction( action: 'run', profile: finalizeEvalProfile(profile), commandArgs: [rawAction, ...args.slice(index)], + workspaceModeExplicit, }; } + workspaceModeExplicit ||= isWorkspaceModeFlag(args[index] || ''); index += 1; } @@ -2936,6 +3594,7 @@ function parseEvalAction( action, profile: finalizeEvalProfile(profile), commandArgs: [], + workspaceModeExplicit, }; } @@ -2943,6 +3602,7 @@ function parseEvalAction( action: 'run', profile: finalizeEvalProfile(profile), commandArgs: [rawAction, ...args.slice(index)], + workspaceModeExplicit, }; } @@ -3007,6 +3667,7 @@ export async function handleEvalCommand( channelId: params.channelId, subcommand: managedSubcommand, args: parsed.commandArgs.slice(2), + workspaceModeExplicit: 
parsed.workspaceModeExplicit, }); } } @@ -3069,5 +3730,6 @@ export async function handleEvalCommand( effectiveAgentId: params.effectiveAgentId, channelId: params.channelId, subcommand: 'help', + workspaceModeExplicit: parsed.workspaceModeExplicit, }); } diff --git a/src/evals/locomo-native.ts b/src/evals/locomo-native.ts new file mode 100644 index 00000000..2726195e --- /dev/null +++ b/src/evals/locomo-native.ts @@ -0,0 +1,1633 @@ +import { createHash } from 'node:crypto'; +import fs from 'node:fs'; +import path from 'node:path'; +import { initDatabase } from '../memory/db.js'; +import { memoryService } from '../memory/memory-service.js'; +import { buildSessionKey } from '../session/session-key.js'; +import type { SemanticMemoryEntry } from '../types/memory.js'; +import type { Session } from '../types/session.js'; +import { + buildDefaultEvalProfile, + type EvalProfile, + encodeEvalProfileModel, + parseEvalProfileModel, +} from './eval-profile.js'; +import { scoreOfficialLocomoAnswer } from './locomo-official-scoring.js'; +import type { + LocomoAgentMode, + LocomoCategoryAggregate, + LocomoTokenUsage, +} from './locomo-types.js'; +import { + LOCOMO_DATASET_FILENAME, + LOCOMO_SETUP_MARKER, +} from './locomo-types.js'; + +const LOCOMO_DATASET_COMMIT = '3eb6f2c585f5e1699204e3c3bdf7adc5c28cb376'; +const LOCOMO_DATASET_URL = `https://raw.githubusercontent.com/snap-research/locomo/${LOCOMO_DATASET_COMMIT}/data/locomo10.json`; +const LOCOMO_DATASET_SHA256 = + '79fa87e90f04081343b8c8debecb80a9a6842b76a7aa537dc9fdf651ea698ff4'; +const DEFAULT_TOKEN_BUDGET = 4000; +const ANSWER_BUFFER_TOKENS = 64; +const DEFAULT_OPENAI_BASE_URL = 'http://127.0.0.1:9090/v1'; +const DEFAULT_EVAL_MODEL = 'hybridai/gpt-4.1-mini'; +const LOCOMO_DATASET_DOWNLOAD_TIMEOUT_MS = 120_000; +const LOCOMO_MODEL_CALL_TIMEOUT_MS = 30_000; +const LOCOMO_QA_CONCURRENCY = 4; +const LOCOMO_PROGRESS_WRITE_INTERVAL_QUESTIONS = 20; + +const CONVERSATION_START_PROMPT = + 'Below is a conversation between two people: 
{speakerA} and {speakerB}. The conversation takes place over multiple days and the date of each conversation is written at the beginning of the conversation.\n\n'; + +const QA_PROMPT = ` +Based on the above context, write an answer in the form of a short phrase for the following question. Answer with exact words from the context whenever possible. + +Question: {question} +Short answer: +`.trim(); + +const QA_PROMPT_CATEGORY_5 = ` +Based on the above context, answer the following question. + +Question: {question} +Short answer: +`.trim(); + +const flattenedConversationTurnsCache = new WeakMap< + Record<string, unknown>, + LocomoFlattenedTurn[] +>(); + +type LocomoOperation = 'setup' | 'run'; +type LocomoEvaluationMode = 'qa' | 'retrieval'; + +interface LocomoTurn { + speaker: string; + dia_id: string; + text: string; + blip_caption?: string; +} + +type LocomoFlattenedTurn = LocomoTurn & { + sessionNum: number; + dateTime: string; +}; + +interface LocomoQA { + question: string; + answer?: unknown; + adversarial_answer?: string; + evidence?: string[]; + category: number; +} + +interface LocomoSample { + sample_id: string; + conversation: Record<string, unknown>; + qa: LocomoQA[]; +} + +interface LocomoRunnerOptions { + operation: LocomoOperation; + installDir: string; + budgetTokens: number; + numSamples: number | null; + maxQuestions: number | null; + mode: LocomoEvaluationMode; + agentMode: LocomoAgentMode; +} + +interface LocomoGatewayRuntime { + baseUrl: string; + apiKey: string; + model: string; + baseModel: string; + profile: EvalProfile; +} + +interface LocomoQuestionPrediction { + category: number; + question: string; + answer: string; + prediction: string; + score: number; + evidence: string[]; + contextF1?: number | null; + retrievedSourceMessageIds?: number[]; +} + +interface LocomoSamplePrediction { + sampleId: string; + questionCount: number; + meanScore: number; + meanContextF1: number | null; + qa: LocomoQuestionPrediction[]; +} + +interface LocomoRunSummary { + suite: 'locomo'; + mode:
LocomoEvaluationMode; + dataset: string; + generatedAt: string; + model: string | null; + budgetTokens: number; + sampleCount: number; + questionCount: number; + overallScore: number; + contextF1: number | null; + resultPath: string; + predictionsPath: string; + categories: Record<string, LocomoCategoryAggregate>; + tokenUsage: LocomoTokenUsage | null; + samples: Array<{ + sampleId: string; + questionCount: number; + meanScore: number; + }>; +} + +interface LocomoProgressSummary { + suite: 'locomo'; + mode: LocomoEvaluationMode; + dataset: string; + updatedAt: string; + model: string | null; + budgetTokens: number; + sampleCount: number; + completedSampleCount: number; + questionCount: number; + completedQuestionCount: number; + overallScore: number; + contextF1: number | null; + currentSampleId: string | null; + currentSampleQuestionCount: number | null; + currentSampleQuestionTotal: number | null; + progressPath: string; + resultPath: string; + predictionsPath: string; + categories: Record<string, LocomoCategoryAggregate>; + tokenUsage: LocomoTokenUsage | null; +} + +interface LocomoChatCompletionUsage { + prompt_tokens?: number; + completion_tokens?: number; + total_tokens?: number; +} + +interface LocomoCategoryRunningAggregate { + scoreTotal: number; + contextF1Total: number; + questionCount: number; +} + +interface LocomoRetrievalSession { + session: Session; + messageIdByDiaId: Map<string, number>; +} + +interface LocomoPreparedQuestion { + prompt: string; + scoreCategory: number; + categoryFiveAnswerKey: Record<'a' | 'b', string> | null; +} + +export async function runLocomoNativeCli(argv: string[]): Promise<void> { + const options = parseArgs(argv); + if (options.operation === 'setup') { + await runSetup(options); + return; + } + await runEvaluation(options); +} + +function parseArgs(argv: string[]): LocomoRunnerOptions { + let operation: LocomoOperation | null = null; + let installDir = ''; + let budgetTokens = DEFAULT_TOKEN_BUDGET; + let numSamples: number | null = null; + let maxQuestions: number | null = null; + let mode:
LocomoEvaluationMode = 'qa'; + let agentMode: LocomoAgentMode = 'conversation-fresh'; + + for (let index = 0; index < argv.length; index += 1) { + const current = String(argv[index] || '').trim(); + if (!current) continue; + if (current === 'setup' || current === 'run') { + operation = current; + continue; + } + + const [flag, inlineValue] = splitInlineFlag(current); + const nextValue = () => inlineValue || String(argv[index + 1] || '').trim(); + + if (flag === '--install-dir') { + installDir = nextValue(); + if (!inlineValue) index += 1; + continue; + } + if (flag === '--budget') { + budgetTokens = clampPositiveInt(nextValue(), DEFAULT_TOKEN_BUDGET); + if (!inlineValue) index += 1; + continue; + } + if (flag === '--num-samples') { + const parsed = clampPositiveInt(nextValue(), 0); + numSamples = parsed > 0 ? parsed : null; + if (!inlineValue) index += 1; + continue; + } + if (flag === '--max-questions') { + const parsed = clampPositiveInt(nextValue(), 0); + maxQuestions = parsed > 0 ? parsed : null; + if (!inlineValue) index += 1; + continue; + } + if (flag === '--mode') { + const value = nextValue().toLowerCase(); + if (value === 'qa' || value === 'retrieval') { + mode = value; + } else { + throw new Error(`Unsupported LOCOMO mode \`${value || '(empty)'}\`.`); + } + if (!inlineValue) index += 1; + continue; + } + if (flag === '--agent-mode') { + const value = nextValue().toLowerCase(); + if (value === 'conversation-fresh' || value === 'current-agent') { + agentMode = value; + } else { + throw new Error( + `Unsupported LOCOMO agent mode \`${value || '(empty)'}\`.`, + ); + } + if (!inlineValue) index += 1; + continue; + } + + throw new Error(`Unknown LOCOMO option: \`${current}\`.`); + } + + if (!operation) { + throw new Error('Missing LOCOMO operation. 
Use `setup` or `run`.'); + } + if (!installDir) { + throw new Error('Missing required `--install-dir`.'); + } + + return { + operation, + installDir: path.resolve(installDir), + budgetTokens, + numSamples, + maxQuestions, + mode, + agentMode, + }; +} + +function splitInlineFlag(value: string): [string, string] { + const separator = value.indexOf('='); + if (separator < 0) return [value, '']; + return [value.slice(0, separator), value.slice(separator + 1).trim()]; +} + +function clampPositiveInt(value: string, fallback: number): number { + const parsed = Number.parseInt(String(value || '').trim(), 10); + if (!Number.isFinite(parsed) || parsed <= 0) return fallback; + return parsed; +} + +function getMarkerPath(installDir: string): string { + return path.join(installDir, LOCOMO_SETUP_MARKER); +} + +function getDatasetPath(installDir: string): string { + return path.join(installDir, 'data', LOCOMO_DATASET_FILENAME); +} + +function getProgressPath(jobDir: string): string { + return path.join(jobDir, 'progress.json'); +} + +async function runSetup(options: LocomoRunnerOptions): Promise<void> { + fs.mkdirSync(options.installDir, { recursive: true }); + fs.mkdirSync(path.dirname(getDatasetPath(options.installDir)), { + recursive: true, + }); + + const datasetPath = getDatasetPath(options.installDir); + if (!fs.existsSync(datasetPath)) { + console.log(`Downloading dataset from ${LOCOMO_DATASET_URL}`); + const response = await fetchWithTimeout( + LOCOMO_DATASET_URL, + undefined, + LOCOMO_DATASET_DOWNLOAD_TIMEOUT_MS, + 'LOCOMO dataset download', + ); + if (!response.ok) { + throw new Error( + `Failed to download LOCOMO dataset: HTTP ${response.status}`, + ); + } + const rawBuffer = Buffer.from(await response.arrayBuffer()); + verifyDownloadedDataset(rawBuffer); + const raw = rawBuffer.toString('utf-8'); + if (!raw.trim().startsWith('[')) { + throw new Error('Downloaded LOCOMO dataset is not valid JSON.'); + } + fs.writeFileSync(datasetPath, rawBuffer); + } else {
console.log(`Dataset already present at ${datasetPath}`); + } + + const sampleCount = loadSamples(datasetPath).length; + fs.writeFileSync(getMarkerPath(options.installDir), 'ok\n', 'utf-8'); + + console.log(`Install dir: ${options.installDir}`); + console.log(`Dataset path: ${datasetPath}`); + console.log(`Samples: ${sampleCount}`); + console.log('LOCOMO setup complete.'); +} + +async function runEvaluation(options: LocomoRunnerOptions): Promise<void> { + const datasetPath = getDatasetPath(options.installDir); + const markerPath = getMarkerPath(options.installDir); + const hasMarker = fs.existsSync(markerPath); + const hasDataset = fs.existsSync(datasetPath); + if (!hasMarker && !hasDataset) { + throw new Error( + 'LOCOMO is not set up. Run `setup` first, or use `/eval locomo setup`.', + ); + } + if (!hasDataset) { + throw new Error( + `LOCOMO dataset is missing at ${datasetPath}. Re-run \`setup\`, or use \`/eval locomo setup\`.`, + ); + } + if (!hasMarker) { + throw new Error( + `LOCOMO setup marker is missing at ${markerPath}. Re-run \`setup\`, or use \`/eval locomo setup\`.`, + ); + } + + const runtime = readGatewayRuntime(); + const allSamples = loadSamples(datasetPath); + const selectedSamples = + options.numSamples && options.numSamples > 0 + ?
allSamples.slice(0, options.numSamples) + : allSamples; + const plannedSamples = applyQuestionLimit( + selectedSamples, + options.maxQuestions, + ); + const totalQuestionCount = plannedSamples.reduce( + (total, sample) => total + sample.qa.length, + 0, + ); + + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const jobDir = path.join(options.installDir, 'jobs', timestamp); + fs.mkdirSync(jobDir, { recursive: true }); + const progressPath = getProgressPath(jobDir); + const resultPath = path.join(jobDir, 'result.json'); + const predictionsPath = path.join(jobDir, 'predictions.json'); + const retrievalDbPath = path.join(jobDir, 'retrieval-memory.db'); + + console.log(`Job dir: ${jobDir}`); + console.log(`Dataset: ${datasetPath}`); + console.log(`Mode: ${options.mode}`); + if (options.mode === 'qa') { + console.log(`Model: ${runtime.model}`); + } else { + console.log(`Memory DB: ${retrievalDbPath}`); + } + console.log(`Samples: ${plannedSamples.length}`); + console.log(`Budget: ${options.budgetTokens}`); + if (options.mode === 'qa') { + console.log(`Agent mode: ${options.agentMode}`); + } + if (options.maxQuestions) { + console.log(`Max questions: ${options.maxQuestions}`); + } + console.log(`Questions planned: ${totalQuestionCount}`); + + if (options.mode === 'retrieval') { + initDatabase({ quiet: true, dbPath: retrievalDbPath }); + } + + const predictions: LocomoSamplePrediction[] = []; + const categories = new Map<number, LocomoCategoryRunningAggregate>(); + const usageTotals: LocomoTokenUsage = { + promptTokens: 0, + completionTokens: 0, + totalTokens: 0, + responsesWithUsage: 0, + }; + let questionCount = 0; + let scoreTotal = 0; + let contextF1Total = 0; + let completedSampleCount = 0; + const runTag = path.basename(jobDir); + + writeProgressFile({ + progressPath, + resultPath, + predictionsPath, + mode: options.mode, + datasetPath, + model: options.mode === 'qa' ?
runtime.model : null, + budgetTokens: options.budgetTokens, + sampleCount: plannedSamples.length, + completedSampleCount, + questionCount: totalQuestionCount, + completedQuestionCount: questionCount, + scoreTotal, + contextF1Total, + categories, + usageTotals, + currentSampleId: null, + currentSampleQuestionCount: null, + currentSampleQuestionTotal: null, + }); + + const writeQuestionProgress = (params: { + samplePrediction: LocomoSamplePrediction; + model: string | null; + contextF1TotalForSnapshot: number; + }): void => { + if ( + !shouldWriteQuestionProgressSnapshot(params.samplePrediction.qa.length) + ) { + return; + } + const completedQuestionCount = + questionCount + params.samplePrediction.qa.length; + const partialScoreTotal = + scoreTotal + + params.samplePrediction.qa.reduce((total, qa) => total + qa.score, 0); + writeProgressFile({ + progressPath, + resultPath, + predictionsPath, + mode: options.mode, + datasetPath, + model: params.model, + budgetTokens: options.budgetTokens, + sampleCount: plannedSamples.length, + completedSampleCount, + questionCount: totalQuestionCount, + completedQuestionCount, + scoreTotal: partialScoreTotal, + contextF1Total: params.contextF1TotalForSnapshot, + categories, + usageTotals, + currentSampleId: params.samplePrediction.sampleId, + currentSampleQuestionCount: params.samplePrediction.qa.length, + currentSampleQuestionTotal: params.samplePrediction.questionCount, + }); + }; + + for (const sample of plannedSamples) { + const samplePrediction = + options.mode === 'retrieval' + ? 
await evaluateRetrievalSample({ + sample, + budgetTokens: options.budgetTokens, + categories, + session: ingestSampleIntoNativeMemory({ + sample, + runTag, + agentMode: options.agentMode, + runtime, + }), + onQuestionProgress: ({ samplePrediction }) => { + const partialContextF1Total = + contextF1Total + + samplePrediction.qa.reduce( + (total, qa) => total + (qa.contextF1 || 0), + 0, + ); + writeQuestionProgress({ + samplePrediction, + model: null, + contextF1TotalForSnapshot: partialContextF1Total, + }); + }, + }) + : await evaluateSample({ + runtime, + requestModel: resolveSampleRequestModel({ + runtime, + agentMode: options.agentMode, + sampleId: sample.sample_id, + runTag, + }), + sample, + budgetTokens: options.budgetTokens, + usageTotals, + categories, + onQuestionProgress: ({ samplePrediction }) => { + writeQuestionProgress({ + samplePrediction, + model: runtime.model, + contextF1TotalForSnapshot: contextF1Total, + }); + }, + }); + predictions.push(samplePrediction); + questionCount += samplePrediction.questionCount; + scoreTotal += samplePrediction.meanScore * samplePrediction.questionCount; + contextF1Total += + (samplePrediction.meanContextF1 || 0) * samplePrediction.questionCount; + completedSampleCount += 1; + writeProgressFile({ + progressPath, + resultPath, + predictionsPath, + mode: options.mode, + datasetPath, + model: options.mode === 'qa' ? runtime.model : null, + budgetTokens: options.budgetTokens, + sampleCount: plannedSamples.length, + completedSampleCount, + questionCount: totalQuestionCount, + completedQuestionCount: questionCount, + scoreTotal, + contextF1Total, + categories, + usageTotals, + currentSampleId: + completedSampleCount < plannedSamples.length + ? plannedSamples[completedSampleCount]?.sample_id || null + : null, + currentSampleQuestionCount: null, + currentSampleQuestionTotal: + completedSampleCount < plannedSamples.length + ? 
plannedSamples[completedSampleCount]?.qa.length || null + : null, + }); + } + + fs.writeFileSync( + predictionsPath, + JSON.stringify(predictions, null, 2), + 'utf-8', + ); + + const summary: LocomoRunSummary = { + suite: 'locomo', + mode: options.mode, + dataset: path.basename(datasetPath), + generatedAt: new Date().toISOString(), + model: options.mode === 'qa' ? runtime.model : null, + budgetTokens: options.budgetTokens, + sampleCount: plannedSamples.length, + questionCount, + overallScore: roundMetric(scoreTotal / Math.max(questionCount, 1)), + contextF1: + options.mode === 'retrieval' + ? roundMetric(contextF1Total / Math.max(questionCount, 1)) + : null, + resultPath, + predictionsPath, + categories: buildCategorySummaries(categories, options.mode), + tokenUsage: + options.mode === 'qa' && usageTotals.responsesWithUsage > 0 + ? usageTotals + : null, + samples: predictions.map((samplePrediction) => ({ + sampleId: samplePrediction.sampleId, + questionCount: samplePrediction.questionCount, + meanScore: samplePrediction.meanScore, + })), + }; + + fs.writeFileSync(resultPath, JSON.stringify(summary, null, 2), 'utf-8'); + printSummaryTable(summary); +} + +function applyQuestionLimit( + samples: LocomoSample[], + maxQuestions: number | null, +): LocomoSample[] { + if (!maxQuestions || maxQuestions <= 0) { + return samples.map((sample) => ({ + ...sample, + qa: Array.isArray(sample.qa) ? [...sample.qa] : [], + })); + } + + const selected: LocomoSample[] = []; + let remaining = maxQuestions; + for (const sample of samples) { + if (remaining <= 0) break; + const qa = Array.isArray(sample.qa) ? 
sample.qa.slice(0, remaining) : []; + if (qa.length === 0) continue; + selected.push({ + ...sample, + qa, + }); + remaining -= qa.length; + } + return selected; +} + +function shouldWriteQuestionProgressSnapshot( + completedSampleQuestionCount: number, +): boolean { + return ( + completedSampleQuestionCount === 1 || + completedSampleQuestionCount % LOCOMO_PROGRESS_WRITE_INTERVAL_QUESTIONS === + 0 + ); +} + +function buildCategorySummaries( + categories: Map<number, LocomoCategoryRunningAggregate>, + mode: LocomoEvaluationMode, +): Record<string, LocomoCategoryAggregate> { + const categorySummaries: Record<string, LocomoCategoryAggregate> = {}; + for (const [category, aggregate] of categories.entries()) { + categorySummaries[String(category)] = { + meanScore: roundMetric( + aggregate.scoreTotal / Math.max(aggregate.questionCount, 1), + ), + questionCount: aggregate.questionCount, + contextF1: + mode === 'retrieval' + ? roundMetric( + aggregate.contextF1Total / Math.max(aggregate.questionCount, 1), + ) + : null, + }; + } + return categorySummaries; +} + +function writeProgressFile(params: { + progressPath: string; + resultPath: string; + predictionsPath: string; + mode: LocomoEvaluationMode; + datasetPath: string; + model: string | null; + budgetTokens: number; + sampleCount: number; + completedSampleCount: number; + questionCount: number; + completedQuestionCount: number; + scoreTotal: number; + contextF1Total: number; + categories: Map<number, LocomoCategoryRunningAggregate>; + usageTotals: LocomoTokenUsage; + currentSampleId: string | null; + currentSampleQuestionCount: number | null; + currentSampleQuestionTotal: number | null; +}): void { + const progress: LocomoProgressSummary = { + suite: 'locomo', + mode: params.mode, + dataset: path.basename(params.datasetPath), + updatedAt: new Date().toISOString(), + model: params.model, + budgetTokens: params.budgetTokens, + sampleCount: params.sampleCount, + completedSampleCount: params.completedSampleCount, + questionCount: params.questionCount, + completedQuestionCount: params.completedQuestionCount, + overallScore: roundMetric( + params.scoreTotal /
Math.max(params.completedQuestionCount, 1), + ), + contextF1: + params.mode === 'retrieval' + ? roundMetric( + params.contextF1Total / Math.max(params.completedQuestionCount, 1), + ) + : null, + currentSampleId: params.currentSampleId, + currentSampleQuestionCount: params.currentSampleQuestionCount, + currentSampleQuestionTotal: params.currentSampleQuestionTotal, + progressPath: params.progressPath, + resultPath: params.resultPath, + predictionsPath: params.predictionsPath, + categories: buildCategorySummaries(params.categories, params.mode), + tokenUsage: + params.mode === 'qa' && params.usageTotals.responsesWithUsage > 0 + ? params.usageTotals + : null, + }; + fs.writeFileSync( + params.progressPath, + JSON.stringify(progress, null, 2), + 'utf-8', + ); +} + +function readGatewayRuntime(): LocomoGatewayRuntime { + const baseUrl = String(process.env.OPENAI_BASE_URL || DEFAULT_OPENAI_BASE_URL) + .trim() + .replace(/\/+$/, ''); + const apiKey = String( + process.env.OPENAI_API_KEY || 'hybridclaw-local', + ).trim(); + const model = String( + process.env.HYBRIDCLAW_EVAL_MODEL || DEFAULT_EVAL_MODEL, + ).trim(); + const parsed = parseEvalProfileModel(model || DEFAULT_EVAL_MODEL); + + return { + baseUrl: baseUrl || DEFAULT_OPENAI_BASE_URL, + apiKey: apiKey || 'hybridclaw-local', + model: model || DEFAULT_EVAL_MODEL, + baseModel: parsed.model || DEFAULT_EVAL_MODEL, + profile: parsed.profile || buildDefaultEvalProfile(), + }; +} + +function resolveSampleRequestModel(params: { + runtime: LocomoGatewayRuntime; + agentMode: LocomoAgentMode; + sampleId: string; + runTag: string; +}): string { + if (params.agentMode === 'current-agent') { + return params.runtime.model; + } + + return encodeEvalProfileModel(params.runtime.baseModel, { + workspaceMode: 'current-agent', + ablateSystemPrompt: params.runtime.profile.ablateSystemPrompt, + includePromptParts: [...params.runtime.profile.includePromptParts], + omitPromptParts: [...params.runtime.profile.omitPromptParts], + agentId: 
buildConversationAgentId(params.sampleId, params.runTag), + }); +} + +function buildConversationAgentId(sampleId: string, runTag: string): string { + const sanitizedSampleId = String(sampleId || '') + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + .slice(0, 24); + const digest = createHash('sha1') + .update(`${runTag}:${sampleId}`) + .digest('hex') + .slice(0, 12); + return `locomo-${sanitizedSampleId || 'sample'}-${digest}`; +} + +function loadSamples(datasetPath: string): LocomoSample[] { + const raw = fs.readFileSync(datasetPath, 'utf-8'); + const parsed = JSON.parse(raw) as unknown; + if (!Array.isArray(parsed)) { + throw new Error(`Expected LOCOMO dataset array in ${datasetPath}.`); + } + const firstSample = parsed[0]; + if (firstSample !== undefined) { + const sample = + firstSample && typeof firstSample === 'object' ? firstSample : null; + const sampleId = sample + ? (sample as { sample_id?: unknown }).sample_id + : null; + const conversation = sample + ? (sample as { conversation?: unknown }).conversation + : null; + const qa = sample ? (sample as { qa?: unknown }).qa : null; + if ( + typeof sampleId !== 'string' || + !sampleId.trim() || + !conversation || + typeof conversation !== 'object' || + Array.isArray(conversation) || + !Array.isArray(qa) + ) { + throw new Error( + `Invalid LOCOMO sample at index 0 in ${datasetPath}. 
Expected a non-empty string sample_id, an object conversation, and an array qa.`, + ); + } + } + return parsed as LocomoSample[]; +} + +function verifyDownloadedDataset(rawBuffer: Uint8Array): void { + const actualSha256 = createHash('sha256').update(rawBuffer).digest('hex'); + if (actualSha256 !== LOCOMO_DATASET_SHA256) { + throw new Error( + `Downloaded LOCOMO dataset failed SHA-256 verification (expected ${LOCOMO_DATASET_SHA256}, got ${actualSha256}).`, + ); + } +} + +function isFetchTimeoutError(error: unknown): boolean { + return ( + error instanceof Error && + (error.name === 'TimeoutError' || error.name === 'AbortError') + ); +} + +async function fetchWithTimeout( + input: string, + init: RequestInit | undefined, + timeoutMs: number, + label: string, +): Promise<Response> { + try { + return await fetch(input, { + ...init, + signal: AbortSignal.timeout(timeoutMs), + }); + } catch (error) { + if (isFetchTimeoutError(error)) { + throw new Error(`${label} timed out after ${timeoutMs}ms.`); + } + throw error; + } +} + +async function evaluateSample(params: { + runtime: LocomoGatewayRuntime; + requestModel: string; + sample: LocomoSample; + budgetTokens: number; + usageTotals: LocomoTokenUsage; + categories: Map<number, LocomoCategoryRunningAggregate>; + onQuestionProgress?: (params: { + samplePrediction: LocomoSamplePrediction; + }) => void; +}): Promise<LocomoSamplePrediction> { + const sampleQa = Array.isArray(params.sample.qa) ?
params.sample.qa : []; + const sampleQuestionCount = sampleQa.length; + const qaPredictions: Array<LocomoQuestionPrediction | null> = Array.from( + { length: sampleQuestionCount }, + () => null, + ); + let nextQuestionIndex = 0; + + const runWorker = async (): Promise<void> => { + while (true) { + const questionIndex = nextQuestionIndex; + nextQuestionIndex += 1; + if (questionIndex >= sampleQa.length) { + return; + } + + const qa = sampleQa[questionIndex]; + const prepared = buildQuestionPrompt( + params.sample, + qa, + params.budgetTokens, + ); + const completion = await requestModelAnswer({ + runtime: params.runtime, + model: params.requestModel, + prompt: prepared.prompt, + user: `locomo-${params.sample.sample_id}`, + }); + mergeUsage(params.usageTotals, completion.usage); + + const answer = answerToString(qa); + const prediction = + prepared.scoreCategory === 5 + ? normalizeCategoryFivePrediction( + completion.content, + prepared.categoryFiveAnswerKey, + ) + : normalizeModelPrediction(completion.content); + const score = scoreLocomoAnswer(qa, prediction); + qaPredictions[questionIndex] = { + category: prepared.scoreCategory, + question: qa.question, + answer, + prediction, + score, + evidence: Array.isArray(qa.evidence) ?
qa.evidence : [], + }; + + const existing = params.categories.get(prepared.scoreCategory) || { + scoreTotal: 0, + contextF1Total: 0, + questionCount: 0, + }; + existing.scoreTotal += score; + existing.questionCount += 1; + params.categories.set(prepared.scoreCategory, existing); + + const completedPredictions = qaPredictions.filter( + (entry): entry is LocomoQuestionPrediction => entry !== null, + ); + params.onQuestionProgress?.({ + samplePrediction: { + sampleId: params.sample.sample_id, + questionCount: sampleQuestionCount, + meanScore: roundMetric( + completedPredictions.reduce( + (total, entry) => total + entry.score, + 0, + ) / Math.max(completedPredictions.length, 1), + ), + meanContextF1: null, + qa: completedPredictions, + }, + }); + } + }; + + await Promise.all( + Array.from( + { length: Math.min(LOCOMO_QA_CONCURRENCY, sampleQa.length) }, + () => runWorker(), + ), + ); + + const completedQaPredictions = qaPredictions.filter( + (entry): entry is LocomoQuestionPrediction => entry !== null, + ); + const questionCount = completedQaPredictions.length; + const meanScore = roundMetric( + completedQaPredictions.reduce((total, qa) => total + qa.score, 0) / + Math.max(questionCount, 1), + ); + return { + sampleId: params.sample.sample_id, + questionCount, + meanScore, + meanContextF1: null, + qa: completedQaPredictions, + }; +} + +async function evaluateRetrievalSample(params: { + sample: LocomoSample; + budgetTokens: number; + categories: Map<number, LocomoCategoryRunningAggregate>; + session: LocomoRetrievalSession; + onQuestionProgress?: (params: { + samplePrediction: LocomoSamplePrediction; + }) => void; +}): Promise<LocomoSamplePrediction> { + const qaPredictions: LocomoQuestionPrediction[] = []; + const sampleQuestionCount = (params.sample.qa || []).length; + + for (const qa of params.sample.qa || []) { + const memoryContext = memoryService.buildPromptMemoryContext({ + session: params.session.session, + query: String(qa.question || '').trim(), + semanticLimit: 12, + }); + const recalledMemories =
budgetTruncateSemanticMemories( + memoryContext.semanticMemories, + params.budgetTokens, + ); + const recalledContext = recalledMemories + .map((entry) => entry.content) + .join('\n\n') + .trim(); + const retrievedSourceMessageIds = recalledMemories + .map((entry) => entry.source_message_id) + .filter( + (value): value is number => + typeof value === 'number' && Number.isFinite(value) && value > 0, + ); + const hitRate = computeRetrievalHitRate({ + sample: params.sample, + evidence: Array.isArray(qa.evidence) ? qa.evidence : [], + retrievedContent: recalledContext, + }); + const contextF1 = computeContextTokenF1( + recalledContext, + answerToString(qa), + ); + + qaPredictions.push({ + category: qa.category, + question: qa.question, + answer: answerToString(qa), + prediction: recalledContext, + score: hitRate, + evidence: Array.isArray(qa.evidence) ? qa.evidence : [], + contextF1, + retrievedSourceMessageIds, + }); + + const existing = params.categories.get(qa.category) || { + scoreTotal: 0, + contextF1Total: 0, + questionCount: 0, + }; + existing.scoreTotal += hitRate; + existing.contextF1Total += contextF1; + existing.questionCount += 1; + params.categories.set(qa.category, existing); + params.onQuestionProgress?.({ + samplePrediction: { + sampleId: params.sample.sample_id, + questionCount: sampleQuestionCount, + meanScore: roundMetric( + qaPredictions.reduce((total, entry) => total + entry.score, 0) / + Math.max(qaPredictions.length, 1), + ), + meanContextF1: roundMetric( + qaPredictions.reduce( + (total, entry) => total + (entry.contextF1 || 0), + 0, + ) / Math.max(qaPredictions.length, 1), + ), + qa: [...qaPredictions], + }, + }); + } + + const questionCount = qaPredictions.length; + return { + sampleId: params.sample.sample_id, + questionCount, + meanScore: roundMetric( + qaPredictions.reduce((total, entry) => total + entry.score, 0) / + Math.max(questionCount, 1), + ), + meanContextF1: roundMetric( + qaPredictions.reduce( + (total, entry) => total + 
(entry.contextF1 || 0), + 0, + ) / Math.max(questionCount, 1), + ), + qa: qaPredictions, + }; +} + +function buildQuestionPrompt( + sample: LocomoSample, + qa: LocomoQA, + budgetTokens: number, +): LocomoPreparedQuestion { + let question = String(qa.question || '').trim(); + if (qa.category === 2) { + question += ' Use DATE of CONVERSATION to answer with an approximate date.'; + } + if (qa.category === 5) { + question = buildCategoryFiveQuestion(sample, qa); + } + + const speakers = getSpeakerNames(sample); + const conversationStart = CONVERSATION_START_PROMPT.replace( + '{speakerA}', + speakers[0], + ).replace('{speakerB}', speakers[1]); + const qaPromptTemplate = qa.category === 5 ? QA_PROMPT_CATEGORY_5 : QA_PROMPT; + const questionPrompt = qaPromptTemplate.replace('{question}', question); + const availableConversationTokens = Math.max( + 1, + budgetTokens - + estimateTokenCount(conversationStart) - + estimateTokenCount(questionPrompt) - + ANSWER_BUFFER_TOKENS, + ); + const conversation = selectConversationContext( + sample.conversation, + availableConversationTokens, + ); + + return { + prompt: `${conversationStart}${conversation}\n\n${questionPrompt}`.trim(), + scoreCategory: qa.category, + categoryFiveAnswerKey: + qa.category === 5 ? buildCategoryFiveAnswerKey(sample, qa) : null, + }; +} + +function buildCategoryFiveQuestion(sample: LocomoSample, qa: LocomoQA): string { + const answerKey = buildCategoryFiveAnswerKey(sample, qa); + return `${qa.question} Select the correct answer: (a) ${answerKey.a} (b) ${answerKey.b}.`; +} + +function buildCategoryFiveAnswerKey( + sample: LocomoSample, + qa: LocomoQA, +): Record<'a' | 'b', string> { + const answer = answerToString(qa) || 'No information available'; + const notMentioned = 'Not mentioned in the conversation'; + const answerFirst = hashText(`${sample.sample_id}:${qa.question}`) % 2 === 0; + return answerFirst + ? 
{ a: answer, b: notMentioned } + : { a: notMentioned, b: answer }; +} + +function getSpeakerNames(sample: LocomoSample): [string, string] { + const explicitA = String(sample.conversation.speaker_a || '').trim(); + const explicitB = String(sample.conversation.speaker_b || '').trim(); + if (explicitA && explicitB) { + return [explicitA, explicitB]; + } + + const discovered: string[] = []; + for (const turn of flattenConversationTurns(sample.conversation)) { + const speaker = turn.speaker.trim(); + if (!speaker || discovered.includes(speaker)) continue; + discovered.push(speaker); + if (discovered.length === 2) break; + } + + return [ + discovered[0] || explicitA || 'Speaker A', + discovered[1] || explicitB || 'Speaker B', + ]; +} + +function flattenConversationTurns( + conversation: Record<string, unknown>, +): LocomoFlattenedTurn[] { + const cached = flattenedConversationTurnsCache.get(conversation); + if (cached) { + return cached; + } + + const sessionNames = Object.keys(conversation || {}) + .filter((key) => /^session_\d+$/.test(key)) + .sort((left, right) => { + const leftNum = Number.parseInt(left.slice('session_'.length), 10) || 0; + const rightNum = Number.parseInt(right.slice('session_'.length), 10) || 0; + return leftNum - rightNum; + }); + const turns: LocomoFlattenedTurn[] = []; + + for (const sessionName of sessionNames) { + const sessionNum = + Number.parseInt(sessionName.slice('session_'.length), 10) || 0; + const dateTime = String( + conversation[`session_${sessionNum}_date_time`] || '', + ).trim(); + const rawTurns = conversation[sessionName]; + if (!Array.isArray(rawTurns)) continue; + + for (const entry of rawTurns) { + if (!entry || typeof entry !== 'object') continue; + const record = entry as Record<string, unknown>; + const speaker = String(record.speaker || '').trim(); + const diaId = String(record.dia_id || '').trim(); + const text = String(record.text || '').trim(); + if (!speaker || !diaId || !text) continue; + turns.push({ + sessionNum, + dateTime, + speaker, + dia_id:
diaId, + text, + ...(typeof record.blip_caption === 'string' && + record.blip_caption.trim().length > 0 + ? { blip_caption: record.blip_caption.trim() } + : {}), + }); + } + } + + flattenedConversationTurnsCache.set(conversation, turns); + return turns; +} + +function selectConversationContext( + conversation: Record<string, unknown>, + maxTokens: number, +): string { + const chronologicalTurns = flattenConversationTurns(conversation); + const selected: LocomoFlattenedTurn[] = []; + let totalTokens = 0; + + for (let index = chronologicalTurns.length - 1; index >= 0; index -= 1) { + const turn = chronologicalTurns[index]; + const headerTokens = + selected.length === 0 || selected[0].sessionNum !== turn.sessionNum + ? estimateTokenCount(`DATE: ${turn.dateTime}\nCONVERSATION:\n`) + : 0; + const turnTokens = estimateTokenCount(formatConversationTurn(turn)); + if ( + totalTokens + headerTokens + turnTokens > maxTokens && + selected.length + ) { + break; + } + selected.unshift(turn); + totalTokens += headerTokens + turnTokens; + } + + if (selected.length === 0) { + const lastTurn = chronologicalTurns.at(-1); + return lastTurn + ? 
`DATE: ${lastTurn.dateTime || 'unknown'}\nCONVERSATION:\n${formatConversationTurn(lastTurn)}`.trim() + : ''; + } + + const sections: string[] = []; + let currentSessionNum: number | null = null; + let currentDateTime = ''; + let currentTurns: string[] = []; + + for (const turn of selected) { + if (currentSessionNum !== turn.sessionNum) { + if (currentTurns.length > 0) { + sections.push( + `DATE: ${currentDateTime}\nCONVERSATION:\n${currentTurns.join('')}`.trim(), + ); + } + currentSessionNum = turn.sessionNum; + currentDateTime = turn.dateTime || 'unknown'; + currentTurns = []; + } + currentTurns.push(formatConversationTurn(turn)); + } + + if (currentTurns.length > 0) { + sections.push( + `DATE: ${currentDateTime}\nCONVERSATION:\n${currentTurns.join('')}`.trim(), + ); + } + + return sections.join('\n\n').trim(); +} + +function formatConversationTurn(turn: LocomoTurn): string { + const base = `${turn.speaker} said, "${turn.text}"`; + if (turn.blip_caption) { + return `${base} and shared ${turn.blip_caption}.\n`; + } + return `${base}\n`; +} + +function ingestSampleIntoNativeMemory(params: { + sample: LocomoSample; + runTag: string; + agentMode: LocomoAgentMode; + runtime: LocomoGatewayRuntime; +}): LocomoRetrievalSession { + const agentId = resolveRetrievalAgentId(params); + const sessionId = buildSessionKey( + agentId, + 'locomo', + 'dm', + `${params.runTag}-${params.sample.sample_id}`, + ); + const session = memoryService.getOrCreateSession( + sessionId, + null, + 'locomo', + agentId, + ); + const messageIdByDiaId = new Map(); + const [speakerA, speakerB] = getSpeakerNames(params.sample); + + for (const turn of flattenConversationTurns(params.sample.conversation)) { + const role = turn.speaker === speakerB ? 
'assistant' : 'user'; + const content = formatRetrievalMemoryTurn(turn); + const messageId = memoryService.storeMessage({ + sessionId, + userId: `locomo:${turn.speaker.toLowerCase().replace(/\s+/g, '-')}`, + username: turn.speaker, + role, + content, + }); + messageIdByDiaId.set(normalizeDiaId(turn.dia_id), messageId); + memoryService.storeSemanticMemory({ + sessionId, + role, + source: 'locomo-retrieval', + scope: 'episodic', + metadata: { + sampleId: params.sample.sample_id, + diaId: turn.dia_id, + speaker: turn.speaker, + speakerA, + speakerB, + }, + content, + confidence: 1, + sourceMessageId: messageId, + }); + } + + return { + session, + messageIdByDiaId, + }; +} + +function resolveRetrievalAgentId(params: { + sample: LocomoSample; + runTag: string; + agentMode: LocomoAgentMode; + runtime: LocomoGatewayRuntime; +}): string { + if (params.agentMode === 'current-agent') { + return params.runtime.profile.agentId || 'main'; + } + return buildConversationAgentId(params.sample.sample_id, params.runTag); +} + +function formatRetrievalMemoryTurn( + turn: LocomoTurn & { dateTime?: string }, +): string { + const datePrefix = turn.dateTime ? 
`DATE: ${turn.dateTime}\n` : ''; + return `${datePrefix}${formatConversationTurn(turn).trim()}`.trim(); +} + +function budgetTruncateSemanticMemories( + memories: SemanticMemoryEntry[], + budgetTokens: number, +): SemanticMemoryEntry[] { + const selected: SemanticMemoryEntry[] = []; + let totalTokens = 0; + + for (const memory of memories) { + const content = String(memory.content || '').trim(); + if (!content) continue; + const contentTokens = estimateTokenCount(content); + if (totalTokens + contentTokens > budgetTokens && selected.length > 0) { + break; + } + selected.push(memory); + totalTokens += contentTokens; + } + + return selected; +} + +function computeRetrievalHitRate(params: { + sample: LocomoSample; + evidence: string[]; + retrievedContent: string; +}): number { + const expandedEvidenceIds = expandEvidenceIds(params.evidence); + if (expandedEvidenceIds.length === 0) return 1; + + const turnMap = new Map(); + for (const turn of flattenConversationTurns(params.sample.conversation)) { + turnMap.set(turn.dia_id, turn); + } + + const lowerRetrieved = String(params.retrievedContent || '').toLowerCase(); + let found = 0; + let resolvable = 0; + for (const evidenceId of expandedEvidenceIds) { + const turn = turnMap.get(evidenceId); + if (!turn) { + console.log( + `WARNING: dia_id "${evidenceId}" not found in sample ${params.sample.sample_id}`, + ); + continue; + } + resolvable += 1; + if (turn.text.length < 20) { + console.log( + `WARNING: short turn text (${turn.text.length} chars) for dia_id ${evidenceId}: ${JSON.stringify(turn.text)}`, + ); + } + if (lowerRetrieved.includes(turn.text.toLowerCase())) { + found += 1; + } + } + if (resolvable === 0) return 0; + return found / resolvable; +} + +function expandEvidenceIds(evidence: string[]): string[] { + const expanded: string[] = []; + for (const entry of evidence || []) { + const segments = String(entry || '') + .split(';') + .flatMap((part) => part.split(/\s+/g)) + .map((part) => normalizeDiaId(part)) + 
.filter((part) => /^D\d+:\d+$/i.test(part)); + expanded.push(...segments); + } + return expanded; +} + +function normalizeDiaId(value: string): string { + const match = /^D(\d+):(\d+)$/i.exec(String(value || '').trim()); + if (!match) return String(value || '').trim(); + return `D${Number.parseInt(match[1], 10)}:${Number.parseInt(match[2], 10)}`; +} + +function computeContextTokenF1(prediction: string, answer: string): number { + const predictionTokens = tokenizeContextText(prediction); + const answerTokens = tokenizeContextText(answer); + if (predictionTokens.length === 0 && answerTokens.length === 0) return 1; + if (predictionTokens.length === 0 || answerTokens.length === 0) return 0; + + const predictionCounts = new Map(); + const answerCounts = new Map(); + for (const token of predictionTokens) { + predictionCounts.set(token, (predictionCounts.get(token) || 0) + 1); + } + for (const token of answerTokens) { + answerCounts.set(token, (answerCounts.get(token) || 0) + 1); + } + + let matches = 0; + for (const [token, count] of predictionCounts.entries()) { + const answerCount = answerCounts.get(token) || 0; + matches += Math.min(count, answerCount); + } + if (matches === 0) return 0; + const precision = matches / predictionTokens.length; + const recall = matches / answerTokens.length; + return (2 * precision * recall) / (precision + recall); +} + +function tokenizeContextText(value: string): string[] { + return String(value || '') + .toLowerCase() + .trim() + .split(/\s+/) + .filter(Boolean); +} + +export const testOnlyLocomoNativeRetrieval = { + computeContextTokenF1, + computeRetrievalHitRate, + expandEvidenceIds, + normalizeDiaId, +}; + +async function requestModelAnswer(params: { + runtime: LocomoGatewayRuntime; + model: string; + prompt: string; + user: string; +}): Promise<{ content: string; usage: LocomoChatCompletionUsage | null }> { + const url = `${params.runtime.baseUrl.replace(/\/+$/, '')}/chat/completions`; + const response = await fetchWithTimeout( + 
url, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${params.runtime.apiKey}`, + }, + body: JSON.stringify({ + model: params.model, + user: params.user, + messages: [{ role: 'user', content: params.prompt }], + }), + }, + LOCOMO_MODEL_CALL_TIMEOUT_MS, + 'LOCOMO model call', + ); + + if (!response.ok) { + throw new Error(`LOCOMO model call failed with HTTP ${response.status}.`); + } + + const payload = (await response.json()) as { + choices?: Array<{ + message?: { + content?: unknown; + }; + }>; + usage?: LocomoChatCompletionUsage; + }; + return { + content: String(payload.choices?.[0]?.message?.content || '').trim(), + usage: + payload.usage && typeof payload.usage === 'object' ? payload.usage : null, + }; +} + +function mergeUsage( + target: LocomoTokenUsage, + usage: LocomoChatCompletionUsage | null, +): void { + if (!usage) return; + const promptTokens = + typeof usage.prompt_tokens === 'number' && + Number.isFinite(usage.prompt_tokens) + ? usage.prompt_tokens + : 0; + const completionTokens = + typeof usage.completion_tokens === 'number' && + Number.isFinite(usage.completion_tokens) + ? usage.completion_tokens + : 0; + const totalTokens = + typeof usage.total_tokens === 'number' && + Number.isFinite(usage.total_tokens) + ? 
usage.total_tokens + : promptTokens + completionTokens; + target.promptTokens += promptTokens; + target.completionTokens += completionTokens; + target.totalTokens += totalTokens; + target.responsesWithUsage += 1; +} + +function normalizeModelPrediction(value: string): string { + return String(value || '') + .replace(/^short answer:\s*/i, '') + .split('\n')[0] + .trim() + .replace(/^["']+|["']+$/g, ''); +} + +function normalizeCategoryFivePrediction( + value: string, + answerKey: Record<'a' | 'b', string> | null, +): string { + const normalized = normalizeModelPrediction(value).toLowerCase(); + if ( + normalized.includes('no information available') || + normalized.includes('not mentioned') + ) { + return 'Not mentioned in the conversation'; + } + if (answerKey && /^\(?a\)?$/.test(normalized)) { + return answerKey.a; + } + if (answerKey && /^\(?b\)?$/.test(normalized)) { + return answerKey.b; + } + return normalized; +} + +function scoreLocomoAnswer(qa: LocomoQA, prediction: string): number { + return scoreOfficialLocomoAnswer({ + category: qa.category, + prediction, + answer: answerToString(qa), + }); +} + +function answerToString(qa: LocomoQA): string { + if (typeof qa.answer === 'string') return qa.answer; + if (typeof qa.answer === 'number' && Number.isFinite(qa.answer)) { + return String(qa.answer); + } + if (qa.answer && typeof qa.answer === 'object') { + return JSON.stringify(qa.answer); + } + return String(qa.adversarial_answer || '').trim(); +} + +function estimateTokenCount(text: string): number { + return Math.max(1, Math.ceil(String(text || '').length / 4)); +} + +function roundMetric(value: number): number { + if (!Number.isFinite(value)) return 0; + return Math.round(value * 1000) / 1000; +} + +function hashText(value: string): number { + let hash = 2166136261; + for (let index = 0; index < value.length; index += 1) { + hash ^= value.charCodeAt(index); + hash = Math.imul(hash, 16777619); + } + return hash >>> 0; +} + +function formatLocomoCategoryLine( 
+ category: string, + aggregate: LocomoCategoryAggregate, + mode: LocomoEvaluationMode, +): string { + if (mode === 'retrieval') { + return `cat${category.padEnd(4)} Hit ${aggregate.meanScore.toFixed(3).padEnd(6)} F1 ${String( + (aggregate.contextF1 ?? 0).toFixed(3), + ).padEnd(6)} Q ${String(aggregate.questionCount)}`; + } + return `cat${category.padEnd(4)} Score ${aggregate.meanScore.toFixed(3).padEnd(6)} Q ${String(aggregate.questionCount)}`; +} + +function printSummaryTable(summary: LocomoRunSummary): void { + console.log(''); + console.log( + summary.mode === 'retrieval' + ? 'Category Hit F1 Questions' + : 'Category Score Questions', + ); + for (const [category, aggregate] of Object.entries(summary.categories).sort( + ([left], [right]) => Number(left) - Number(right), + )) { + console.log(formatLocomoCategoryLine(category, aggregate, summary.mode)); + } + console.log(''); + console.log( + summary.mode === 'retrieval' + ? `Hit rate: ${summary.overallScore.toFixed(3)}` + : `Overall score: ${summary.overallScore.toFixed(3)}`, + ); + if (summary.mode === 'retrieval' && summary.contextF1 != null) { + console.log(`Context F1: ${summary.contextF1.toFixed(3)}`); + } + console.log(`Questions: ${summary.questionCount}`); + if (summary.tokenUsage) { + console.log(`Prompt tokens: ${summary.tokenUsage.promptTokens}`); + console.log(`Completion tokens: ${summary.tokenUsage.completionTokens}`); + console.log(`Total tokens: ${summary.tokenUsage.totalTokens}`); + } + console.log(`Predictions JSON: ${summary.predictionsPath}`); + console.log(`Result JSON: ${summary.resultPath}`); +} + +export const testOnlyLocomoNative = { + flattenConversationTurns, +}; diff --git a/src/evals/locomo-official-scoring.ts b/src/evals/locomo-official-scoring.ts new file mode 100644 index 00000000..d6ef1589 --- /dev/null +++ b/src/evals/locomo-official-scoring.ts @@ -0,0 +1,111 @@ +import { stemmer } from 'stemmer'; + +const PUNCTUATION_REGEX = /[!"#$%&'()*+./:;<=>?@[\\\]^_`{|}~-]/g; +const 
ARTICLES_REGEX = /\b(a|an|the|and)\b/g; + +// Direct port of LoCoMo QA scoring semantics from task_eval/evaluation.py. +export function scoreOfficialLocomoAnswer(params: { + category: number; + prediction: string; + answer: string; +}): number { + const { category, prediction, answer } = params; + if (category === 1) { + return roundMetric(multiAnswerF1(prediction, answer)); + } + if (category === 2 || category === 4) { + return roundMetric(singleAnswerF1(prediction, answer)); + } + if (category === 3) { + return roundMetric(singleAnswerF1(prediction, answer.split(';')[0] || '')); + } + if (category === 5) { + const normalized = prediction.toLowerCase(); + return normalized.includes('no information available') || + normalized.includes('not mentioned') + ? 1 + : 0; + } + throw new Error(`Unsupported LOCOMO question category: ${category}`); +} + +function normalizeAnswer(value: string): string { + return String(value || '') + .replace(/,/g, '') + .toLowerCase() + .replace(PUNCTUATION_REGEX, '') + .replace(ARTICLES_REGEX, ' ') + .trim() + .split(/\s+/) + .filter(Boolean) + .join(' '); +} + +function stemTokens(value: string): string[] { + return normalizeAnswer(value) + .split(' ') + .filter(Boolean) + .map((token) => stemmer(token)); +} + +function singleAnswerF1(prediction: string, groundTruth: string): number { + const predictionTokens = stemTokens(prediction); + const groundTruthTokens = stemTokens(groundTruth); + if (predictionTokens.length === 0 || groundTruthTokens.length === 0) { + return 0; + } + + const groundTruthCounts = new Map(); + for (const token of groundTruthTokens) { + groundTruthCounts.set(token, (groundTruthCounts.get(token) || 0) + 1); + } + + let commonCount = 0; + for (const token of predictionTokens) { + const remaining = groundTruthCounts.get(token) || 0; + if (remaining <= 0) continue; + commonCount += 1; + groundTruthCounts.set(token, remaining - 1); + } + + if (commonCount === 0) return 0; + const precision = commonCount / 
predictionTokens.length; + const recall = commonCount / groundTruthTokens.length; + return (2 * precision * recall) / (precision + recall); +} + +function multiAnswerF1(prediction: string, groundTruth: string): number { + const predictions = splitAnswers(prediction); + const groundTruths = splitAnswers(groundTruth); + if (predictions.length === 0 || groundTruths.length === 0) { + return 0; + } + + let total = 0; + for (const truth of groundTruths) { + let best = 0; + for (const candidate of predictions) { + best = Math.max(best, singleAnswerF1(candidate, truth)); + } + total += best; + } + return total / groundTruths.length; +} + +function splitAnswers(value: string): string[] { + return String(value || '') + .split(',') + .map((entry) => entry.trim()) + .filter(Boolean); +} + +function roundMetric(value: number): number { + if (!Number.isFinite(value)) return 0; + return Math.round(value * 1000) / 1000; +} + +export const testOnlyLocomoOfficialScoring = { + normalizeAnswer, + singleAnswerF1, + multiAnswerF1, +}; diff --git a/src/evals/locomo-types.ts b/src/evals/locomo-types.ts new file mode 100644 index 00000000..dd2d3c9c --- /dev/null +++ b/src/evals/locomo-types.ts @@ -0,0 +1,16 @@ +export type LocomoAgentMode = 'conversation-fresh' | 'current-agent'; +export const LOCOMO_DATASET_FILENAME = 'locomo10.json'; +export const LOCOMO_SETUP_MARKER = '.hybridclaw-setup-ok'; + +export interface LocomoCategoryAggregate { + meanScore: number; + questionCount: number; + contextF1: number | null; +} + +export interface LocomoTokenUsage { + promptTokens: number; + completionTokens: number; + totalTokens: number; + responsesWithUsage: number; +} diff --git a/src/memory/memory-service.ts b/src/memory/memory-service.ts index ac8df3b5..bc9194c3 100644 --- a/src/memory/memory-service.ts +++ b/src/memory/memory-service.ts @@ -349,21 +349,25 @@ class HashedTokenEmbeddingProvider implements EmbeddingProvider { .slice(0, 256); if (tokens.length === 0) return null; - const vector = 
new Float32Array(this.dimensions); + const vector = Array(this.dimensions).fill(0); for (const token of tokens) { const hash = this.hashToken(token); const index = hash % this.dimensions; const sign = (hash & 1) === 0 ? 1 : -1; - vector[index] += sign * Math.min(4, token.length); + vector[index] = (vector[index] || 0) + sign * Math.min(4, token.length); } let norm = 0; for (let i = 0; i < vector.length; i += 1) { - norm += vector[i] * vector[i]; + const value = vector[i] || 0; + norm += value * value; } if (norm <= Number.EPSILON) return null; const scale = 1 / Math.sqrt(norm); - return Array.from(vector, (value) => value * scale); + for (let i = 0; i < vector.length; i += 1) { + vector[i] = (vector[i] || 0) * scale; + } + return vector; } private hashToken(token: string): number { @@ -660,6 +664,32 @@ export class MemoryService { ); } + storeSemanticMemory(params: { + sessionId: string; + role: string; + source?: string | null; + scope?: string | null; + metadata?: Record<string, unknown> | string | null; + content: string; + confidence?: number; + embedding?: number[] | null; + sourceMessageId?: number | null; + }): number { + const content = params.content.trim(); + if (!content) { + throw new Error('Cannot store empty semantic memory content.'); + } + + return this.backend.storeSemanticMemory({ + ...params, + content, + embedding: + params.embedding === undefined + ? 
this.embeddingProvider.embed(content) + : params.embedding, + }); + } + storeTurn(params: StoreTurnParams): { userMessageId: number; assistantMessageId: number; @@ -689,7 +719,7 @@ export class MemoryService { }; } - this.backend.storeSemanticMemory({ + this.storeSemanticMemory({ sessionId: params.sessionId, role: 'assistant', source: 'conversation', @@ -697,7 +727,6 @@ export class MemoryService { metadata: {}, content: interactionText, confidence: 1, - embedding: this.embeddingProvider.embed(interactionText), sourceMessageId: assistantMessageId, }); diff --git a/tests/cli.test.ts b/tests/cli.test.ts index d3587045..d0e2329d 100644 --- a/tests/cli.test.ts +++ b/tests/cli.test.ts @@ -218,6 +218,30 @@ async function importFreshCli(options?: { expiresAt: number; }; }; + googleWorkspaceStatus?: { + authenticated: boolean; + path: string; + clientConfigured: boolean; + pendingAuthorization: boolean; + refreshTokenConfigured: boolean; + reloginRequired: boolean; + expiresAt: number | null; + scopes: string[]; + }; + googleWorkspaceClientSecretResult?: { + path: string; + clientId: string; + }; + googleWorkspaceStartResult?: { + path: string; + authUrl: string; + redirectUri: string; + }; + googleWorkspaceExchangeResult?: { + path: string; + expiresAt: number; + scopes: string[]; + }; gatewayReachable?: boolean; gatewayStatusReachable?: boolean; gatewayStatusSandboxMode?: 'host' | 'container' | null; @@ -495,6 +519,44 @@ async function importFreshCli(options?: { }, }, ); + const clearGoogleWorkspaceCredentials = vi.fn(() => '/tmp/credentials.json'); + const getGoogleWorkspaceAuthStatus = vi.fn( + () => + options?.googleWorkspaceStatus || { + authenticated: false, + path: '/tmp/credentials.json', + clientConfigured: false, + pendingAuthorization: false, + refreshTokenConfigured: false, + reloginRequired: false, + expiresAt: null, + scopes: [], + }, + ); + const saveGoogleWorkspaceClientSecretFile = vi.fn( + () => + options?.googleWorkspaceClientSecretResult || { + path: 
'/tmp/credentials.json', + clientId: 'client-id.apps.googleusercontent.com', + }, + ); + const startGoogleWorkspaceAuth = vi.fn( + () => + options?.googleWorkspaceStartResult || { + path: '/tmp/credentials.json', + authUrl: + 'https://accounts.google.com/o/oauth2/v2/auth?client_id=client-id.apps.googleusercontent.com', + redirectUri: 'http://localhost:1', + }, + ); + const exchangeGoogleWorkspaceAuthCode = vi.fn( + async () => + options?.googleWorkspaceExchangeResult || { + path: '/tmp/credentials.json', + expiresAt: Date.parse('2026-03-13T12:00:00.000Z'), + scopes: ['https://www.googleapis.com/auth/calendar'], + }, + ); const printUpdateUsage = vi.fn(); const runUpdateCommand = vi.fn(); const runDoctorCli = vi.fn(async () => 0); @@ -1080,6 +1142,13 @@ async function importFreshCli(options?: { getCodexAuthStatus, loginCodexInteractive, })); + vi.doMock('../src/auth/google-workspace-auth.ts', () => ({ + clearGoogleWorkspaceCredentials, + exchangeGoogleWorkspaceAuthCode, + getGoogleWorkspaceAuthStatus, + saveGoogleWorkspaceClientSecretFile, + startGoogleWorkspaceAuth, + })); vi.doMock('../src/config/cli-flags.ts', () => ({ findUnsupportedGatewayLifecycleFlag: vi.fn(() => null), parseGatewayFlags: vi.fn(() => ({ @@ -1269,10 +1338,15 @@ async function importFreshCli(options?: { cli, clearHybridAICredentials, clearCodexCredentials, + clearGoogleWorkspaceCredentials, + exchangeGoogleWorkspaceAuthCode, getCodexAuthStatus, + getGoogleWorkspaceAuthStatus, getHybridAIAuthStatus, loginCodexInteractive, loginHybridAIInteractive, + saveGoogleWorkspaceClientSecretFile, + startGoogleWorkspaceAuth, printUpdateUsage, runUpdateCommand, runDoctorCli, @@ -1348,6 +1422,7 @@ afterEach(() => { vi.unstubAllGlobals(); vi.doUnmock('../src/auth/hybridai-auth.ts'); vi.doUnmock('../src/auth/codex-auth.ts'); + vi.doUnmock('../src/auth/google-workspace-auth.ts'); vi.doUnmock('../src/config/cli-flags.ts'); vi.doUnmock('../src/config/config.ts'); vi.doUnmock('../src/config/runtime-config.ts'); @@ 
-3698,6 +3773,32 @@ describe('CLI hybridai commands', () => { ); }); + it('routes auth login google-workspace through the stepwise OAuth flow', async () => { + const { + cli, + exchangeGoogleWorkspaceAuthCode, + saveGoogleWorkspaceClientSecretFile, + startGoogleWorkspaceAuth, + } = await importFreshCli(); + const logSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + await cli.main([ + 'auth', + 'login', + 'google-workspace', + '--client-secret', + '/tmp/client_secret.json', + '--auth-url', + ]); + + expect(saveGoogleWorkspaceClientSecretFile).toHaveBeenCalledWith( + '/tmp/client_secret.json', + ); + expect(startGoogleWorkspaceAuth).toHaveBeenCalled(); + expect(exchangeGoogleWorkspaceAuthCode).not.toHaveBeenCalled(); + expect(logSpy).toHaveBeenCalledWith('Google Workspace OAuth prepared.'); + }); + it('routes auth login local to local backend configuration', async () => { const { cli, updateRuntimeConfig } = await importFreshCli(); const logSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); @@ -3830,6 +3931,44 @@ describe('CLI hybridai commands', () => { expect(logSpy).toHaveBeenCalledWith('Tenant ID: teams-tenant-id'); }); + it('prints Google Workspace status through auth status', async () => { + const { cli } = await importFreshCli({ + googleWorkspaceStatus: { + authenticated: true, + path: '/tmp/credentials.json', + clientConfigured: true, + pendingAuthorization: false, + refreshTokenConfigured: true, + reloginRequired: false, + expiresAt: Date.parse('2026-03-13T12:00:00.000Z'), + scopes: [ + 'https://www.googleapis.com/auth/calendar', + 'https://www.googleapis.com/auth/drive.readonly', + ], + }, + }); + const logSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + await cli.main(['auth', 'status', 'google-workspace']); + + expect(logSpy).toHaveBeenCalledWith('Authenticated: yes'); + expect(logSpy).toHaveBeenCalledWith('Client secret: configured'); + expect(logSpy).toHaveBeenCalledWith('Refresh token: configured'); + 
expect(logSpy).toHaveBeenCalledWith('Granted scopes: 2'); + }); + + it('clears Google Workspace credentials through auth logout', async () => { + const { cli, clearGoogleWorkspaceCredentials } = await importFreshCli(); + const logSpy = vi.spyOn(console, 'log').mockImplementation(() => {}); + + await cli.main(['auth', 'logout', 'google-workspace']); + + expect(clearGoogleWorkspaceCredentials).toHaveBeenCalled(); + expect(logSpy).toHaveBeenCalledWith( + 'Cleared Google Workspace OAuth token in /tmp/credentials.json.', + ); + }); + it('configures OpenRouter from auth login with --api-key', async () => { const { cli, diff --git a/tests/command-registry.test.ts b/tests/command-registry.test.ts index c900ad01..4c666494 100644 --- a/tests/command-registry.test.ts +++ b/tests/command-registry.test.ts @@ -262,6 +262,9 @@ test('registers eval as a local slash/text command', async () => { expect.objectContaining({ label: '/eval list', }), + expect.objectContaining({ + label: '/eval locomo', + }), expect.objectContaining({ label: '/eval swebench-verified', }), @@ -273,6 +276,10 @@ test('registers eval as a local slash/text command', async () => { 'eval', 'gaia', ]); + expect(mapCanonicalCommandToGatewayArgs(['eval', 'locomo'])).toEqual([ + 'eval', + 'locomo', + ]); }); test('registers config as a local slash/text command', async () => { diff --git a/tests/eval-command.test.ts b/tests/eval-command.test.ts index 063569a6..c6d907c3 100644 --- a/tests/eval-command.test.ts +++ b/tests/eval-command.test.ts @@ -61,6 +61,13 @@ afterEach(() => { } }); +function quoteForShell(value: string): string { + if (process.platform === 'win32') { + return `"${value.replace(/"/g, '""')}"`; + } + return `'${value.replace(/'/g, `'\\''`)}'`; +} + function installTau2Layout(dataDir: string): void { const installDir = path.join(dataDir, 'evals', 'tau2-bench'); fs.mkdirSync(path.join(installDir, '.git'), { recursive: true }); @@ -80,6 +87,179 @@ function installTau2Layout(dataDir: string): void { 
fs.writeFileSync(path.join(installDir, '.venv', 'bin', 'python'), ''); } +function installLocomoLayout(dataDir: string): void { + const installDir = path.join(dataDir, 'evals', 'locomo'); + fs.mkdirSync(path.join(installDir, 'data'), { recursive: true }); + fs.writeFileSync( + path.join(installDir, 'data', 'locomo10.json'), + JSON.stringify([]), + ); + fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), ''); +} + +function writeLocomoResult( + jobDir: string, + result: { + mode?: 'qa' | 'retrieval'; + sampleCount?: number; + questionCount?: number; + budgetTokens?: number; + overallScore?: number; + contextF1?: number | null; + model?: string; + tokenUsage?: { + promptTokens: number; + completionTokens: number; + totalTokens: number; + responsesWithUsage: number; + }; + categories?: Record< + string, + { + meanScore: number; + questionCount: number; + contextF1?: number | null; + } + >; + }, +): void { + fs.mkdirSync(jobDir, { recursive: true }); + fs.writeFileSync(path.join(jobDir, 'predictions.json'), JSON.stringify([])); + fs.writeFileSync( + path.join(jobDir, 'result.json'), + JSON.stringify( + { + suite: 'locomo', + mode: result.mode ?? 'qa', + dataset: 'locomo10.json', + generatedAt: '2026-04-10T08:00:00.000Z', + model: + result.mode === 'retrieval' + ? null + : (result.model ?? 'hybridai/gpt-4.1-mini'), + sampleCount: result.sampleCount ?? 2, + questionCount: result.questionCount ?? 40, + budgetTokens: result.budgetTokens ?? 4000, + overallScore: result.overallScore ?? 0.537, + contextF1: + result.mode === 'retrieval' ? (result.contextF1 ?? 0.113) : null, + resultPath: path.join(jobDir, 'result.json'), + predictionsPath: path.join(jobDir, 'predictions.json'), + categories: result.categories ?? { + '1': { + meanScore: 0.625, + questionCount: 16, + contextF1: result.mode === 'retrieval' ? 0.125 : null, + }, + '2': { + meanScore: 0.5, + questionCount: 8, + contextF1: result.mode === 'retrieval' ? 
0.1 : null, + }, + '5': { + meanScore: 0.75, + questionCount: 16, + contextF1: result.mode === 'retrieval' ? 0.05 : null, + }, + }, + tokenUsage: + result.mode === 'retrieval' + ? null + : (result.tokenUsage ?? { + promptTokens: 1200, + completionTokens: 180, + totalTokens: 1380, + responsesWithUsage: 40, + }), + samples: [], + }, + null, + 2, + ), + ); +} + +function writeLocomoProgress( + jobDir: string, + progress: { + mode?: 'qa' | 'retrieval'; + sampleCount?: number; + completedSampleCount?: number; + questionCount?: number; + completedQuestionCount?: number; + budgetTokens?: number; + overallScore?: number; + contextF1?: number | null; + model?: string; + currentSampleId?: string | null; + currentSampleQuestionCount?: number | null; + currentSampleQuestionTotal?: number | null; + tokenUsage?: { + promptTokens: number; + completionTokens: number; + totalTokens: number; + responsesWithUsage: number; + }; + categories?: Record< + string, + { + meanScore: number; + questionCount: number; + contextF1?: number | null; + } + >; + }, +): void { + fs.mkdirSync(jobDir, { recursive: true }); + fs.writeFileSync( + path.join(jobDir, 'progress.json'), + JSON.stringify( + { + suite: 'locomo', + mode: progress.mode ?? 'qa', + dataset: 'locomo10.json', + updatedAt: '2026-04-10T08:00:00.000Z', + model: + progress.mode === 'retrieval' + ? null + : (progress.model ?? 'hybridai/gpt-4.1-mini'), + budgetTokens: progress.budgetTokens ?? 4000, + sampleCount: progress.sampleCount ?? 2, + completedSampleCount: progress.completedSampleCount ?? 0, + questionCount: progress.questionCount ?? 20, + completedQuestionCount: progress.completedQuestionCount ?? 7, + overallScore: progress.overallScore ?? 0.429, + contextF1: + progress.mode === 'retrieval' ? (progress.contextF1 ?? 0.091) : null, + currentSampleId: progress.currentSampleId ?? 'conv-26', + currentSampleQuestionCount: progress.currentSampleQuestionCount ?? 7, + currentSampleQuestionTotal: progress.currentSampleQuestionTotal ?? 
20, + progressPath: path.join(jobDir, 'progress.json'), + resultPath: path.join(jobDir, 'result.json'), + predictionsPath: path.join(jobDir, 'predictions.json'), + categories: progress.categories ?? { + '1': { + meanScore: 0.571, + questionCount: 7, + contextF1: progress.mode === 'retrieval' ? 0.111 : null, + }, + }, + tokenUsage: + progress.mode === 'retrieval' + ? null + : (progress.tokenUsage ?? { + promptTokens: 210, + completionTokens: 35, + totalTokens: 245, + responsesWithUsage: 7, + }), + }, + null, + 2, + ), + ); +} + function writeTerminalBenchAgentResult( jobDir: string, taskName: string, @@ -234,6 +414,33 @@ test('shows managed tau2 usage', async () => { ); }); +test('shows managed locomo usage', async () => { + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + + const result = await handleEvalCommand({ + args: ['locomo'], + dataDir: fs.mkdtempSync(path.join(os.tmpdir(), 'hybridclaw-eval-')), + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + if (result.kind !== 'info') { + throw new Error(`Unexpected result kind: ${result.kind}`); + } + expect(result.title).toBe('LOCOMO'); + expect(result.text).toContain('/eval locomo setup'); + expect(result.text).toContain( + '/eval locomo run --budget 4000 --max-questions 20', + ); + expect(result.text).toContain('`--max-questions` for quick smoke runs'); + expect(result.text).toContain( + 'By default, LOCOMO creates one fresh template-seeded agent per conversation sample.', + ); +}); + test('starts detached tau2 setup', async () => { const dataDir = fs.mkdtempSync( path.join(os.tmpdir(), 'hybridclaw-eval-run-'), @@ -297,6 +504,186 @@ test('reports non-terminal suites as not implemented yet', async () => { expect(spawnMock).not.toHaveBeenCalled(); }); +test('starts detached locomo setup', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 
'hybridclaw-eval-run-'), + ); + spawnMock.mockReturnValue({ + pid: 6791, + unref: vi.fn(), + on: vi.fn(), + off: vi.fn(), + }); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await handleEvalCommand({ + args: ['locomo', 'setup'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + if (result.kind !== 'info') { + throw new Error(`Unexpected result kind: ${result.kind}`); + } + expect(result.title).toBe('LOCOMO Setup Started'); + expect(result.text).toContain('Command: locomo setup'); + expect(result.text).toContain( + 'Setup strategy: native HybridClaw LOCOMO harness with official locomo10 dataset download.', + ); + expect(result.text).toContain('Use `/eval locomo status`'); + expect(result.text).toContain('Use `/eval locomo results`'); + + const [, shellArgs] = spawnMock.mock.calls[0] as [string, string[]]; + expect(shellArgs[1]).toContain( + quoteForShell( + path.join(process.cwd(), 'node_modules', 'tsx', 'dist', 'cli.mjs'), + ), + ); + expect(shellArgs[1]).toContain( + quoteForShell(path.join(process.cwd(), 'src', 'cli.ts')), + ); + expect(shellArgs[1]).toContain('__eval-locomo-native'); + expect(shellArgs[1]).toContain('setup'); + expect(shellArgs[1]).toContain('--install-dir'); +}); + +test('runs managed locomo with question cap flag', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + spawnMock.mockReturnValue({ + pid: 6798, + unref: vi.fn(), + on: vi.fn(), + off: vi.fn(), + }); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await handleEvalCommand({ + args: ['locomo', 'run', '--budget', '4000', '--max-questions', '20'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 
'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + if (result.kind !== 'info') { + throw new Error(`Unexpected result kind: ${result.kind}`); + } + expect(result.title).toBe('LOCOMO Run Started'); + expect(result.text).toContain( + 'Command: locomo run --budget 4000 --max-questions 20', + ); + expect(result.text).toContain( + 'Use `/eval locomo status` and `/eval locomo results` to follow this run.', + ); + + const [, shellArgs] = spawnMock.mock.calls[0] as [string, string[]]; + expect(shellArgs[1]).toContain( + quoteForShell( + path.join(process.cwd(), 'node_modules', 'tsx', 'dist', 'cli.mjs'), + ), + ); + expect(shellArgs[1]).toContain( + quoteForShell(path.join(process.cwd(), 'src', 'cli.ts')), + ); + expect(shellArgs[1]).toContain('__eval-locomo-native'); + expect(shellArgs[1]).toContain('run'); + expect(shellArgs[1]).toContain('--install-dir'); + expect(shellArgs[1]).toContain('--agent-mode'); + expect(shellArgs[1]).toContain('conversation-fresh'); + expect(shellArgs[1]).toContain('--budget'); + expect(shellArgs[1]).toContain('--max-questions'); +}); + +test('runs managed locomo with current agent override', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + spawnMock.mockReturnValue({ + pid: 6798, + unref: vi.fn(), + on: vi.fn(), + off: vi.fn(), + }); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + await handleEvalCommand({ + args: ['--current-agent', 'locomo', 'run', '--max-questions', '20'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'charly', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + const [, shellArgs] = spawnMock.mock.calls[0] as [string, string[]]; + expect(shellArgs[1]).toContain('--agent-mode'); + expect(shellArgs[1]).toContain('current-agent'); +}); + +test('rejects fresh-agent override for managed locomo', async () => { + const dataDir = 
fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await handleEvalCommand({ + args: ['--fresh-agent', 'locomo', 'run', '--max-questions', '20'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'charly', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('error'); + if (result.kind !== 'error') { + throw new Error(`Unexpected result kind: ${result.kind}`); + } + expect(result.title).toBe('LOCOMO Run'); + expect(result.text).toContain( + 'Native LOCOMO does not support `--fresh-agent`.', + ); + expect(spawnMock).not.toHaveBeenCalled(); +}); + +test('runs managed locomo with retrieval mode', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + spawnMock.mockReturnValue({ + pid: 6798, + unref: vi.fn(), + on: vi.fn(), + off: vi.fn(), + }); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + await handleEvalCommand({ + args: ['locomo', 'run', '--mode', 'retrieval', '--max-questions', '20'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + const [, shellArgs] = spawnMock.mock.calls[0] as [string, string[]]; + expect(shellArgs[1]).toContain('--mode'); + expect(shellArgs[1]).toContain('retrieval'); +}); + test('starts detached terminal-bench setup', async () => { const dataDir = fs.mkdtempSync( path.join(os.tmpdir(), 'hybridclaw-eval-run-'), @@ -381,6 +768,14 @@ test('runs managed terminal-bench with native HybridClaw runner defaults', async ); const [, shellArgs] = spawnMock.mock.calls[0] as [string, string[]]; + expect(shellArgs[1]).toContain( + quoteForShell( + path.join(process.cwd(), 'node_modules', 'tsx', 'dist', 'cli.mjs'), + ), + ); 
+ expect(shellArgs[1]).toContain( + quoteForShell(path.join(process.cwd(), 'src', 'cli.ts')), + ); expect(shellArgs[1]).toContain('__eval-terminal-bench-native'); expect(shellArgs[1]).toContain('--install-dir'); expect(shellArgs[1]).toContain('--data-dir'); @@ -672,6 +1067,209 @@ test('reports managed suite latest run in status output', async () => { expect(result.text).toContain('Errors: 0'); }); +test('reports locomo latest run in status output', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + const runDir = path.join(dataDir, 'evals', 'eval-locomo-run-abc123'); + const jobDir = path.join(dataDir, 'evals', 'locomo', 'jobs', '2026-04-10'); + fs.mkdirSync(runDir, { recursive: true }); + writeLocomoResult(jobDir, {}); + fs.writeFileSync( + path.join(runDir, 'run.json'), + JSON.stringify( + { + runId: 'eval-locomo-run', + suiteId: 'locomo', + operation: 'run', + pid: 4451, + startedAt: '2026-04-10T08:00:00.000Z', + finishedAt: '2026-04-10T08:01:00.000Z', + exitCode: 0, + cwd: path.join(dataDir, 'evals', 'locomo'), + command: `${process.execPath} ${path.join(process.cwd(), 'dist', 'cli.js')} __eval-locomo-native run --install-dir ${path.join(dataDir, 'evals', 'locomo')} --budget 4000 --num-samples 2`, + displayCommand: 'locomo run --budget 4000 --num-samples 2', + openaiBaseUrl: 'http://127.0.0.1:9090/v1', + model: 'hybridai/gpt-4.1-mini', + baseModel: 'hybridai/gpt-4.1-mini', + authMode: 'loopback', + profile: { + workspaceMode: 'current-agent', + ablateSystemPrompt: false, + includePromptParts: [], + omitPromptParts: [], + }, + stdoutPath: path.join(runDir, 'stdout.log'), + stderrPath: path.join(runDir, 'stderr.log'), + }, + null, + 2, + ), + ); + fs.writeFileSync(path.join(runDir, 'stdout.log'), `Job dir: ${jobDir}\n`); + fs.writeFileSync(path.join(runDir, 'stderr.log'), ''); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await 
handleEvalCommand({ + args: ['locomo', 'status'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + expect(result.text).toContain('Latest run: eval-locomo-run (completed)'); + expect(result.text).toContain('Dataset:'); + expect(result.text).toContain('Questions: 40'); + expect(result.text).toContain('Budget: 4000'); + expect(result.text).toContain('Overall score: 0.537'); + expect(result.text).toContain('cat1: Category 1 | Score 0.625 | Q 16'); + expect(result.text).toContain( + 'Tokens: 1380 total (1200 prompt + 180 completion)', + ); + expect(result.text).toContain('Predictions:'); +}); + +test('reports locomo retrieval latest run in status output', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + const runDir = path.join(dataDir, 'evals', 'eval-locomo-run-abc123'); + const jobDir = path.join(dataDir, 'evals', 'locomo', 'jobs', '2026-04-10'); + fs.mkdirSync(runDir, { recursive: true }); + writeLocomoResult(jobDir, { + mode: 'retrieval', + overallScore: 0.812, + contextF1: 0.143, + categories: { + '1': { + meanScore: 0.875, + questionCount: 16, + contextF1: 0.188, + }, + }, + }); + fs.writeFileSync( + path.join(runDir, 'run.json'), + JSON.stringify( + { + runId: 'eval-locomo-run', + suiteId: 'locomo', + operation: 'run', + pid: 4451, + startedAt: '2026-04-10T08:00:00.000Z', + finishedAt: '2026-04-10T08:01:00.000Z', + exitCode: 0, + cwd: path.join(dataDir, 'evals', 'locomo'), + command: `${process.execPath} ${path.join(process.cwd(), 'dist', 'cli.js')} __eval-locomo-native run --install-dir ${path.join(dataDir, 'evals', 'locomo')} --mode retrieval --budget 4000 --num-samples 2`, + displayCommand: + 'locomo run --mode retrieval --budget 4000 --num-samples 2', + openaiBaseUrl: 'http://127.0.0.1:9090/v1', + model: 'hybridai/gpt-4.1-mini', + 
baseModel: 'hybridai/gpt-4.1-mini', + authMode: 'loopback', + profile: { + workspaceMode: 'current-agent', + ablateSystemPrompt: false, + includePromptParts: [], + omitPromptParts: [], + }, + stdoutPath: path.join(runDir, 'stdout.log'), + stderrPath: path.join(runDir, 'stderr.log'), + }, + null, + 2, + ), + ); + fs.writeFileSync(path.join(runDir, 'stdout.log'), `Job dir: ${jobDir}\n`); + fs.writeFileSync(path.join(runDir, 'stderr.log'), ''); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await handleEvalCommand({ + args: ['locomo', 'status'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + expect(result.text).toContain('Mode: retrieval'); + expect(result.text).toContain('Hit rate: 0.812'); + expect(result.text).toContain('Context F1: 0.143'); + expect(result.text).toContain( + 'cat1: Category 1 | Hit 0.875 | F1 0.188 | Q 16', + ); + expect(result.text).not.toContain('Tokens:'); +}); + +test('reports locomo in-flight progress in status output', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + const runDir = path.join(dataDir, 'evals', 'eval-locomo-run-abc123'); + const jobDir = path.join(dataDir, 'evals', 'locomo', 'jobs', '2026-04-10'); + fs.mkdirSync(runDir, { recursive: true }); + writeLocomoProgress(jobDir, {}); + process.kill = vi.fn(); + fs.writeFileSync( + path.join(runDir, 'run.json'), + JSON.stringify( + { + runId: 'eval-locomo-run', + suiteId: 'locomo', + operation: 'run', + pid: 4451, + startedAt: '2026-04-10T08:00:00.000Z', + cwd: path.join(dataDir, 'evals', 'locomo'), + command: `${process.execPath} ${path.join(process.cwd(), 'dist', 'cli.js')} __eval-locomo-native run --install-dir ${path.join(dataDir, 'evals', 'locomo')} --budget 4000 --max-questions 20`, + displayCommand: 
'locomo run --budget 4000 --max-questions 20', + openaiBaseUrl: 'http://127.0.0.1:9090/v1', + model: 'hybridai/gpt-4.1-mini', + baseModel: 'hybridai/gpt-4.1-mini', + authMode: 'loopback', + profile: { + workspaceMode: 'current-agent', + ablateSystemPrompt: false, + includePromptParts: [], + omitPromptParts: [], + }, + stdoutPath: path.join(runDir, 'stdout.log'), + stderrPath: path.join(runDir, 'stderr.log'), + }, + null, + 2, + ), + ); + fs.writeFileSync(path.join(runDir, 'stdout.log'), `Job dir: ${jobDir}\n`); + fs.writeFileSync(path.join(runDir, 'stderr.log'), ''); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await handleEvalCommand({ + args: ['locomo', 'status'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + expect(result.text).toContain('Latest run: eval-locomo-run (running)'); + expect(result.text).toContain('Completed samples: 0/2'); + expect(result.text).toContain('Questions: 7/20'); + expect(result.text).toContain('Score so far: 0.429'); + expect(result.text).toContain('Current sample: conv-26'); + expect(result.text).toContain('Current sample questions: 7/20'); + expect(result.text).toContain('Progress JSON:'); +}); + test('shows generic managed suite setup logs in results', async () => { const dataDir = fs.mkdtempSync( path.join(os.tmpdir(), 'hybridclaw-eval-run-'), @@ -732,6 +1330,287 @@ test('shows generic managed suite setup logs in results', async () => { expect(result.text).not.toContain('docker check pending'); }); +test('shows locomo run summary in results when a run exists', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + const runDir = path.join(dataDir, 'evals', 'eval-locomo-run-abc123'); + const jobDir = path.join(dataDir, 'evals', 'locomo', 'jobs', '2026-04-10'); 
+ fs.mkdirSync(runDir, { recursive: true }); + writeLocomoResult(jobDir, {}); + fs.writeFileSync( + path.join(runDir, 'run.json'), + JSON.stringify( + { + runId: 'eval-locomo-run', + suiteId: 'locomo', + operation: 'run', + pid: 4452, + startedAt: '2026-04-10T08:00:00.000Z', + finishedAt: '2026-04-10T08:01:00.000Z', + exitCode: 0, + cwd: path.join(dataDir, 'evals', 'locomo'), + command: 'locomo run --budget 4000 --num-samples 2', + displayCommand: 'locomo run --budget 4000 --num-samples 2', + openaiBaseUrl: 'http://127.0.0.1:9090/v1', + model: 'hybridai/gpt-4.1-mini', + baseModel: 'hybridai/gpt-4.1-mini', + authMode: 'loopback', + profile: { + workspaceMode: 'current-agent', + ablateSystemPrompt: false, + includePromptParts: [], + omitPromptParts: [], + }, + stdoutPath: path.join(runDir, 'stdout.log'), + stderrPath: path.join(runDir, 'stderr.log'), + }, + null, + 2, + ), + ); + fs.writeFileSync(path.join(runDir, 'stdout.log'), `Job dir: ${jobDir}\n`); + fs.writeFileSync(path.join(runDir, 'stderr.log'), ''); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await handleEvalCommand({ + args: ['locomo', 'results'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + expect(result.title).toBe('LOCOMO Results'); + expect(result.text).toMatch(/Evaluated model\s+hybridai\/gpt-4\.1-mini/); + expect(result.text).toMatch(/Dataset\s+locomo10\.json/); + expect(result.text).toMatch(/Samples\s+2/); + expect(result.text).toMatch(/Questions\s+40/); + expect(result.text).toMatch(/Budget\s+4000/); + expect(result.text).toMatch(/Overall score\s+0\.537/); + expect(result.text).toMatch(/cat1\s+Category 1 \| Score 0\.625 \| Q 16/); + expect(result.text).toMatch( + /Tokens\s+1380 total \(1200 prompt \+ 180 completion\)/, + ); + expect(result.text).toContain('Predictions JSON'); + 
expect(result.text).toContain('Result JSON'); +}); + +test('shows locomo retrieval summary in results when a run exists', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + const runDir = path.join(dataDir, 'evals', 'eval-locomo-run-abc123'); + const jobDir = path.join(dataDir, 'evals', 'locomo', 'jobs', '2026-04-10'); + fs.mkdirSync(runDir, { recursive: true }); + writeLocomoResult(jobDir, { + mode: 'retrieval', + overallScore: 0.812, + contextF1: 0.143, + categories: { + '1': { + meanScore: 0.875, + questionCount: 16, + contextF1: 0.188, + }, + }, + }); + fs.writeFileSync( + path.join(runDir, 'run.json'), + JSON.stringify( + { + runId: 'eval-locomo-run', + suiteId: 'locomo', + operation: 'run', + pid: 4452, + startedAt: '2026-04-10T08:00:00.000Z', + finishedAt: '2026-04-10T08:01:00.000Z', + exitCode: 0, + cwd: path.join(dataDir, 'evals', 'locomo'), + command: 'locomo run --mode retrieval --budget 4000 --num-samples 2', + displayCommand: + 'locomo run --mode retrieval --budget 4000 --num-samples 2', + openaiBaseUrl: 'http://127.0.0.1:9090/v1', + model: 'hybridai/gpt-4.1-mini', + baseModel: 'hybridai/gpt-4.1-mini', + authMode: 'loopback', + profile: { + workspaceMode: 'current-agent', + ablateSystemPrompt: false, + includePromptParts: [], + omitPromptParts: [], + }, + stdoutPath: path.join(runDir, 'stdout.log'), + stderrPath: path.join(runDir, 'stderr.log'), + }, + null, + 2, + ), + ); + fs.writeFileSync(path.join(runDir, 'stdout.log'), `Job dir: ${jobDir}\n`); + fs.writeFileSync(path.join(runDir, 'stderr.log'), ''); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await handleEvalCommand({ + args: ['locomo', 'results'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + 
expect(result.title).toBe('LOCOMO Results'); + expect(result.text).toMatch(/Mode\s+retrieval/); + expect(result.text).not.toMatch(/Evaluated model\s+/); + expect(result.text).toMatch(/Hit rate\s+0\.812/); + expect(result.text).toMatch(/Context F1\s+0\.143/); + expect(result.text).toMatch( + /cat1\s+Category 1 \| Hit 0\.875 \| F1 0\.188 \| Q 16/, + ); + expect(result.text).not.toContain('Tokens'); + expect(result.text).toContain('Predictions JSON'); +}); + +test('logs debug when locomo result json is malformed', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + const runDir = path.join(dataDir, 'evals', 'eval-locomo-run-badjson-abc123'); + const jobDir = path.join(dataDir, 'evals', 'locomo', 'jobs', '2026-04-10'); + fs.mkdirSync(runDir, { recursive: true }); + fs.mkdirSync(jobDir, { recursive: true }); + fs.writeFileSync( + path.join(jobDir, 'result.json'), + '{not valid json', + 'utf-8', + ); + fs.writeFileSync( + path.join(runDir, 'run.json'), + JSON.stringify( + { + runId: 'eval-locomo-run-badjson', + suiteId: 'locomo', + operation: 'run', + pid: 4452, + startedAt: '2026-04-10T08:00:00.000Z', + finishedAt: '2026-04-10T08:01:00.000Z', + exitCode: 0, + cwd: path.join(dataDir, 'evals', 'locomo'), + command: 'locomo run --budget 4000 --max-questions 20', + displayCommand: 'locomo run --budget 4000 --max-questions 20', + openaiBaseUrl: 'http://127.0.0.1:9090/v1', + model: 'hybridai/gpt-4.1-mini', + baseModel: 'hybridai/gpt-4.1-mini', + authMode: 'loopback', + profile: { + workspaceMode: 'current-agent', + ablateSystemPrompt: false, + includePromptParts: [], + omitPromptParts: [], + }, + stdoutPath: path.join(runDir, 'stdout.log'), + stderrPath: path.join(runDir, 'stderr.log'), + }, + null, + 2, + ), + ); + fs.writeFileSync(path.join(runDir, 'stdout.log'), `Job dir: ${jobDir}\n`); + fs.writeFileSync(path.join(runDir, 'stderr.log'), ''); + + const { logger } = await 
import('../src/logger.ts'); + const debugSpy = vi.spyOn(logger, 'debug').mockImplementation(() => logger); + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await handleEvalCommand({ + args: ['locomo', 'results'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + expect(result.title).toBe('LOCOMO Results'); + expect(debugSpy).toHaveBeenCalledWith( + expect.objectContaining({ + runId: 'eval-locomo-run-badjson', + resultPath: path.join(jobDir, 'result.json'), + }), + 'Failed to parse LOCOMO result summary', + ); +}); + +test('shows locomo run progress in results while a run is active', async () => { + const dataDir = fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-eval-run-'), + ); + installLocomoLayout(dataDir); + const runDir = path.join(dataDir, 'evals', 'eval-locomo-run-abc123'); + const jobDir = path.join(dataDir, 'evals', 'locomo', 'jobs', '2026-04-10'); + fs.mkdirSync(runDir, { recursive: true }); + writeLocomoProgress(jobDir, {}); + process.kill = vi.fn(); + fs.writeFileSync( + path.join(runDir, 'run.json'), + JSON.stringify( + { + runId: 'eval-locomo-run', + suiteId: 'locomo', + operation: 'run', + pid: 4452, + startedAt: '2026-04-10T08:00:00.000Z', + cwd: path.join(dataDir, 'evals', 'locomo'), + command: 'locomo run --budget 4000 --max-questions 20', + displayCommand: 'locomo run --budget 4000 --max-questions 20', + openaiBaseUrl: 'http://127.0.0.1:9090/v1', + model: 'hybridai/gpt-4.1-mini', + baseModel: 'hybridai/gpt-4.1-mini', + authMode: 'loopback', + profile: { + workspaceMode: 'current-agent', + ablateSystemPrompt: false, + includePromptParts: [], + omitPromptParts: [], + }, + stdoutPath: path.join(runDir, 'stdout.log'), + stderrPath: path.join(runDir, 'stderr.log'), + }, + null, + 2, + ), + ); + fs.writeFileSync(path.join(runDir, 'stdout.log'), `Job dir: 
${jobDir}\n`); + fs.writeFileSync(path.join(runDir, 'stderr.log'), ''); + + const { handleEvalCommand } = await import('../src/evals/eval-command.ts'); + const result = await handleEvalCommand({ + args: ['locomo', 'results'], + dataDir, + gatewayBaseUrl: 'http://127.0.0.1:9090', + webApiToken: '', + effectiveAgentId: 'main', + effectiveModel: 'hybridai/gpt-4.1-mini', + }); + + expect(result.kind).toBe('info'); + expect(result.title).toBe('LOCOMO Results'); + expect(result.text).toMatch(/Status\s+running/); + expect(result.text).toMatch(/Questions\s+7\/20/); + expect(result.text).toMatch(/Completed samples\s+0\/2/); + expect(result.text).toMatch(/Score so far\s+0\.429/); + expect(result.text).toMatch(/Current sample\s+conv-26/); + expect(result.text).toMatch(/Current sample questions\s+7\/20/); + expect(result.text).toContain('Progress JSON'); +}); + test('shows managed suite run summary in results when a run exists', async () => { const dataDir = fs.mkdtempSync( path.join(os.tmpdir(), 'hybridclaw-eval-run-'), diff --git a/tests/gateway-service.eval-command.test.ts b/tests/gateway-service.eval-command.test.ts index 1aa1ad3e..4fb3d1ef 100644 --- a/tests/gateway-service.eval-command.test.ts +++ b/tests/gateway-service.eval-command.test.ts @@ -53,8 +53,12 @@ test('eval help is available through the gateway command path', async () => { expect(result.text).toContain( '`/eval [--current-agent|--fresh-agent] [--ablate-system] [--include-prompt=] [--omit-prompt=] `', ); + expect(result.text).toContain( + '/eval locomo [setup|run|status|stop|results|logs]', + ); expect(result.text).toContain('/eval tau2 [setup|run|status|stop|results]'); expect(result.text).toContain('swebench-verified'); + expect(result.text).toContain('locomo'); expect(result.text).toContain('terminal-bench-2.0'); expect(result.text).toContain('agentbench'); expect(result.text).toContain('gaia'); diff --git a/tests/google-workspace-auth.test.ts b/tests/google-workspace-auth.test.ts new file mode 100644 
index 00000000..a593a395 --- /dev/null +++ b/tests/google-workspace-auth.test.ts @@ -0,0 +1,226 @@ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { afterEach, describe, expect, it, vi } from 'vitest'; + +const ORIGINAL_HOME = process.env.HOME; +const ORIGINAL_CWD = process.cwd(); + +function makeTempHome(): string { + return fs.mkdtempSync( + path.join(os.tmpdir(), 'hybridclaw-google-workspace-auth-'), + ); +} + +function restoreEnvVar(name: string, value: string | undefined): void { + if (value === undefined) { + delete process.env[name]; + return; + } + process.env[name] = value; +} + +async function importFreshGoogleWorkspaceAuth(homeDir: string) { + process.env.HOME = homeDir; + process.chdir(homeDir); + vi.resetModules(); + return import('../src/auth/google-workspace-auth.ts'); +} + +function writeClientSecretFile(homeDir: string): string { + const filePath = path.join(homeDir, 'client_secret.json'); + fs.writeFileSync( + filePath, + JSON.stringify( + { + installed: { + client_id: 'client-id.apps.googleusercontent.com', + client_secret: 'client-secret', + auth_uri: 'https://accounts.google.com/o/oauth2/v2/auth', + token_uri: 'https://oauth2.googleapis.com/token', + }, + }, + null, + 2, + ), + 'utf-8', + ); + return filePath; +} + +afterEach(() => { + vi.restoreAllMocks(); + vi.unstubAllGlobals(); + vi.resetModules(); + restoreEnvVar('HOME', ORIGINAL_HOME); + process.chdir(ORIGINAL_CWD); +}); + +describe('Google Workspace auth', () => { + it('stores the Google OAuth client secret in the runtime secret store', async () => { + const homeDir = makeTempHome(); + const googleWorkspaceAuth = await importFreshGoogleWorkspaceAuth(homeDir); + const runtimeSecrets = await import('../src/security/runtime-secrets.ts'); + const clientSecretPath = writeClientSecretFile(homeDir); + + const result = + googleWorkspaceAuth.saveGoogleWorkspaceClientSecretFile(clientSecretPath); + + expect(result.path).toBe( + path.join(homeDir, 
'.hybridclaw', 'credentials.json'), + ); + expect(result.clientId).toBe('client-id.apps.googleusercontent.com'); + expect( + runtimeSecrets.readStoredRuntimeSecret( + googleWorkspaceAuth.GOOGLE_WORKSPACE_CLIENT_SECRET_KEY, + ), + ).toContain('client-id.apps.googleusercontent.com'); + expect(googleWorkspaceAuth.getGoogleWorkspaceAuthStatus()).toMatchObject({ + authenticated: false, + clientConfigured: true, + pendingAuthorization: false, + }); + }); + + it('creates and stores a pending PKCE session when printing the auth URL', async () => { + const homeDir = makeTempHome(); + const googleWorkspaceAuth = await importFreshGoogleWorkspaceAuth(homeDir); + const runtimeSecrets = await import('../src/security/runtime-secrets.ts'); + googleWorkspaceAuth.saveGoogleWorkspaceClientSecretFile( + writeClientSecretFile(homeDir), + ); + + const result = googleWorkspaceAuth.startGoogleWorkspaceAuth(); + const authUrl = new URL(result.authUrl); + const pendingRaw = runtimeSecrets.readStoredRuntimeSecret( + googleWorkspaceAuth.GOOGLE_WORKSPACE_PENDING_AUTH_KEY, + ); + + expect(result.path).toBe( + path.join(homeDir, '.hybridclaw', 'credentials.json'), + ); + expect(result.redirectUri).toBe( + googleWorkspaceAuth.GOOGLE_WORKSPACE_REDIRECT_URI, + ); + expect(authUrl.origin + authUrl.pathname).toBe( + 'https://accounts.google.com/o/oauth2/v2/auth', + ); + expect(authUrl.searchParams.get('client_id')).toBe( + 'client-id.apps.googleusercontent.com', + ); + expect(authUrl.searchParams.get('response_type')).toBe('code'); + expect(authUrl.searchParams.get('access_type')).toBe('offline'); + expect(authUrl.searchParams.get('prompt')).toBe('consent'); + expect(authUrl.searchParams.get('redirect_uri')).toBe( + googleWorkspaceAuth.GOOGLE_WORKSPACE_REDIRECT_URI, + ); + expect(authUrl.searchParams.get('code_challenge_method')).toBe('S256'); + expect(JSON.parse(pendingRaw || '{}')).toMatchObject({ + redirectUri: googleWorkspaceAuth.GOOGLE_WORKSPACE_REDIRECT_URI, + }); + }); + + it('exchanges a 
pasted redirect URL for a refreshable token and clears pending state', async () => { + const homeDir = makeTempHome(); + const googleWorkspaceAuth = await importFreshGoogleWorkspaceAuth(homeDir); + const runtimeSecrets = await import('../src/security/runtime-secrets.ts'); + googleWorkspaceAuth.saveGoogleWorkspaceClientSecretFile( + writeClientSecretFile(homeDir), + ); + googleWorkspaceAuth.startGoogleWorkspaceAuth(); + const pending = JSON.parse( + runtimeSecrets.readStoredRuntimeSecret( + googleWorkspaceAuth.GOOGLE_WORKSPACE_PENDING_AUTH_KEY, + ) || '{}', + ) as { state: string }; + vi.stubGlobal( + 'fetch', + vi.fn(async () => ({ + ok: true, + json: async () => ({ + access_token: 'ya29.access-token', + refresh_token: '1//refresh-token', + expires_in: 3600, + token_type: 'Bearer', + scope: googleWorkspaceAuth.GOOGLE_WORKSPACE_SCOPES.join(' '), + }), + })), + ); + + const result = await googleWorkspaceAuth.exchangeGoogleWorkspaceAuthCode( + `http://localhost:1/?code=auth-code&state=${pending.state}`, + ); + const status = googleWorkspaceAuth.getGoogleWorkspaceAuthStatus(); + + expect(result.path).toBe( + path.join(homeDir, '.hybridclaw', 'credentials.json'), + ); + expect(result.expiresAt).toBeGreaterThan(Date.now()); + expect(result.scopes).toContain('https://www.googleapis.com/auth/calendar'); + expect(status).toMatchObject({ + authenticated: true, + clientConfigured: true, + pendingAuthorization: false, + refreshTokenConfigured: true, + }); + }); + + it('rejects redirect URLs with a mismatched state', async () => { + const homeDir = makeTempHome(); + const googleWorkspaceAuth = await importFreshGoogleWorkspaceAuth(homeDir); + googleWorkspaceAuth.saveGoogleWorkspaceClientSecretFile( + writeClientSecretFile(homeDir), + ); + googleWorkspaceAuth.startGoogleWorkspaceAuth(); + + await expect( + googleWorkspaceAuth.exchangeGoogleWorkspaceAuthCode( + 'http://localhost:1/?code=auth-code&state=wrong-state', + ), + ).rejects.toThrowError( + expect.objectContaining({ + 
code: 'google_workspace_state_mismatch', + }), + ); + }); + + it('refreshes an expired access token from the stored refresh token', async () => { + const homeDir = makeTempHome(); + const googleWorkspaceAuth = await importFreshGoogleWorkspaceAuth(homeDir); + const runtimeSecrets = await import('../src/security/runtime-secrets.ts'); + googleWorkspaceAuth.saveGoogleWorkspaceClientSecretFile( + writeClientSecretFile(homeDir), + ); + runtimeSecrets.saveNamedRuntimeSecrets({ + [googleWorkspaceAuth.GOOGLE_WORKSPACE_TOKEN_KEY]: JSON.stringify({ + accessToken: 'ya29.expired', + refreshToken: '1//refresh-token', + tokenType: 'Bearer', + scopes: [...googleWorkspaceAuth.GOOGLE_WORKSPACE_SCOPES], + expiresAt: Date.now() - 60_000, + updatedAt: new Date().toISOString(), + }), + }); + vi.stubGlobal( + 'fetch', + vi.fn(async () => ({ + ok: true, + json: async () => ({ + access_token: 'ya29.refreshed', + expires_in: 1800, + token_type: 'Bearer', + }), + })), + ); + + const result = + await googleWorkspaceAuth.ensureFreshGoogleWorkspaceAccessToken(); + + expect(result).toMatchObject({ + accessToken: 'ya29.refreshed', + refreshed: true, + }); + expect(result.expiresAt).toBeGreaterThan(Date.now()); + }); +}); diff --git a/tests/locomo-native.test.ts b/tests/locomo-native.test.ts new file mode 100644 index 00000000..ed9ca881 --- /dev/null +++ b/tests/locomo-native.test.ts @@ -0,0 +1,1077 @@ +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +import { afterEach, expect, test, vi } from 'vitest'; + +import { parseEvalProfileModel } from '../src/evals/eval-profile.ts'; +import { testOnlyLocomoNativeRetrieval } from '../src/evals/locomo-native.ts'; +import { + scoreOfficialLocomoAnswer, + testOnlyLocomoOfficialScoring, +} from '../src/evals/locomo-official-scoring.ts'; + +const originalFetch = globalThis.fetch; +const originalOpenAIBaseUrl = process.env.OPENAI_BASE_URL; +const originalOpenAIApiKey = process.env.OPENAI_API_KEY; +const originalEvalModel = 
process.env.HYBRIDCLAW_EVAL_MODEL; +const LOCOMO_DATASET_URL = + 'https://raw.githubusercontent.com/snap-research/locomo/3eb6f2c585f5e1699204e3c3bdf7adc5c28cb376/data/locomo10.json'; +const LOCOMO_DATASET_SHA256 = + '79fa87e90f04081343b8c8debecb80a9a6842b76a7aa537dc9fdf651ea698ff4'; + +async function flushMicrotasks(count = 4): Promise<void> { + for (let index = 0; index < count; index += 1) { + await Promise.resolve(); + } +} + +function createDeferred<T>() { + let resolve!: (value: T | PromiseLike<T>) => void; + let reject!: (reason?: unknown) => void; + const promise = new Promise<T>((res, rej) => { + resolve = res; + reject = rej; + }); + return { promise, resolve, reject }; +} + +function buildTimeoutError(): Error { + return Object.assign(new Error('timed out'), { name: 'TimeoutError' }); +} + +function buildCompletionResponse(answer: string): Response { + return new Response( + JSON.stringify({ + choices: [{ message: { content: answer } }], + usage: { + prompt_tokens: 10, + completion_tokens: 2, + total_tokens: 12, + }, + }), + { + status: 200, + headers: { 'Content-Type': 'application/json' }, + }, + ); +} + +function toBuffer( + value: string | ArrayBuffer | ArrayBufferView, +): Buffer { + if (typeof value === 'string') { + return Buffer.from(value); + } + if (ArrayBuffer.isView(value)) { + return Buffer.from(value.buffer, value.byteOffset, value.byteLength); + } + return Buffer.from(value); +} + +async function mockPinnedDatasetDigest(dataset: string): Promise<void> { + vi.resetModules(); + vi.doMock('node:crypto', async () => { + const actual = + await vi.importActual<typeof import('node:crypto')>('node:crypto'); + return { + ...actual, + createHash(algorithm: string) { + if (algorithm !== 'sha256') { + return actual.createHash(algorithm); + } + + const fallbackHash = actual.createHash('sha256'); + const chunks: Buffer[] = []; + const mockedHash = { + update( + value: string | ArrayBuffer | ArrayBufferView, + ) { + const buffer = toBuffer(value); + chunks.push(buffer); + fallbackHash.update(buffer); +
return mockedHash;
+          },
+          digest(encoding?: import('node:crypto').BinaryToTextEncoding) {
+            const buffer = Buffer.concat(chunks);
+            if (buffer.equals(Buffer.from(dataset, 'utf-8'))) {
+              return LOCOMO_DATASET_SHA256;
+            }
+            return encoding
+              ? fallbackHash.digest(encoding)
+              : fallbackHash.digest();
+          },
+        };
+
+        return mockedHash as unknown as ReturnType<typeof actual.createHash>;
+      },
+    };
+  });
+}
+
+afterEach(() => {
+  vi.restoreAllMocks();
+  vi.doUnmock('node:crypto');
+  vi.resetModules();
+  if (originalFetch) {
+    globalThis.fetch = originalFetch;
+  } else {
+    delete (globalThis as { fetch?: typeof fetch }).fetch;
+  }
+  if (originalOpenAIBaseUrl == null) {
+    delete process.env.OPENAI_BASE_URL;
+  } else {
+    process.env.OPENAI_BASE_URL = originalOpenAIBaseUrl;
+  }
+  if (originalOpenAIApiKey == null) {
+    delete process.env.OPENAI_API_KEY;
+  } else {
+    process.env.OPENAI_API_KEY = originalOpenAIApiKey;
+  }
+  if (originalEvalModel == null) {
+    delete process.env.HYBRIDCLAW_EVAL_MODEL;
+  } else {
+    process.env.HYBRIDCLAW_EVAL_MODEL = originalEvalModel;
+  }
+});
+
+function buildSampleDataset(): string {
+  return JSON.stringify([
+    {
+      sample_id: 'sample-1',
+      conversation: {
+        speaker_a: 'Alice',
+        speaker_b: 'Bob',
+        session_1_date_time: '2024-03-01 10:00:00',
+        session_1: [
+          {
+            speaker: 'Alice',
+            dia_id: 'D1:1',
+            text: 'Pepper loves playing fetch every evening.',
+          },
+          {
+            speaker: 'Bob',
+            dia_id: 'D1:2',
+            text: 'The weather turned rainy today.',
+          },
+          {
+            speaker: 'Alice',
+            dia_id: 'D1:3',
+            text: 'Tomorrow I will pack crunchy carrots for lunch.',
+          },
+        ],
+      },
+      qa: [
+        {
+          question: 'What does Pepper love playing every evening?',
+          answer: 'fetch',
+          evidence: ['D1:1'],
+          category: 1,
+        },
+        {
+          question: 'What will Alice pack for lunch tomorrow?',
+          answer: 'carrots',
+          evidence: ['D1:3'],
+          category: 1,
+        },
+      ],
+    },
+  ]);
+}
+
+function buildTwoSampleDataset(): string {
+  return JSON.stringify([
+    ...JSON.parse(buildSampleDataset()),
+    {
+      sample_id: 'sample-2',
conversation: {
+        speaker_a: 'Carol',
+        speaker_b: 'Dan',
+        session_1_date_time: '2024-03-02 10:00:00',
+        session_1: [
+          {
+            speaker: 'Carol',
+            dia_id: 'D2:1',
+            text: 'I adopted a greyhound named Orbit last spring.',
+          },
+        ],
+      },
+      qa: [
+        {
+          question: 'What is the name of Carol’s dog?',
+          answer: 'Orbit',
+          evidence: ['D2:1'],
+          category: 1,
+        },
+      ],
+    },
+  ]);
+}
+
+function buildCategoryFiveDataset(): string {
+  return JSON.stringify([
+    {
+      sample_id: 'sample-5',
+      conversation: {
+        speaker_a: 'Alice',
+        speaker_b: 'Bob',
+        session_1_date_time: '2024-03-01 10:00:00',
+        session_1: [
+          {
+            speaker: 'Alice',
+            dia_id: 'D1:1',
+            text: 'Pepper loves playing fetch every evening.',
+          },
+        ],
+      },
+      qa: [
+        {
+          question: 'What is Bob planning for his ski trip next month?',
+          answer: 'ski trip',
+          evidence: [],
+          category: 5,
+        },
+      ],
+    },
+  ]);
+}
+
+test('locomo native caches flattened conversation turns per conversation object', async () => {
+  const { testOnlyLocomoNative } = await import(
+    '../src/evals/locomo-native.ts'
+  );
+  const [sample] = JSON.parse(buildSampleDataset()) as Array<{
+    conversation: Record<string, unknown>;
+  }>;
+
+  const first = testOnlyLocomoNative.flattenConversationTurns(
+    sample.conversation,
+  );
+  const second = testOnlyLocomoNative.flattenConversationTurns(
+    sample.conversation,
+  );
+
+  expect(second).toBe(first);
+  expect(second.map((turn) => turn.dia_id)).toEqual(['D1:1', 'D1:2', 'D1:3']);
+});
+
+test('locomo native setup downloads the dataset and verifies bytes after redirects', async () => {
+  const dataset = buildSampleDataset();
+  await mockPinnedDatasetDigest(dataset);
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  const timeoutSpy = vi.spyOn(AbortSignal, 'timeout');
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  const fetchMock = vi.fn().mockResolvedValue({
+    ok: true,
+    status: 200,
+    url:
'https://objects.githubusercontent.com/redirected/locomo10.json',
+    arrayBuffer: async () => Buffer.from(dataset, 'utf-8'),
+  } as Response);
+  globalThis.fetch = fetchMock;
+
+  await runLocomoNativeCli(['setup', '--install-dir', installDir]);
+
+  expect(timeoutSpy).toHaveBeenCalledWith(120_000);
+  expect(fetchMock).toHaveBeenCalledWith(
+    LOCOMO_DATASET_URL,
+    expect.objectContaining({
+      signal: timeoutSpy.mock.results[0]?.value,
+    }),
+  );
+  expect(fs.existsSync(path.join(installDir, '.hybridclaw-setup-ok'))).toBe(
+    true,
+  );
+  expect(
+    fs.readFileSync(path.join(installDir, 'data', 'locomo10.json'), 'utf-8'),
+  ).toBe(dataset);
+});
+
+test('locomo native setup times out stalled dataset downloads', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  const timeoutSpy = vi.spyOn(AbortSignal, 'timeout');
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  const fetchMock = vi.fn().mockRejectedValue(buildTimeoutError());
+  globalThis.fetch = fetchMock;
+
+  await expect(
+    runLocomoNativeCli(['setup', '--install-dir', installDir]),
+  ).rejects.toThrow(/^LOCOMO dataset download timed out after 120000ms\.$/);
+
+  expect(timeoutSpy).toHaveBeenCalledWith(120_000);
+  expect(fetchMock).toHaveBeenCalledWith(
+    LOCOMO_DATASET_URL,
+    expect.objectContaining({
+      signal: timeoutSpy.mock.results[0]?.value,
+    }),
+  );
+});
+
+test('locomo native setup rejects downloads that fail the pinned digest check after redirects', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  globalThis.fetch = vi.fn().mockResolvedValue({
+    ok: true,
+    status: 200,
+    url: 'https://objects.githubusercontent.com/redirected/locomo10.json',
+    arrayBuffer: async () =>
Buffer.from(buildSampleDataset(), 'utf-8'),
+  } as Response);
+
+  await expect(
+    runLocomoNativeCli(['setup', '--install-dir', installDir]),
+  ).rejects.toThrow(/SHA-256 verification/);
+});
+
+test('locomo native run reports setup guidance that works for cli and slash wrappers', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+
+  await expect(
+    runLocomoNativeCli(['run', '--install-dir', installDir]),
+  ).rejects.toThrow('LOCOMO is not set up. Run `setup` first');
+});
+
+test('locomo native run reports when the cached dataset is missing', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+
+  await expect(
+    runLocomoNativeCli(['run', '--install-dir', installDir]),
+  ).rejects.toThrow('LOCOMO dataset is missing');
+});
+
+test('locomo native run reports when the setup marker is missing', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    buildSampleDataset(),
+    'utf-8',
+  );
+
+  await expect(
+    runLocomoNativeCli(['run', '--install-dir', installDir]),
+  ).rejects.toThrow('LOCOMO setup marker is missing');
+});
+
+test('locomo native run rejects malformed cached datasets immediately', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    JSON.stringify([{ sample_id: 'broken-sample', conversation: {} }]),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+
+  await expect(
+    runLocomoNativeCli(['run', '--install-dir', installDir]),
+  ).rejects.toThrow(/Invalid LOCOMO sample at index 0/);
+});
+
+test('locomo native run generates answers through the local gateway and scores them', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  process.env.OPENAI_BASE_URL = 'http://127.0.0.1:9090/v1';
+  process.env.OPENAI_API_KEY = 'test-key';
+  process.env.HYBRIDCLAW_EVAL_MODEL = 'hybridai/gpt-4.1-mini';
+  const requestModels: string[] = [];
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    buildSampleDataset(),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+  globalThis.fetch = vi.fn(async (input, init) => {
+    if (String(input) !== 'http://127.0.0.1:9090/v1/chat/completions') {
+      throw new Error(`Unexpected fetch URL: ${String(input)}`);
+    }
+    const body = JSON.parse(String(init?.body || '{}')) as {
+      model: string;
+      messages: Array<{ role: string; content: string }>;
+    };
+    requestModels.push(body.model);
+    const parsedModel = parseEvalProfileModel(body.model);
+    expect(parsedModel.model).toBe('hybridai/gpt-4.1-mini');
+    expect(parsedModel.profile.agentId).toBeTruthy();
+    const prompt = body.messages[0]?.content || '';
+    const answer = prompt.includes('pack for lunch tomorrow')
+      ?
'carrots'
+      : 'fetch';
+    return new Response(
+      JSON.stringify({
+        choices: [{ message: { content: answer } }],
+        usage: {
+          prompt_tokens: 10,
+          completion_tokens: 2,
+          total_tokens: 12,
+        },
+      }),
+      {
+        status: 200,
+        headers: { 'Content-Type': 'application/json' },
+      },
+    );
+  });
+
+  await runLocomoNativeCli([
+    'run',
+    '--install-dir',
+    installDir,
+    '--budget',
+    '4000',
+    '--num-samples',
+    '1',
+  ]);
+
+  const jobRoot = path.join(installDir, 'jobs');
+  const [jobDirName] = fs.readdirSync(jobRoot);
+  const summary = JSON.parse(
+    fs.readFileSync(path.join(jobRoot, jobDirName, 'result.json'), 'utf-8'),
+  ) as {
+    suite: string;
+    model: string;
+    sampleCount: number;
+    questionCount: number;
+    overallScore: number;
+    tokenUsage: {
+      totalTokens: number;
+    };
+    categories: Record<
+      string,
+      {
+        meanScore: number;
+        questionCount: number;
+      }
+    >;
+    predictionsPath: string;
+  };
+
+  expect(summary.suite).toBe('locomo');
+  expect(summary.model).toBe('hybridai/gpt-4.1-mini');
+  expect(summary.sampleCount).toBe(1);
+  expect(summary.questionCount).toBe(2);
+  expect(summary.overallScore).toBe(1);
+  expect(summary.categories['1']).toEqual({
+    meanScore: 1,
+    questionCount: 2,
+    contextF1: null,
+  });
+  expect(summary.tokenUsage.totalTokens).toBe(24);
+  expect(fs.existsSync(summary.predictionsPath)).toBe(true);
+  const predictions = JSON.parse(
+    fs.readFileSync(summary.predictionsPath, 'utf-8'),
+  ) as Array<{
+    sampleId: string;
+    meanScore: number;
+    qa: Array<{ prediction: string; score: number }>;
+  }>;
+  expect(predictions[0]?.sampleId).toBe('sample-1');
+  expect(predictions[0]?.meanScore).toBe(1);
+  expect(predictions[0]?.qa.map((entry) => entry.prediction)).toEqual([
+    'fetch',
+    'carrots',
+  ]);
+  expect(requestModels).toHaveLength(2);
+  expect(requestModels[0]).toBe(requestModels[1]);
+});
+
+test('locomo native run evaluates QA questions concurrently while preserving question order', async () => {
+  const { runLocomoNativeCli } = await
import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  process.env.OPENAI_BASE_URL = 'http://127.0.0.1:9090/v1';
+  process.env.OPENAI_API_KEY = 'test-key';
+  process.env.HYBRIDCLAW_EVAL_MODEL = 'hybridai/gpt-4.1-mini';
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    buildSampleDataset(),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+
+  const firstResponse = createDeferred<Response>();
+  const secondResponse = createDeferred<Response>();
+  let activeRequests = 0;
+  let maxConcurrentRequests = 0;
+
+  globalThis.fetch = vi.fn(async (_input, init) => {
+    activeRequests += 1;
+    maxConcurrentRequests = Math.max(maxConcurrentRequests, activeRequests);
+    const body = JSON.parse(String(init?.body || '{}')) as {
+      messages: Array<{ role: string; content: string }>;
+    };
+    const prompt = body.messages[0]?.content || '';
+    const response = prompt.includes('pack for lunch tomorrow')
+      ?
await secondResponse.promise
+      : await firstResponse.promise;
+    activeRequests -= 1;
+    return response;
+  });
+
+  const runPromise = runLocomoNativeCli([
+    'run',
+    '--install-dir',
+    installDir,
+    '--budget',
+    '4000',
+    '--num-samples',
+    '1',
+  ]);
+
+  await flushMicrotasks();
+  expect(maxConcurrentRequests).toBe(2);
+  secondResponse.resolve(buildCompletionResponse('carrots'));
+  firstResponse.resolve(buildCompletionResponse('fetch'));
+  await runPromise;
+
+  const jobRoot = path.join(installDir, 'jobs');
+  const [jobDirName] = fs.readdirSync(jobRoot);
+  const summary = JSON.parse(
+    fs.readFileSync(path.join(jobRoot, jobDirName, 'result.json'), 'utf-8'),
+  ) as {
+    predictionsPath: string;
+  };
+  const predictions = JSON.parse(
+    fs.readFileSync(summary.predictionsPath, 'utf-8'),
+  ) as Array<{
+    qa: Array<{ prediction: string }>;
+  }>;
+
+  expect(predictions[0]?.qa.map((entry) => entry.prediction)).toEqual([
+    'fetch',
+    'carrots',
+  ]);
+});
+
+test('locomo native run times out stalled model calls', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  const timeoutSpy = vi.spyOn(AbortSignal, 'timeout');
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  process.env.OPENAI_BASE_URL = 'http://127.0.0.1:9090/v1';
+  process.env.OPENAI_API_KEY = 'test-key';
+  process.env.HYBRIDCLAW_EVAL_MODEL = 'hybridai/gpt-4.1-mini';
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    buildSampleDataset(),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+  const fetchMock = vi.fn().mockRejectedValue(buildTimeoutError());
+  globalThis.fetch = fetchMock;
+
+  await expect(
+    runLocomoNativeCli([
+      'run',
+      '--install-dir',
+      installDir,
+      '--budget',
+      '4000',
+      '--num-samples',
+      '1',
+    ]),
).rejects.toThrow(/^LOCOMO model call timed out after 30000ms\.$/);
+
+  expect(timeoutSpy).toHaveBeenCalledWith(30_000);
+  expect(fetchMock).toHaveBeenCalledWith(
+    'http://127.0.0.1:9090/v1/chat/completions',
+    expect.objectContaining({
+      signal: timeoutSpy.mock.results[0]?.value,
+    }),
+  );
+});
+
+test('locomo native run does not echo failed model response bodies', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  process.env.OPENAI_BASE_URL = 'http://127.0.0.1:9090/v1';
+  process.env.OPENAI_API_KEY = 'test-key';
+  process.env.HYBRIDCLAW_EVAL_MODEL = 'hybridai/gpt-4.1-mini';
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    buildSampleDataset(),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+  globalThis.fetch = vi.fn().mockResolvedValue(
+    new Response('token=test-key server stack trace', {
+      status: 401,
+      headers: { 'Content-Type': 'text/plain' },
+    }),
+  );
+
+  await expect(
+    runLocomoNativeCli([
+      'run',
+      '--install-dir',
+      installDir,
+      '--budget',
+      '4000',
+      '--num-samples',
+      '1',
+    ]),
+  ).rejects.toThrow(/^LOCOMO model call failed with HTTP 401\.$/);
+});
+
+test('locomo native default creates one fresh agent per conversation sample', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  process.env.OPENAI_BASE_URL = 'http://127.0.0.1:9090/v1';
+  process.env.OPENAI_API_KEY = 'test-key';
+  process.env.HYBRIDCLAW_EVAL_MODEL = 'hybridai/gpt-4.1-mini';
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
path.join(installDir, 'data', 'locomo10.json'),
+    buildTwoSampleDataset(),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+  const requestModels: string[] = [];
+  globalThis.fetch = vi.fn(async (_input, init) => {
+    const body = JSON.parse(String(init?.body || '{}')) as {
+      model: string;
+      messages: Array<{ role: string; content: string }>;
+    };
+    requestModels.push(body.model);
+    const prompt = body.messages[0]?.content || '';
+    let answer = 'fetch';
+    if (prompt.includes('pack for lunch tomorrow')) {
+      answer = 'carrots';
+    } else if (prompt.includes('name of Carol')) {
+      answer = 'Orbit';
+    }
+    return new Response(
+      JSON.stringify({
+        choices: [{ message: { content: answer } }],
+        usage: {
+          prompt_tokens: 10,
+          completion_tokens: 2,
+          total_tokens: 12,
+        },
+      }),
+      {
+        status: 200,
+        headers: { 'Content-Type': 'application/json' },
+      },
+    );
+  });
+
+  await runLocomoNativeCli([
+    'run',
+    '--install-dir',
+    installDir,
+    '--budget',
+    '4000',
+    '--num-samples',
+    '2',
+  ]);
+
+  expect(requestModels).toHaveLength(3);
+  const [first, second, third] = requestModels.map((model) =>
+    parseEvalProfileModel(model),
+  );
+  expect(first.model).toBe('hybridai/gpt-4.1-mini');
+  expect(first.profile.agentId).toBeTruthy();
+  expect(second.profile.agentId).toBe(first.profile.agentId);
+  expect(third.profile.agentId).toBeTruthy();
+  expect(third.profile.agentId).not.toBe(first.profile.agentId);
+});
+
+test('locomo native run maps category-5 option labels back to the correct answer key', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  process.env.OPENAI_BASE_URL = 'http://127.0.0.1:9090/v1';
+  process.env.OPENAI_API_KEY = 'test-key';
+  process.env.HYBRIDCLAW_EVAL_MODEL = 'hybridai/gpt-4.1-mini';
+
+  fs.mkdirSync(path.join(installDir,
'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    buildCategoryFiveDataset(),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+  globalThis.fetch = vi.fn(async (_input, init) => {
+    const body = JSON.parse(String(init?.body || '{}')) as {
+      messages: Array<{ role: string; content: string }>;
+    };
+    const prompt = body.messages[0]?.content || '';
+    const answer = prompt.includes('(a) Not mentioned in the conversation')
+      ? 'a'
+      : 'b';
+    return new Response(
+      JSON.stringify({
+        choices: [{ message: { content: answer } }],
+        usage: {
+          prompt_tokens: 10,
+          completion_tokens: 2,
+          total_tokens: 12,
+        },
+      }),
+      {
+        status: 200,
+        headers: { 'Content-Type': 'application/json' },
+      },
+    );
+  });
+
+  await runLocomoNativeCli([
+    'run',
+    '--install-dir',
+    installDir,
+    '--budget',
+    '4000',
+    '--num-samples',
+    '1',
+  ]);
+
+  const jobRoot = path.join(installDir, 'jobs');
+  const [jobDirName] = fs.readdirSync(jobRoot);
+  const summary = JSON.parse(
+    fs.readFileSync(path.join(jobRoot, jobDirName, 'result.json'), 'utf-8'),
+  ) as {
+    overallScore: number;
+    categories: Record<
+      string,
+      {
+        meanScore: number;
+        questionCount: number;
+      }
+    >;
+    predictionsPath: string;
+  };
+
+  expect(summary.overallScore).toBe(1);
+  expect(summary.categories['5']).toEqual({
+    meanScore: 1,
+    questionCount: 1,
+    contextF1: null,
+  });
+  const predictions = JSON.parse(
+    fs.readFileSync(summary.predictionsPath, 'utf-8'),
+  ) as Array<{
+    qa: Array<{ prediction: string; score: number }>;
+  }>;
+  expect(predictions[0]?.qa[0]).toMatchObject({
+    prediction: 'Not mentioned in the conversation',
+    score: 1,
+  });
+});
+
+test('locomo native run respects max question limits and writes progress metadata', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
vi.spyOn(console, 'log').mockImplementation(() => {});
+  process.env.OPENAI_BASE_URL = 'http://127.0.0.1:9090/v1';
+  process.env.OPENAI_API_KEY = 'test-key';
+  process.env.HYBRIDCLAW_EVAL_MODEL = 'hybridai/gpt-4.1-mini';
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    buildSampleDataset(),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+  globalThis.fetch = vi.fn().mockResolvedValue(
+    new Response(
+      JSON.stringify({
+        choices: [{ message: { content: 'fetch' } }],
+        usage: {
+          prompt_tokens: 10,
+          completion_tokens: 2,
+          total_tokens: 12,
+        },
+      }),
+      {
+        status: 200,
+        headers: { 'Content-Type': 'application/json' },
+      },
+    ),
+  );
+
+  await runLocomoNativeCli([
+    'run',
+    '--install-dir',
+    installDir,
+    '--budget',
+    '4000',
+    '--num-samples',
+    '1',
+    '--max-questions',
+    '1',
+  ]);
+
+  expect(globalThis.fetch).toHaveBeenCalledTimes(1);
+  const jobRoot = path.join(installDir, 'jobs');
+  const [jobDirName] = fs.readdirSync(jobRoot);
+  const jobDir = path.join(jobRoot, jobDirName);
+  const summary = JSON.parse(
+    fs.readFileSync(path.join(jobDir, 'result.json'), 'utf-8'),
+  ) as {
+    sampleCount: number;
+    questionCount: number;
+    overallScore: number;
+    predictionsPath: string;
+  };
+  const progress = JSON.parse(
+    fs.readFileSync(path.join(jobDir, 'progress.json'), 'utf-8'),
+  ) as {
+    sampleCount: number;
+    completedSampleCount: number;
+    questionCount: number;
+    completedQuestionCount: number;
+    overallScore: number;
+    currentSampleId: string | null;
+  };
+  const predictions = JSON.parse(
+    fs.readFileSync(summary.predictionsPath, 'utf-8'),
+  ) as Array<{
+    qa: Array<{ prediction: string }>;
+  }>;
+
+  expect(summary.sampleCount).toBe(1);
+  expect(summary.questionCount).toBe(1);
+  expect(summary.overallScore).toBe(1);
+  expect(progress.sampleCount).toBe(1);
+  expect(progress.completedSampleCount).toBe(1);
expect(progress.questionCount).toBe(1);
+  expect(progress.completedQuestionCount).toBe(1);
+  expect(progress.overallScore).toBe(1);
+  expect(progress.currentSampleId).toBeNull();
+  expect(predictions[0]?.qa).toHaveLength(1);
+});
+
+test('locomo native run throttles in-sample progress writes', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  process.env.OPENAI_BASE_URL = 'http://127.0.0.1:9090/v1';
+  process.env.OPENAI_API_KEY = 'test-key';
+  process.env.HYBRIDCLAW_EVAL_MODEL = 'hybridai/gpt-4.1-mini';
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    buildSampleDataset(),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+  globalThis.fetch = vi.fn(async () => buildCompletionResponse('fetch'));
+
+  const writeFileSpy = vi.spyOn(fs, 'writeFileSync');
+
+  await runLocomoNativeCli([
+    'run',
+    '--install-dir',
+    installDir,
+    '--budget',
+    '4000',
+    '--num-samples',
+    '1',
+  ]);
+
+  const progressWrites = writeFileSpy.mock.calls.filter(([filePath]) =>
+    String(filePath).endsWith('progress.json'),
+  );
+
+  expect(progressWrites).toHaveLength(3);
+});
+
+test('official LOCOMO scoring keeps lexical F1 behavior for paraphrases', () => {
+  expect(
+    scoreOfficialLocomoAnswer({
+      category: 1,
+      answer: 'Transgender woman',
+      prediction: 'a trans woman',
+    }),
+  ).toBe(0.5);
+});
+
+test('locomo native retrieval mode scores native memory hit rate without model calls', async () => {
+  const { runLocomoNativeCli } = await import('../src/evals/locomo-native.ts');
+  const installDir = fs.mkdtempSync(
+    path.join(os.tmpdir(), 'hybridclaw-locomo-'),
+  );
+  vi.spyOn(console, 'log').mockImplementation(() => {});
+  process.env.OPENAI_BASE_URL =
'http://127.0.0.1:9090/v1';
+  process.env.OPENAI_API_KEY = 'test-key';
+  process.env.HYBRIDCLAW_EVAL_MODEL = 'hybridai/gpt-4.1-mini';
+
+  fs.mkdirSync(path.join(installDir, 'data'), { recursive: true });
+  fs.writeFileSync(
+    path.join(installDir, 'data', 'locomo10.json'),
+    buildSampleDataset(),
+    'utf-8',
+  );
+  fs.writeFileSync(path.join(installDir, '.hybridclaw-setup-ok'), 'ok\n');
+  globalThis.fetch = vi.fn(async () => {
+    throw new Error('retrieval mode should not call the model gateway');
+  });
+
+  await runLocomoNativeCli([
+    'run',
+    '--install-dir',
+    installDir,
+    '--mode',
+    'retrieval',
+    '--budget',
+    '4000',
+    '--num-samples',
+    '1',
+  ]);
+
+  expect(globalThis.fetch).not.toHaveBeenCalled();
+  const jobRoot = path.join(installDir, 'jobs');
+  const [jobDirName] = fs.readdirSync(jobRoot);
+  const summary = JSON.parse(
+    fs.readFileSync(path.join(jobRoot, jobDirName, 'result.json'), 'utf-8'),
+  ) as {
+    mode: string;
+    model: string | null;
+    overallScore: number;
+    contextF1: number | null;
+    tokenUsage: unknown;
+    categories: Record<
+      string,
+      {
+        meanScore: number;
+        questionCount: number;
+        contextF1: number | null;
+      }
+    >;
+    predictionsPath: string;
+  };
+  const predictions = JSON.parse(
+    fs.readFileSync(summary.predictionsPath, 'utf-8'),
+  ) as Array<{
+    qa: Array<{
+      score: number;
+      contextF1: number | null;
+      retrievedSourceMessageIds: number[];
+    }>;
+  }>;
+
+  expect(summary.mode).toBe('retrieval');
+  expect(summary.model).toBeNull();
+  expect(summary.overallScore).toBe(1);
+  expect(summary.contextF1).toBeGreaterThan(0);
+  expect(summary.tokenUsage).toBeNull();
+  expect(summary.categories['1']?.meanScore).toBe(1);
+  expect(summary.categories['1']?.contextF1).toBeGreaterThan(0);
+  expect(predictions[0]?.qa).toHaveLength(2);
+  expect(predictions[0]?.qa.every((entry) => entry.score === 1)).toBe(true);
+  expect(
+    predictions[0]?.qa.every(
+      (entry) =>
+        Array.isArray(entry.retrievedSourceMessageIds) &&
+        entry.retrievedSourceMessageIds.length
> 0,
+    ),
+  ).toBe(true);
+});
+
+test('locomo native retrieval hit-rate matches substring evidence recall semantics', () => {
+  const [sample] = JSON.parse(buildSampleDataset()) as Array<{
+    sample_id: string;
+    conversation: Record<string, unknown>;
+    qa: Array<{ evidence: string[] }>;
+  }>;
+
+  const hitRate = testOnlyLocomoNativeRetrieval.computeRetrievalHitRate({
+    sample,
+    evidence: ['D1:1', 'D1:3'],
+    retrievedContent: [
+      'DATE: 2024-03-01 10:00:00',
+      'Alice said, "Pepper loves playing fetch every evening."',
+      'Alice said, "Tomorrow I will pack crunchy carrots for lunch."',
+    ].join('\n'),
+  });
+  const missRate = testOnlyLocomoNativeRetrieval.computeRetrievalHitRate({
+    sample,
+    evidence: ['D1:1', 'D1:3'],
+    retrievedContent: 'Bob said, "The weather turned rainy today."',
+  });
+
+  expect(hitRate).toBe(1);
+  expect(missRate).toBe(0);
+});
+
+test('locomo native retrieval context F1 matches whitespace token overlap semantics', () => {
+  expect(
+    testOnlyLocomoNativeRetrieval.computeContextTokenF1('fetch,', 'fetch'),
+  ).toBe(0);
+  expect(
+    testOnlyLocomoNativeRetrieval.computeContextTokenF1('fetch fetch', 'fetch'),
+  ).toBeCloseTo(2 / 3, 6);
+});
+
+test('official LOCOMO scoring uses Porter stemming semantics', () => {
+  expect(
+    testOnlyLocomoOfficialScoring.singleAnswerF1('adopted', 'adoption'),
+  ).toBe(1);
+  expect(
+    scoreOfficialLocomoAnswer({
+      category: 2,
+      answer: 'adoption',
+      prediction: 'adopted',
+    }),
+  ).toBe(1);
+});
diff --git a/tests/memory-service.test.ts b/tests/memory-service.test.ts
index 53db4fe6..143a87cf 100644
--- a/tests/memory-service.test.ts
+++ b/tests/memory-service.test.ts
@@ -1564,6 +1564,78 @@ describe('MemoryService', () => {
     expect(semanticWrites).toBe(0);
   });
 
+  test('storeSemanticMemory derives plain array embeddings with the default provider', () => {
+    let capturedEmbedding: number[] | null | undefined;
+    const backend: MemoryBackend = {
+      resetSessionIfExpired: () => false,
+      getOrCreateSession: (sessionId, guildId, channelId)
=>
+        makeSession({
+          id: sessionId,
+          guild_id: guildId,
+          channel_id: channelId,
+        }),
+      getSessionById: () => makeSession(),
+      getConversationHistory: () => [] as StoredMessage[],
+      getConversationHistoryPage: () => ({
+        sessionKey: null,
+        mainSessionKey: null,
+        history: [] as StoredMessage[],
+        branchFamilies: [],
+      }),
+      getRecentMessages: () => [] as StoredMessage[],
+      get: () => null,
+      set: () => {},
+      delete: () => false,
+      list: () => [],
+      appendCanonicalMessages: () => ({
+        canonical_id: 'entity-id:u1',
+        agent_id: 'entity-id',
+        user_id: 'u1',
+        messages: [],
+        compaction_cursor: 0,
+        compacted_summary: null,
+        message_count: 0,
+        created_at: new Date().toISOString(),
+        updated_at: new Date().toISOString(),
+      }),
+      getCanonicalContext: () => ({ summary: null, recent_messages: [] }),
+      addKnowledgeEntity: () => 'entity-id',
+      addKnowledgeRelation: () => 'relation-id',
+      queryKnowledgeGraph: () => [],
+      getCompactionCandidateMessages: () => null,
+      storeMessage: () => 42,
+      storeSemanticMemory: ({ embedding }) => {
+        capturedEmbedding = embedding;
+        return 10;
+      },
+      recallSemanticMemories: () => [] as SemanticMemoryEntry[],
+      forgetSemanticMemory: () => false,
+      decaySemanticMemories: () => 0,
+      clearSessionHistory: () => 0,
+      deleteMessagesBeforeId: () => 0,
+      deleteMessagesByIds: () => 0,
+      updateSessionSummary: () => {},
+      markSessionMemoryFlush: () => {},
+    };
+
+    const service = new MemoryService(backend, { embeddingDimensions: 32 });
+    service.storeSemanticMemory({
+      sessionId: 'session:test',
+      role: 'assistant',
+      content: 'Rust systems programming notes',
+      confidence: 0.9,
+    });
+
+    expect(Array.isArray(capturedEmbedding)).toBe(true);
+    expect(capturedEmbedding).not.toBeInstanceOf(Float32Array);
+    expect(capturedEmbedding).toHaveLength(32);
+    expect(
+      capturedEmbedding?.every(
+        (value) => typeof value === 'number' && Number.isFinite(value),
+      ),
+    ).toBe(true);
+  });
+
   test('semantic recall increments access_count on repeated identical
queries', () => {
     const dbPath = createTempDbPath();
     initDatabase({ quiet: true, dbPath });
diff --git a/tests/tui-slash-menu.test.ts b/tests/tui-slash-menu.test.ts
index f20df96b..62e1cd6f 100644
--- a/tests/tui-slash-menu.test.ts
+++ b/tests/tui-slash-menu.test.ts
@@ -45,6 +45,7 @@ test('builds canonical, choice-based, and TUI-only slash menu entries', () => {
   expect(labels).toContain('/plugin check ');
   expect(labels).toContain('/eval [list|env||]');
   expect(labels).toContain('/eval list');
+  expect(labels).toContain('/eval locomo');
   expect(labels).toContain('/eval tau2');
   expect(labels).toContain('/eval swebench-verified');
   expect(labels).not.toContain('/eval tau2-bench');
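The context-F1 assertions in `locomo-native.test.ts` pin down a SQuAD-style lexical F1 over whitespace tokens: punctuation stays attached to its token (`'fetch,'` does not match `'fetch'`), and overlap is counted with multiplicity (`'fetch fetch'` against `'fetch'` scores 2/3). A minimal sketch of those semantics, assuming multiset token overlap; `contextTokenF1` is a hypothetical stand-in for the project's `computeContextTokenF1`, not its actual implementation:

```typescript
// Whitespace-token F1 sketch. Tokens are split on whitespace only, so
// punctuation stays attached; overlap is a multiset intersection, so
// duplicated tokens in the retrieved text dilute precision.
function contextTokenF1(retrieved: string, gold: string): number {
  const retrievedTokens = retrieved.split(/\s+/).filter(Boolean);
  const goldTokens = gold.split(/\s+/).filter(Boolean);
  if (retrievedTokens.length === 0 || goldTokens.length === 0) {
    return 0;
  }
  // Count gold tokens with multiplicity, then consume one per match.
  const remaining = new Map<string, number>();
  for (const token of goldTokens) {
    remaining.set(token, (remaining.get(token) ?? 0) + 1);
  }
  let overlap = 0;
  for (const token of retrievedTokens) {
    const count = remaining.get(token) ?? 0;
    if (count > 0) {
      overlap += 1;
      remaining.set(token, count - 1);
    }
  }
  if (overlap === 0) {
    return 0;
  }
  const precision = overlap / retrievedTokens.length;
  const recall = overlap / goldTokens.length;
  return (2 * precision * recall) / (precision + recall);
}
```

Under this sketch, `contextTokenF1('fetch,', 'fetch')` is 0 and `contextTokenF1('fetch fetch', 'fetch')` is 2/3 (precision 1/2, recall 1), matching the two expectations in the test above.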