diff --git a/.cursor-plugin/marketplace.json b/.cursor-plugin/marketplace.json new file mode 100644 index 000000000..6b7f9c0aa --- /dev/null +++ b/.cursor-plugin/marketplace.json @@ -0,0 +1,17 @@ +{ + "name": "nowledge-community", + "owner": { + "name": "Nowledge Labs", + "email": "hello@nowledge-labs.ai" + }, + "metadata": { + "description": "Official Cursor plugins from Nowledge Labs." + }, + "plugins": [ + { + "name": "nowledge-mem", + "source": "./nowledge-mem-cursor-plugin", + "description": "Bring Working Memory, memory recall, and handoff summaries into Cursor with Nowledge Mem." + } + ] +} diff --git a/.github/workflows/release-gemini-extension.yml b/.github/workflows/release-gemini-extension.yml new file mode 100644 index 000000000..b653caa42 --- /dev/null +++ b/.github/workflows/release-gemini-extension.yml @@ -0,0 +1,52 @@ +name: Release Gemini Extension + +on: + push: + tags: + - 'nowledge-mem-gemini-cli-extension-v*' + +permissions: + contents: write + +jobs: + release: + runs-on: ubuntu-latest + defaults: + run: + working-directory: nowledge-mem-gemini-cli-extension + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Resolve release metadata + id: release_meta + run: | + VERSION=$(node -p "require('./package.json').version") + EXPECTED_TAG="nowledge-mem-gemini-cli-extension-v${VERSION}" + if [ "${GITHUB_REF_NAME}" != "${EXPECTED_TAG}" ]; then + echo "Tag ${GITHUB_REF_NAME} does not match package version ${VERSION}" >&2 + exit 1 + fi + echo "version=${VERSION}" >> "$GITHUB_OUTPUT" + echo "expected_tag=${EXPECTED_TAG}" >> "$GITHUB_OUTPUT" + + - name: Validate extension package + run: npm run validate + + - name: Build release archive + run: npm run package:release + + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + name: Nowledge Mem Gemini CLI Extension v${{ steps.release_meta.outputs.version }} + body_path: 
nowledge-mem-gemini-cli-extension/release-notes/${{ steps.release_meta.outputs.version }}.md + files: | + nowledge-mem-gemini-cli-extension/dist/nowledge-mem-gemini-cli-extension.tar.gz + nowledge-mem-gemini-cli-extension/dist/nowledge-mem-gemini-cli-extension.tar.gz.sha256 diff --git a/.github/workflows/validate-gemini-extension.yml b/.github/workflows/validate-gemini-extension.yml new file mode 100644 index 000000000..f26218af7 --- /dev/null +++ b/.github/workflows/validate-gemini-extension.yml @@ -0,0 +1,38 @@ +name: Validate Gemini Extension + +on: + pull_request: + paths: + - 'nowledge-mem-gemini-cli-extension/**' + - '.github/workflows/validate-gemini-extension.yml' + - '.github/workflows/release-gemini-extension.yml' + push: + branches: + - main + - dev + paths: + - 'nowledge-mem-gemini-cli-extension/**' + - '.github/workflows/validate-gemini-extension.yml' + - '.github/workflows/release-gemini-extension.yml' + +jobs: + validate: + runs-on: ubuntu-latest + defaults: + run: + working-directory: nowledge-mem-gemini-cli-extension + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Set up Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: Validate extension package + run: npm run validate + + - name: Verify release packaging + run: npm run package:release diff --git a/README.md b/README.md index e29eb29ba..d4009e960 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Nowledge Community Logo -**Support, feedback, and community hub for Nowledge Labs products** +**Community integrations for [Nowledge Mem](https://mem.nowledge.co)** [![Discord](https://img.shields.io/badge/Discord-Join%20Community-5865F2?style=flat&logo=discord&logoColor=white)](https://nowled.ge/discord) [![Docs](https://img.shields.io/badge/Docs-Read-orange?style=flat&logo=readthedocs&logoColor=white)](https://nowled.ge/mem-docs) @@ -13,111 +13,60 @@ -## ⚑ Nowledge Communities +## Integrations -### Nowledge Mem +Each directory is a 
standalone integration. Pick the one that matches your tool. -[![Get Mem](https://img.shields.io/badge/Get-Mem-00A3A3?style=flat&logo=rocket&logoColor=white)](https://mem.nowledge.co/) +| Integration | Install | What it does | +|-------------|---------|--------------| +| **[Skills](nowledge-mem-npx-skills)** | `npx skills add nowledge-co/community/nowledge-mem-npx-skills` | Reusable workflow package for Working Memory, routed recall, resumable handoffs, and distillation. Prefer native packages when your tool has one. | +| **[Claude Code Plugin](nowledge-mem-claude-code-plugin)** | `claude plugin marketplace add nowledge-co/community` then `claude plugin install nowledge-mem@nowledge-community` | Claude Code native plugin with hooks for Working Memory bootstrap, routed recall, and automatic session capture. | +| **[Gemini CLI Extension](nowledge-mem-gemini-cli-extension)** | `cd nowledge-mem-gemini-cli-extension && gemini extensions link .` | Gemini-native context, hooks, commands, and skills for Working Memory, routed recall, real thread save, and handoff summaries. | +| **[Cursor Plugin](nowledge-mem-cursor-plugin)** | Use the packaged Cursor plugin directory with Cursor's plugin workflow | Cursor-native plugin package with bundled MCP config, rules, Working Memory, routed recall, distillation, and honest `save-handoff` semantics. | +| **[Codex Prompts](nowledge-mem-codex-prompts)** | Copy `AGENTS.md` to your project | Codex-native workflow pack for Working Memory, routed recall, real session save, and distillation. | +| **[OpenClaw Plugin](nowledge-mem-openclaw-plugin)** | `openclaw plugins install @nowledge/openclaw-nowledge-mem` | Full memory lifecycle with memory tools, thread tools, automatic capture, and distillation. | +| **[Alma Plugin](nowledge-mem-alma-plugin)** | Search Nowledge in Alma official Plugin marketplace | Alma-native plugin with Working Memory, thread-aware recall, structured saves, and optional auto-capture. 
| +| **[Raycast Extension](nowledge-mem-raycast)** | Search Nowledge in Raycast Extension Store | Search memories from Raycast launcher. | +| **[MCP](#direct-mcp)** | For tools without a dedicated Nowledge package, use [direct MCP](#direct-mcp). | Standard memory and thread tools exposed through one shared MCP server. | -A **local-first, graph-augmented personal context manager** that preserves conversations and insights from your AI interactions. Stop opening 5 chatbot apps to find *that one conversation*. Type β†’ found. +## Direct MCP -Mem persists entire conversation threads across all your AI tools while also distilling key insights into searchable memories. Everything connected through an intelligent knowledge graph. - -**Privacy-first. Graph-native. AI-integrated.** - -#### Ways to Use Mem - -##### Capture Knowledge - -- **πŸ€– MCP Integration** | Works with [Claude Code](https://github.com/nowledge-co/community/tree/main/nowledge-mem-claude-code-plugin), [Claude Desktop](https://github.com/nowledge-co/claude-dxt), [Cursor](https://mem.nowledge.co/docs/integrations), Github Copilot, Gemini CLI, Qwen Code & more, [ref config](./mcp.json) -- **🌐 Browser Extension** | One-click imports from ChatGPT, Claude, Gemini, Perplexity (Chrome, Edge) -- **πŸ“‚ Thread Import** | Upload conversation files from Cursor, ChatWise, Markdown exports or Claude Code/Codex command - -> [!NOTE] -> General MCP Configuration: +Add to your tool's MCP settings: ```json { "mcpServers": { "nowledge-mem": { "url": "http://localhost:14242/mcp", - "type": "streamableHttp", - "headers": { - "APP": "" - } + "type": "streamableHttp" } } } ``` -##### Access Your Knowledge - -- **⌨️ Global Launcher (βŒ˜β‡§K)** | Paste memories anywhere without leaving your workflow -- **πŸ” In-App Search** | Deep exploration with semantic search, keywords, and graph navigation -- **🀝 Agent Access** | Let AI agents autonomously search and save memories during tasks - -##### Advanced Features - -- **πŸ•ΈοΈ 
Knowledge Graph** | Auto-extract entities and relationships for semantic connections -- **🎯 Topic Clustering** | Discover natural groupings and expertise areas via graph algorithms -- **πŸ“Š Visual Exploration** | Interactive graph view to explore connections and patterns - -### Open Knowledge Layer - -[![GitHub](https://img.shields.io/badge/GitHub-OpenKL-181717?style=flat&logo=github&logoColor=white)](https://github.com/nowledge-co/OpenKL) - -An **open-source unified protocol** enabling AI agents to access and interact with personal knowledge. **CLI-ready** and file-based for seamless integration with tools like Claude Code. Built for broader knowledge management: massive document collections, entire codebases, and external knowledge corpus with automatic chunking. +See [mcp.json](mcp.json) for the reference config. -## πŸš€ Quick Links +## Requirements -### Nowledge Mem Community +- [Nowledge Mem](https://mem.nowledge.co) running locally +- `nmem` CLI on your PATH: if Mem is running on the same machine, install it from **Settings > Preferences > Developer Tools > Install CLI** in the app, or use `pip install nmem-cli` for a standalone setup -- πŸ“š **[Documentation](https://mem.nowledge.co/docs)** | Learn how to use Nowledge Mem -- πŸ“– **[Blog Post](https://www.nowledge-labs.ai/blog/nowledge-mem)** | Deep dive into our vision -- πŸ”Œ **Claude Code Plugin** | Install via marketplace: - - ```bash - claude plugin marketplace add nowledge-co/community - claude plugin install nowledge-mem@nowledge-community - ``` - -- πŸ› οΈ **npx Skills** | Install to any supported agent (Claude Code, Cursor, OpenCode, Codex, etc.): - - ```bash - npx skills add nowledge-co/community/nowledge-mem-npx-skills - ``` - -- 🐾 **OpenClaw Plugin** | Install for [OpenClaw](https://openclaw.ai) agents: - - ```bash - openclaw plugin add @nowledge/openclaw-nowledge-mem - ``` - -- 🧠 **Alma Plugin** | Install local plugin for [Alma](https://alma.now): - - ```bash - cd nowledge-mem-alma-plugin && 
npm install - ``` - -- πŸ” **Raycast Extension** | Search memories from [Raycast](https://raycast.com): - - ```bash - cd nowledge-mem-raycast && npm install && npm run dev - ``` - -- πŸ› **[Report Bug](https://github.com/nowledge-co/community/issues/new?template=bug_report.md)** | Found an issue? Let us know -- πŸ’‘ **[Request Feature](https://github.com/nowledge-co/community/issues/new?template=feature_request.md)** | Share your ideas -- πŸ’Œ **[Send Feedback](mailto:hello@nowledge-labs.ai)** | We'd love to hear from you +```bash +nmem status # verify Nowledge Mem is running +``` -### Open Knowledge Layer Community +## Links -- πŸ”— **[GitHub Repository](https://nowled.ge/kl)** | Explore the open-source project -- πŸ’¬ **[Join Discord](https://nowled.ge/discord)** | Connect with users and the team +- [Documentation](https://mem.nowledge.co/docs) +- [Blog](https://www.nowledge-labs.ai/blog/nowledge-mem) +- [Report a Bug](https://github.com/nowledge-co/community/issues/new?template=bug_report.md) +- [Request a Feature](https://github.com/nowledge-co/community/issues/new?template=feature_request.md) +- [hello@nowledge-labs.ai](mailto:hello@nowledge-labs.ai) ---
-**Built with ❀️ by [Nowledge Labs](https://nowledge-labs.ai)** +**Built by [Nowledge Labs](https://nowledge-labs.ai)**
diff --git a/examples/conversation-markdown.md b/examples/conversation-markdown.md new file mode 100644 index 000000000..92ee454e8 --- /dev/null +++ b/examples/conversation-markdown.md @@ -0,0 +1,78 @@ +--- +title: Python Async Patterns +source: chatgpt +date: 2025-06-15 +--- + +## System + +You are a senior Python developer. Explain concepts clearly with practical examples. + +## User + +I keep getting confused about when to use `asyncio.gather` vs `asyncio.create_task`. What's the difference? + +## Assistant + +They serve different purposes: + +**`asyncio.gather(*coros)`** runs multiple coroutines concurrently and waits for all of them to complete. It returns results in the same order you passed them: + +```python +async def fetch_all(): + user, posts, comments = await asyncio.gather( + fetch_user(1), + fetch_posts(1), + fetch_comments(1), + ) + return user, posts, comments +``` + +**`asyncio.create_task(coro)`** schedules a coroutine to run in the background. You get a `Task` object back and can await it later β€” or never: + +```python +async def process(): + # Fire and forget β€” runs in background + task = asyncio.create_task(send_analytics()) + + # Do other work while analytics sends + result = await compute_something() + + # Optionally await if you need the result + await task +``` + +**Rule of thumb:** +- Use `gather` when you need all results before continuing +- Use `create_task` when you want fire-and-forget or need to start work early + +## User + +What about error handling? If one task in `gather` fails, do the others keep running? + +## Assistant + +By default, if any coroutine in `gather` raises an exception, the exception propagates immediately β€” but the other tasks **keep running** in the background. This is a common source of bugs. 
+ +You can change this with `return_exceptions=True`: + +```python +results = await asyncio.gather( + risky_operation_1(), + risky_operation_2(), + return_exceptions=True, +) + +for result in results: + if isinstance(result, Exception): + logger.error(f"Task failed: {result}") + else: + process(result) +``` + +For `create_task`, unhandled exceptions are logged as warnings when the task is garbage-collected. Always either `await` the task or add an explicit error handler: + +```python +task = asyncio.create_task(background_work()) +task.add_done_callback(handle_task_exception) +``` diff --git a/mcp.json b/mcp.json index f6b60cfe4..625d21992 100644 --- a/mcp.json +++ b/mcp.json @@ -4,8 +4,9 @@ "url": "http://localhost:14242/mcp", "type": "streamableHttp", "headers": { - "APP": "" + "APP": "", + "X-NMEM-API-Key": "nmem_..." } } } -} +} \ No newline at end of file diff --git a/nowledge-mem-alma-plugin/alma-skill-nowledge-mem.md b/nowledge-mem-alma-plugin/alma-skill-nowledge-mem.md index 9a847c30f..48b57c20a 100644 --- a/nowledge-mem-alma-plugin/alma-skill-nowledge-mem.md +++ b/nowledge-mem-alma-plugin/alma-skill-nowledge-mem.md @@ -16,7 +16,7 @@ Use Nowledge Mem as the primary external memory system. 2. `nowledge_mem_search` for focused retrieval with filters. 3. `nowledge_mem_show` for full detail on selected memory IDs β€” includes `sourceThreadId`. 4. `nowledge_mem_thread_search` / `nowledge_mem_thread_show` for conversation history. -5. When a memory has a `sourceThreadId`, use `nowledge_mem_thread_show` to read the full source conversation. +5. When a memory has a `sourceThreadId`, use `nowledge_mem_thread_show` or `nmem --json t show` progressively: start with the first page and fetch more only if the current page is not enough. 
For writes: @@ -42,8 +42,10 @@ If neither plugin tools nor Bash are available, state the exact blocker once and ## Query Heuristics - Trigger retrieval when user asks about prior decisions, historical context, previous threads, β€œwhat did we do before,” or asks to continue prior work. +- Start with memory/query search for distilled knowledge, and use thread search when the user is really asking about conversation history. - Start normal mode first; use deep mode only when normal retrieval misses likely context. - Prefer narrower queries over broad vague queries. +- Avoid dumping a huge thread when one page of messages is enough to answer. ## Write Heuristics diff --git a/nowledge-mem-bench/.gitignore b/nowledge-mem-bench/.gitignore new file mode 100644 index 000000000..9b53134a3 --- /dev/null +++ b/nowledge-mem-bench/.gitignore @@ -0,0 +1,8 @@ +__pycache__/ +*.pyc +.venv/ +dist/ +*.egg-info/ +data/ +results/ +.env diff --git a/nowledge-mem-bench/README.md b/nowledge-mem-bench/README.md new file mode 100644 index 000000000..0838a2a08 --- /dev/null +++ b/nowledge-mem-bench/README.md @@ -0,0 +1,138 @@ +# nmem-bench + +> **Status: WIP** + +Open-source benchmark for personal memory systems. Evaluates memory retrieval quality using established academic datasets via the `nmem` CLI. 
+ +## Supported Benchmarks + +| Dataset | Paper | Questions | Categories | +|---------|-------|-----------|------------| +| **LoCoMo** | ACL 2024 | 1,986 | multi-hop, single-hop, temporal, open-domain, adversarial | +| **LongMemEval** | β€” | ~500 | single-session, multi-session, temporal-reasoning, knowledge-update | + +## How It Works + +``` +Conversations ──► nmem t create ──► Threads + nmem t distill ──► Memories + KG extraction + (Knowledge Agent) ──► Entities, EVOLVES, Crystals + nmem m search ──► Retrieved context + LLM ──► Answer + Token-F1 + LLM Judge ──► Score +``` + +The benchmark exercises the **full nowledge-mem pipeline**: thread ingestion, AI distillation, knowledge graph extraction, and hybrid search β€” measuring end-to-end retrieval quality. + +## Quick Start + +```bash +# Install +pip install -e . + +# Verify nmem CLI is available +nmem status + +# Smoke test: 1 conversation, 10 questions +nmem-bench run -b locomo -s conv-26 -l 10 + +# Full LoCoMo benchmark +nmem-bench run -b locomo + +# Fast vs deep mode comparison +nmem-bench run -b locomo -m normal --run-id locomo-fast +nmem-bench run -b locomo -m deep --run-id locomo-deep +nmem-bench compare results/locomo-fast/report.json results/locomo-deep/report.json + +# LongMemEval benchmark +nmem-bench run -b longmemeval +``` + +## CLI Reference + +### `nmem-bench run` + +Run a full benchmark pipeline (ingest β†’ distill β†’ search β†’ answer β†’ evaluate β†’ report). 
+ +| Option | Default | Description | +|--------|---------|-------------| +| `-b`, `--benchmark` | `locomo` | Dataset: `locomo` or `longmemeval` | +| `-m`, `--search-mode` | `normal` | Search mode: `normal` or `deep` | +| `-k`, `--top-k` | `10` | Top-K search results per question | +| `--answer-model` | `gpt-4o-mini` | LLM for answer generation | +| `--judge-model` | `gpt-4o-mini` | LLM for judge evaluation | +| `--extraction-level` | `guided` | Distillation depth: `swift` (fast) or `guided` (thorough) | +| `-s`, `--sample` | all | LoCoMo sample ID(s) to evaluate | +| `-l`, `--limit` | all | Max questions to evaluate | +| `--skip-distill` | false | Skip distillation (use existing memories) | +| `--skip-llm-judge` | false | Skip LLM judge (F1 only) | +| `--run-id` | auto | Custom run ID (enables resume) | + +### `nmem-bench report ` + +Display results from a previous run. + +### `nmem-bench compare [...]` + +Compare multiple benchmark reports side-by-side. + +### `nmem-bench stats` + +Show dataset statistics without running a benchmark. + +## Metrics + +### Token-F1 (primary) +Token-level F1 score between prediction and ground truth. Ported from the LoCoMo evaluation code (ACL 2024). Category-specific handling: +- **Multi-hop**: Mean F1 over comma-split sub-answers +- **Temporal**: F1 on first semicolon-delimited ground truth part +- **Adversarial**: Binary (1 if prediction correctly abstains) + +### LLM-as-Judge (secondary) +Uses an LLM to semantically judge correctness. Handles paraphrasing, equivalent date formats, and knowledge updates better than token-F1. Question-type-specific judge prompts. + +### Latency +Search and answer generation latency (p50, p95, p99, mean) per question type. 
+ +## Resuming Interrupted Runs + +Long benchmarks can be interrupted and resumed: + +```bash +# Start a run with a named ID +nmem-bench run -b locomo --run-id my-run + +# If interrupted, resume with the same ID +nmem-bench run -b locomo --run-id my-run +``` + +The pipeline checkpoints after each phase per question. + +## Output + +Each run produces: +- `results//checkpoint.json` β€” Full pipeline state (for resuming) +- `results//report.json` β€” Structured results +- `results//report.md` β€” Human-readable report + +## Requirements + +- Python 3.10+ +- `nmem` CLI installed and connected to a running nowledge-mem instance +- LLM API key (for answer generation + judge): set `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, etc. + +## Methodology + +This benchmark measures **retrieval-augmented QA accuracy** β€” how well a memory system can find and surface relevant context for answering questions about past conversations. + +The pipeline tests the full user-facing workflow: +1. **Thread ingestion**: Conversations stored as threads (like saving a chat session) +2. **Memory distillation**: AI extraction of key facts, decisions, insights +3. **Knowledge graph**: Entity extraction, relationship detection, EVOLVES chains +4. **Search**: Hybrid search (vector + BM25 + entity + community strategies) +5. **Answer**: LLM generates answer from retrieved context +6. 
**Evaluation**: Token-F1 + LLM-as-judge scoring against ground truth + +## License + +MIT diff --git a/nowledge-mem-bench/pyproject.toml b/nowledge-mem-bench/pyproject.toml new file mode 100644 index 000000000..a06205429 --- /dev/null +++ b/nowledge-mem-bench/pyproject.toml @@ -0,0 +1,24 @@ +[project] +name = "nmem-bench" +version = "0.1.0" +description = "Open-source benchmark for personal memory systems β€” evaluates via nmem CLI against LoCoMo and LongMemEval datasets" +readme = "README.md" +license = "MIT" +requires-python = ">=3.10" +dependencies = [ + "click>=8.0", + "rich>=13.0", + "litellm>=1.40", + "nltk>=3.8", + "regex>=2024.0", +] + +[project.scripts] +nmem-bench = "nmem_bench.cli:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src/nmem_bench"] diff --git a/nowledge-mem-bench/src/nmem_bench/__init__.py b/nowledge-mem-bench/src/nmem_bench/__init__.py new file mode 100644 index 000000000..c3bd2f227 --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/__init__.py @@ -0,0 +1,3 @@ +"""nmem-bench: Open-source benchmark for personal memory systems.""" + +__version__ = "0.1.0" diff --git a/nowledge-mem-bench/src/nmem_bench/__main__.py b/nowledge-mem-bench/src/nmem_bench/__main__.py new file mode 100644 index 000000000..512778810 --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/__main__.py @@ -0,0 +1,5 @@ +"""Allow running as `python -m nmem_bench`.""" + +from nmem_bench.cli import main + +main() diff --git a/nowledge-mem-bench/src/nmem_bench/benchmarks/__init__.py b/nowledge-mem-bench/src/nmem_bench/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nowledge-mem-bench/src/nmem_bench/benchmarks/locomo.py b/nowledge-mem-bench/src/nmem_bench/benchmarks/locomo.py new file mode 100644 index 000000000..f57ff1ba5 --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/benchmarks/locomo.py @@ -0,0 +1,181 @@ +"""LoCoMo benchmark dataset loader. 
+ +Loads the LoCoMo dataset (ACL 2024) β€” 10 annotated long conversations +with 1986 QA pairs across 5 categories. +""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from typing import Any + +from .types import ( + LOCOMO_CATEGORY_NAMES, + UnifiedMessage, + UnifiedQuestion, + UnifiedSession, +) + +logger = logging.getLogger(__name__) + +# Default path relative to the monorepo root +_DEFAULT_DATA_PATH = Path(__file__).resolve().parents[5] / "3pp" / "locomo" / "data" / "locomo10.json" + + +def load_locomo(data_path: Path | str | None = None) -> list[dict[str, Any]]: + """Load raw LoCoMo JSON data.""" + path = Path(data_path) if data_path else _DEFAULT_DATA_PATH + if not path.exists(): + raise FileNotFoundError( + f"LoCoMo dataset not found at {path}. " + "Ensure 3pp/locomo/data/locomo10.json exists." + ) + with open(path) as f: + return json.load(f) + + +def extract_sessions(sample: dict[str, Any]) -> list[UnifiedSession]: + """Extract UnifiedSessions from a single LoCoMo conversation sample.""" + conv = sample["conversation"] + speaker_a = conv.get("speaker_a", "Speaker A") + speaker_b = conv.get("speaker_b", "Speaker B") + sample_id = sample["sample_id"] + sessions = [] + + # Sessions are numbered: session_1, session_2, ..., session_N + session_num = 1 + while True: + key = f"session_{session_num}" + if key not in conv: + break + + date_key = f"{key}_date_time" + date = conv.get(date_key, "") + turns = conv[key] + + messages = [] + for turn in turns: + speaker = turn.get("speaker", "") + messages.append( + UnifiedMessage( + role="user" if speaker == speaker_a else "assistant", + content=turn.get("text", ""), + speaker=speaker, + dia_id=turn.get("dia_id", ""), + ) + ) + + sessions.append( + UnifiedSession( + session_id=f"{sample_id}-session_{session_num}", + messages=messages, + date=date, + metadata={ + "sample_id": sample_id, + "session_num": session_num, + "speaker_a": speaker_a, + "speaker_b": speaker_b, + }, + 
) + ) + session_num += 1 + + return sessions + + +def extract_questions(sample: dict[str, Any]) -> list[UnifiedQuestion]: + """Extract UnifiedQuestions from a single LoCoMo conversation sample.""" + sample_id = sample["sample_id"] + questions = [] + + for i, qa in enumerate(sample.get("qa", [])): + cat = qa.get("category", 0) + # Category 5 (adversarial) uses 'adversarial_answer' instead of 'answer' + if "answer" in qa: + ground_truth = str(qa["answer"]) + elif "adversarial_answer" in qa: + ground_truth = str(qa["adversarial_answer"]) + else: + ground_truth = "" + + questions.append( + UnifiedQuestion( + question_id=f"{sample_id}-q{i}", + question=qa["question"], + ground_truth=ground_truth, + question_type=LOCOMO_CATEGORY_NAMES.get(cat, f"category-{cat}"), + category=cat, + evidence=qa.get("evidence", []), + metadata={"sample_id": sample_id, "qa_index": i}, + ) + ) + + return questions + + +class LoComoBenchmark: + """LoCoMo benchmark dataset manager.""" + + def __init__(self, data_path: Path | str | None = None): + self.name = "locomo" + self._data_path = data_path + self._raw: list[dict[str, Any]] = [] + self._samples: dict[str, dict[str, Any]] = {} + + def load(self) -> None: + """Load the LoCoMo dataset.""" + self._raw = load_locomo(self._data_path) + self._samples = {s["sample_id"]: s for s in self._raw} + logger.info("Loaded LoCoMo: %d conversations", len(self._raw)) + + @property + def sample_ids(self) -> list[str]: + return list(self._samples.keys()) + + def get_sessions(self, sample_id: str) -> list[UnifiedSession]: + """Get all sessions for a conversation.""" + return extract_sessions(self._samples[sample_id]) + + def get_questions( + self, + sample_id: str | None = None, + category: int | None = None, + limit: int | None = None, + ) -> list[UnifiedQuestion]: + """Get questions, optionally filtered by sample and/or category.""" + if sample_id: + questions = extract_questions(self._samples[sample_id]) + else: + questions = [] + for s in self._raw: + 
questions.extend(extract_questions(s)) + + if category is not None: + questions = [q for q in questions if q.category == category] + + if limit is not None: + questions = questions[:limit] + + return questions + + def get_all_sessions(self) -> dict[str, list[UnifiedSession]]: + """Get all sessions grouped by sample_id.""" + return {sid: self.get_sessions(sid) for sid in self.sample_ids} + + def stats(self) -> dict[str, Any]: + """Get dataset statistics.""" + all_q = self.get_questions() + cat_counts: dict[str, int] = {} + for q in all_q: + cat_counts[q.question_type] = cat_counts.get(q.question_type, 0) + 1 + total_sessions = sum( + len(self.get_sessions(sid)) for sid in self.sample_ids + ) + return { + "conversations": len(self._samples), + "total_sessions": total_sessions, + "total_questions": len(all_q), + "by_category": cat_counts, + } diff --git a/nowledge-mem-bench/src/nmem_bench/benchmarks/longmemeval.py b/nowledge-mem-bench/src/nmem_bench/benchmarks/longmemeval.py new file mode 100644 index 000000000..72bbb25ae --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/benchmarks/longmemeval.py @@ -0,0 +1,157 @@ +"""LongMemEval benchmark dataset loader. + +Downloads and loads the LongMemEval dataset from HuggingFace. +6 question types: single-session-user, single-session-assistant, +single-session-preference, multi-session, temporal-reasoning, knowledge-update. 
+""" + +from __future__ import annotations + +import json +import logging +from pathlib import Path +from typing import Any + +from .types import UnifiedMessage, UnifiedQuestion, UnifiedSession + +logger = logging.getLogger(__name__) + +HF_DATASET_URL = ( + "https://huggingface.co/datasets/xiaowu0162/longmemeval-cleaned" + "/resolve/main/longmemeval_s_cleaned.json" +) + + +def _parse_date(raw: str) -> tuple[str, str]: + """Parse LongMemEval date format 'YYYY/MM/DD (day) HH:MM' β†’ (iso, human).""" + import re + + m = re.match(r"(\d{4})/(\d{2})/(\d{2})\s*\([^)]*\)\s*(\d{2}):(\d{2})", raw) + if not m: + return raw, raw + y, mo, d, h, mi = m.groups() + iso = f"{y}-{mo}-{d}T{h}:{mi}:00" + # Human-readable + months = [ + "", "January", "February", "March", "April", "May", "June", + "July", "August", "September", "October", "November", "December", + ] + month_name = months[int(mo)] + hour = int(h) + ampm = "am" if hour < 12 else "pm" + hour_12 = hour % 12 or 12 + human = f"{hour_12}:{mi} {ampm} on {int(d)} {month_name}, {y}" + return iso, human + + +class LongMemEvalBenchmark: + """LongMemEval benchmark dataset manager.""" + + def __init__(self, data_dir: Path | str | None = None): + self.name = "longmemeval" + self._data_dir = Path(data_dir) if data_dir else Path("data/longmemeval") + self._items: list[dict[str, Any]] = [] + self._by_id: dict[str, dict[str, Any]] = {} # question_id β†’ item lookup + + def load(self) -> None: + """Load (and download if needed) the LongMemEval dataset.""" + self._data_dir.mkdir(parents=True, exist_ok=True) + data_file = self._data_dir / "longmemeval_s_cleaned.json" + + if not data_file.exists(): + self._download(data_file) + + with open(data_file) as f: + self._items = json.load(f) + + self._by_id = {item["question_id"]: item for item in self._items} + logger.info("Loaded LongMemEval: %d items", len(self._items)) + + def _download(self, dest: Path) -> None: + """Download dataset from HuggingFace.""" + import urllib.request + + 
logger.info("Downloading LongMemEval from %s ...", HF_DATASET_URL) + urllib.request.urlretrieve(HF_DATASET_URL, dest) + logger.info("Downloaded to %s", dest) + + def get_questions( + self, + question_type: str | None = None, + limit: int | None = None, + ) -> list[UnifiedQuestion]: + """Get questions, optionally filtered by type.""" + questions = [] + for item in self._items: + qid = item["question_id"] + qtype = item.get("question_type", "unknown") + + if question_type and qtype != question_type: + continue + + # Build haystack session IDs + n_sessions = len(item.get("haystack_sessions", [])) + session_ids = [f"{qid}-session-{i}" for i in range(n_sessions)] + + questions.append( + UnifiedQuestion( + question_id=qid, + question=item["question"], + ground_truth=item["answer"], + question_type=qtype, + haystack_session_ids=session_ids, + metadata={ + "question_date": item.get("question_date", ""), + }, + ) + ) + + if limit is not None: + questions = questions[:limit] + return questions + + def get_sessions(self, question_id: str) -> list[UnifiedSession]: + """Get haystack sessions for a specific question.""" + item = self._find_item(question_id) + if not item: + return [] + + sessions = [] + haystack = item.get("haystack_sessions", []) + dates = item.get("haystack_dates", []) + + for i, msg_list in enumerate(haystack): + date_raw = dates[i] if i < len(dates) else "" + iso_date, human_date = _parse_date(date_raw) if date_raw else ("", "") + + messages = [ + UnifiedMessage( + role=m.get("role", "user"), + content=m.get("content", ""), + ) + for m in msg_list + ] + + sessions.append( + UnifiedSession( + session_id=f"{question_id}-session-{i}", + messages=messages, + date=human_date, + metadata={"iso_date": iso_date, "question_id": question_id}, + ) + ) + + return sessions + + def _find_item(self, question_id: str) -> dict[str, Any] | None: + return self._by_id.get(question_id) + + def stats(self) -> dict[str, Any]: + all_q = self.get_questions() + type_counts: dict[str, 
int] = {} + for q in all_q: + type_counts[q.question_type] = type_counts.get(q.question_type, 0) + 1 + return { + "total_questions": len(all_q), + "by_type": type_counts, + } diff --git a/nowledge-mem-bench/src/nmem_bench/benchmarks/types.py b/nowledge-mem-bench/src/nmem_bench/benchmarks/types.py new file mode 100644 index 000000000..53325502f --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/benchmarks/types.py @@ -0,0 +1,62 @@ +"""Unified data types for benchmark datasets.""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class UnifiedMessage: + role: str # "user" | "assistant" | "system" + content: str + speaker: str = "" + dia_id: str = "" # LoCoMo dialogue ID, e.g. "D1:3" + + +@dataclass +class UnifiedSession: + session_id: str + messages: list[UnifiedMessage] + date: str = "" # ISO date or human-readable + metadata: dict[str, Any] = field(default_factory=dict) + + def to_markdown(self) -> str: + """Format session as conversation markdown for thread creation.""" + lines = [] + if self.date: + lines.append(f"**Date**: {self.date}\n") + for msg in self.messages: + speaker = msg.speaker or msg.role.capitalize() + lines.append(f"**{speaker}**: {msg.content}") + return "\n\n".join(lines) + + +@dataclass +class UnifiedQuestion: + question_id: str + question: str + ground_truth: str + question_type: str # e.g. 
"single-hop", "multi-hop", "temporal" + category: int = 0 # LoCoMo numeric category (1-5) + evidence: list[str] = field(default_factory=list) # dialogue IDs for retrieval recall + haystack_session_ids: list[str] = field(default_factory=list) + metadata: dict[str, Any] = field(default_factory=dict) + + +LOCOMO_CATEGORY_NAMES = { + 1: "multi-hop", + 2: "single-hop", + 3: "temporal", + 4: "open-domain", + 5: "adversarial", +} + +LONGMEMEVAL_QUESTION_TYPES = [ + "single-session-user", + "single-session-assistant", + "single-session-preference", + "multi-session", + "temporal-reasoning", + "knowledge-update", +] diff --git a/nowledge-mem-bench/src/nmem_bench/cli.py b/nowledge-mem-bench/src/nmem_bench/cli.py new file mode 100644 index 000000000..5e684ebe4 --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/cli.py @@ -0,0 +1,252 @@ +"""CLI for nmem-bench β€” benchmark personal memory systems.""" + +from __future__ import annotations + +import json +import logging +import sys +from pathlib import Path + +import click +from rich.console import Console +from rich.logging import RichHandler +from rich.table import Table + +console = Console() + + +def _setup_logging(verbose: bool = False) -> None: + level = logging.DEBUG if verbose else logging.INFO + logging.basicConfig( + level=level, + format="%(message)s", + handlers=[RichHandler(console=console, show_time=False, show_path=False)], + ) + + +@click.group() +@click.option("-v", "--verbose", is_flag=True, help="Enable debug logging") +def main(verbose: bool) -> None: + """nmem-bench: Benchmark personal memory systems against established datasets.""" + _setup_logging(verbose) + + +@main.command() +@click.option("-b", "--benchmark", type=click.Choice(["locomo", "longmemeval"]), default="locomo") +@click.option("-m", "--search-mode", type=click.Choice(["normal", "deep"]), default="normal") +@click.option("-k", "--top-k", default=10, help="Top-K search results") +@click.option("--answer-model", default="gpt-4o-mini", help="LLM 
model for answer generation") +@click.option("--judge-model", default="gpt-4o-mini", help="LLM model for judge evaluation") +@click.option("--extraction-level", type=click.Choice(["swift", "guided"]), default="guided") +@click.option("-s", "--sample", multiple=True, help="LoCoMo sample IDs (e.g. conv-26)") +@click.option("-l", "--limit", type=int, default=None, help="Limit number of questions") +@click.option("--skip-distill", is_flag=True, help="Skip distillation (use pre-existing memories)") +@click.option("--skip-llm-judge", is_flag=True, help="Skip LLM-as-judge evaluation") +@click.option("--process-timeout", default=600, help="Timeout for background processing (seconds)") +@click.option("--nmem-path", default=None, help="Path to nmem CLI binary") +@click.option("--api-url", default=None, help="nmem API URL override") +@click.option("--run-id", default=None, help="Custom run ID (for resuming)") +@click.option("-o", "--results-dir", default="results", help="Results output directory") +def run( + benchmark: str, + search_mode: str, + top_k: int, + answer_model: str, + judge_model: str, + extraction_level: str, + sample: tuple[str, ...], + limit: int | None, + skip_distill: bool, + skip_llm_judge: bool, + process_timeout: int, + nmem_path: str | None, + api_url: str | None, + run_id: str | None, + results_dir: str, +) -> None: + """Run a full benchmark pipeline.""" + from nmem_bench.pipeline.orchestrator import RunConfig, run_benchmark + + config = RunConfig( + benchmark=benchmark, + search_mode=search_mode, + top_k=top_k, + answer_model=answer_model, + judge_model=judge_model, + extraction_level=extraction_level, + sample_ids=list(sample) if sample else None, + limit=limit, + skip_distill=skip_distill, + skip_llm_judge=skip_llm_judge, + process_timeout=process_timeout, + nmem_path=nmem_path, + api_url=api_url, + run_id=run_id, + results_dir=Path(results_dir), + ) + + console.print(f"\n[bold]nmem-bench[/bold] β€” {benchmark} benchmark") + console.print(f" Search 
mode: {search_mode}") + console.print(f" Answer model: {answer_model}") + console.print(f" Extraction: {extraction_level}") + if sample: + console.print(f" Samples: {', '.join(sample)}") + if limit: + console.print(f" Question limit: {limit}") + console.print() + + report_path = run_benchmark(config) + _print_report_summary(report_path) + + +@main.command() +@click.argument("report_path", type=click.Path(exists=True)) +def report(report_path: str) -> None: + """Display a benchmark report.""" + _print_report_summary(Path(report_path)) + + +@main.command() +@click.argument("report_paths", nargs=-1, type=click.Path(exists=True)) +def compare(report_paths: tuple[str, ...]) -> None: + """Compare multiple benchmark reports.""" + if len(report_paths) < 2: + console.print("[red]Need at least 2 reports to compare[/red]") + sys.exit(1) + + reports = [] + for p in report_paths: + with open(p) as f: + reports.append(json.load(f)) + + table = Table(title="Benchmark Comparison") + table.add_column("Metric") + for r in reports: + label = f"{r['run_id']}\n{r['search_mode']}" + table.add_column(label, justify="right") + + # Overall + table.add_row( + "F1 Mean", + *[f"{r['summary']['f1_mean']:.4f}" for r in reports], + ) + judge_row = [] + for r in reports: + ja = r["summary"].get("judge_accuracy") + judge_row.append(f"{ja:.4f}" if ja is not None else "β€”") + table.add_row("Judge Accuracy", *judge_row) + table.add_row( + "Total Questions", + *[str(r["summary"]["total_questions"]) for r in reports], + ) + + # Latency + table.add_row("Search p50 (ms)", + *[f"{r['latency']['search']['median']:.0f}" for r in reports]) + table.add_row("Search p95 (ms)", + *[f"{r['latency']['search']['p95']:.0f}" for r in reports]) + + # Per-type breakdown + all_types = set() + for r in reports: + all_types.update(r.get("by_question_type", {}).keys()) + for qtype in sorted(all_types): + scores = [] + for r in reports: + data = r.get("by_question_type", {}).get(qtype) + 
scores.append(f"{data['f1_mean']:.4f}" if data else "β€”") + table.add_row(f" {qtype}", *scores) + + console.print(table) + + +@main.command() +@click.option("-b", "--benchmark", type=click.Choice(["locomo", "longmemeval"]), default="locomo") +def stats(benchmark: str) -> None: + """Show benchmark dataset statistics.""" + if benchmark == "locomo": + from nmem_bench.benchmarks.locomo import LoComoBenchmark + bench = LoComoBenchmark() + bench.load() + s = bench.stats() + console.print(f"\n[bold]LoCoMo Dataset[/bold]") + console.print(f" Conversations: {s['conversations']}") + console.print(f" Total sessions: {s['total_sessions']}") + console.print(f" Total questions: {s['total_questions']}") + console.print(f" By category:") + for cat, count in sorted(s["by_category"].items()): + console.print(f" {cat}: {count}") + else: + from nmem_bench.benchmarks.longmemeval import LongMemEvalBenchmark + bench = LongMemEvalBenchmark() + bench.load() + s = bench.stats() + console.print(f"\n[bold]LongMemEval Dataset[/bold]") + console.print(f" Total questions: {s['total_questions']}") + console.print(f" By type:") + for qtype, count in sorted(s["by_type"].items()): + console.print(f" {qtype}: {count}") + + +def _print_report_summary(report_path: Path) -> None: + """Print a summary of a benchmark report.""" + with open(report_path) as f: + report = json.load(f) + + s = report["summary"] + console.print(f"\n[bold green]═══ Benchmark Results ═══[/bold green]") + console.print(f" Run: {report['run_id']}") + console.print(f" Benchmark: {report['benchmark']}") + console.print(f" Mode: {report['search_mode']}") + console.print() + + # Overall + table = Table(title="Overall") + table.add_column("Metric") + table.add_column("Value", justify="right") + table.add_row("Questions", str(s["total_questions"])) + table.add_row("F1 Mean", f"{s['f1_mean']:.4f}") + if s.get("judge_accuracy") is not None: + table.add_row("LLM Judge Accuracy", f"{s['judge_accuracy']:.4f}") + console.print(table) + 
console.print() + + # Per-type + type_table = Table(title="By Question Type") + type_table.add_column("Type") + type_table.add_column("Count", justify="right") + type_table.add_column("F1", justify="right") + type_table.add_column("Judge", justify="right") + type_table.add_column("Search p50", justify="right") + + for qtype, data in sorted(report.get("by_question_type", {}).items()): + judge = f"{data['judge_accuracy']:.2f}" if data.get("judge_accuracy") is not None else "β€”" + type_table.add_row( + qtype, + str(data["total"]), + f"{data['f1_mean']:.4f}", + judge, + f"{data['search_latency']['median']:.0f}ms", + ) + console.print(type_table) + console.print() + + # Latency + lat_table = Table(title="Latency") + lat_table.add_column("Phase") + lat_table.add_column("p50", justify="right") + lat_table.add_column("p95", justify="right") + lat_table.add_column("Mean", justify="right") + for phase in ("search", "answer", "total"): + lat = report.get("latency", {}).get(phase, {}) + lat_table.add_row( + phase, + f"{lat.get('median', 0):.0f}ms", + f"{lat.get('p95', 0):.0f}ms", + f"{lat.get('mean', 0):.0f}ms", + ) + console.print(lat_table) + + +if __name__ == "__main__": + main() diff --git a/nowledge-mem-bench/src/nmem_bench/evaluation/__init__.py b/nowledge-mem-bench/src/nmem_bench/evaluation/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nowledge-mem-bench/src/nmem_bench/evaluation/f1.py b/nowledge-mem-bench/src/nmem_bench/evaluation/f1.py new file mode 100644 index 000000000..a7781397d --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/evaluation/f1.py @@ -0,0 +1,131 @@ +"""Token-F1 scoring β€” ported from LoCoMo (ACL 2024). + +Computes token-level F1 between prediction and ground truth, +with category-specific handling for LoCoMo question types. 
+""" + +from __future__ import annotations + +import string +from collections import Counter + +import regex + +try: + from nltk.stem import PorterStemmer + _stemmer = PorterStemmer() +except ImportError: + _stemmer = None + + +# ── Normalization ── + +def normalize_answer(s: str) -> str: + """Normalize answer text for comparison. + + Matches the exact order from LoCoMo reference (evaluation.py): + comma removal β†’ lowercase β†’ punctuation removal β†’ article removal β†’ whitespace fix. + """ + s = s.replace(",", "") + s = s.lower() + # Remove punctuation (BEFORE articles β€” order matters for word boundaries) + exclude = set(string.punctuation) + s = "".join(ch for ch in s if ch not in exclude) + # Remove articles + s = regex.sub(r"\b(a|an|the|and)\b", " ", s) + # Fix whitespace + s = " ".join(s.split()) + return s + + +def _stem(word: str) -> str: + if _stemmer: + return _stemmer.stem(word) + return word + + +# ── F1 Scoring ── + +def f1_score_single(prediction: str, ground_truth: str) -> float: + """Compute token-level F1 between a single prediction and ground truth.""" + pred_tokens = [_stem(w) for w in normalize_answer(prediction).split()] + gt_tokens = [_stem(w) for w in normalize_answer(ground_truth).split()] + + if not pred_tokens or not gt_tokens: + return 0.0 + + common = Counter(pred_tokens) & Counter(gt_tokens) + num_same = sum(common.values()) + + if num_same == 0: + return 0.0 + + precision = num_same / len(pred_tokens) + recall = num_same / len(gt_tokens) + return (2 * precision * recall) / (precision + recall) + + +def f1_score_multi(prediction: str, ground_truth: str) -> float: + """F1 for multi-answer questions (comma-separated ground truth). + + For each ground truth sub-answer, find the best-matching prediction + sub-answer and average the F1 scores. 
+ """ + predictions = [p.strip() for p in prediction.split(",")] + ground_truths = [g.strip() for g in ground_truth.split(",")] + + scores = [] + for gt in ground_truths: + best = max(f1_score_single(pred, gt) for pred in predictions) + scores.append(best) + + return sum(scores) / len(scores) if scores else 0.0 + + +# ── Category-specific evaluation ── + +def evaluate_question(prediction: str, ground_truth: str, category: int) -> float: + """Evaluate a single QA pair using category-specific scoring. + + Categories (LoCoMo): + 1: multi-hop β€” split ground truth by comma, compute partial F1 + 2: single-hop β€” standard token-F1 + 3: temporal β€” token-F1 on first semicolon-delimited part + 4: open-domain β€” standard token-F1 + 5: adversarial β€” binary (1 if prediction indicates "not mentioned") + + For LongMemEval question types, use the type name instead of numeric category. + """ + if category == 5: + # Adversarial: binary abstention check. + # Matches LoCoMo reference exactly β€” only these 2 phrases. 
+ pred_lower = prediction.lower() + if "no information available" in pred_lower or "not mentioned" in pred_lower: + return 1.0 + return 0.0 + + if category == 3: + # Temporal: use first semicolon-delimited part of ground truth + ground_truth = ground_truth.split(";")[0].strip() + return f1_score_single(prediction, ground_truth) + + if category == 1: + # Multi-hop: split and compute partial F1 + return f1_score_multi(prediction, ground_truth) + + # Categories 2, 4 and default: standard F1 + return f1_score_single(prediction, ground_truth) + + +def evaluate_longmemeval_question( + prediction: str, ground_truth: str, question_type: str +) -> float: + """Evaluate a LongMemEval question using type-specific scoring.""" + if "abstention" in question_type or "adversarial" in question_type: + return evaluate_question(prediction, ground_truth, category=5) + if "temporal" in question_type: + return evaluate_question(prediction, ground_truth, category=3) + if "multi" in question_type: + return evaluate_question(prediction, ground_truth, category=1) + # Default: standard F1 + return f1_score_single(prediction, ground_truth) diff --git a/nowledge-mem-bench/src/nmem_bench/evaluation/llm_judge.py b/nowledge-mem-bench/src/nmem_bench/evaluation/llm_judge.py new file mode 100644 index 000000000..e29ccaac2 --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/evaluation/llm_judge.py @@ -0,0 +1,145 @@ +"""LLM-as-judge evaluation β€” semantic answer scoring. + +Uses an LLM to judge whether a prediction is semantically correct, +handling cases where token-F1 fails (paraphrasing, equivalent answers). +""" + +from __future__ import annotations + +import json +import logging +from typing import Any + +logger = logging.getLogger(__name__) + +# ── Judge Prompts (adapted from MemoryBench) ── + +DEFAULT_JUDGE_PROMPT = """You are an evaluation judge. Given a question, a correct answer (ground truth), and a model's response (hypothesis), determine if the hypothesis is correct. 
+ +The hypothesis is CORRECT if it contains the essential information from the ground truth answer, even if worded differently. + +Question: {question} +Ground Truth: {ground_truth} +Hypothesis: {hypothesis} + +Respond with ONLY a JSON object: +{{"score": 1, "label": "correct", "explanation": "..."}} +or +{{"score": 0, "label": "incorrect", "explanation": "..."}}""" + +ABSTENTION_JUDGE_PROMPT = """You are an evaluation judge. The model should ABSTAIN from answering (say "I don't know" or similar) because the information is not available. + +Question: {question} +Ground Truth: {ground_truth} +Hypothesis: {hypothesis} + +The hypothesis is CORRECT if the model properly abstains, says "I don't know", indicates uncertainty, or states the information is not available. +The hypothesis is INCORRECT if it provides a specific answer (hallucination). + +Respond with ONLY a JSON object: +{{"score": 1, "label": "correct", "explanation": "..."}} +or +{{"score": 0, "label": "incorrect", "explanation": "..."}}""" + +TEMPORAL_JUDGE_PROMPT = """You are an evaluation judge for temporal/date questions. Given a question, a correct answer, and a model's response, determine if the hypothesis is correct. + +Allow minor date variations (off-by-one day, different date formats referring to the same date). + +Question: {question} +Ground Truth: {ground_truth} +Hypothesis: {hypothesis} + +Respond with ONLY a JSON object: +{{"score": 1, "label": "correct", "explanation": "..."}} +or +{{"score": 0, "label": "incorrect", "explanation": "..."}}""" + +KNOWLEDGE_UPDATE_JUDGE_PROMPT = """You are an evaluation judge for knowledge update questions. The model should reflect the LATEST known information. + +Question: {question} +Ground Truth (latest answer): {ground_truth} +Hypothesis: {hypothesis} + +The hypothesis is CORRECT if it reflects the updated/latest answer. It may also mention the old answer as context, as long as the latest answer is present. 
+ +Respond with ONLY a JSON object: +{{"score": 1, "label": "correct", "explanation": "..."}} +or +{{"score": 0, "label": "incorrect", "explanation": "..."}}""" + + +def _select_prompt(question_type: str, category: int) -> str: + """Select the appropriate judge prompt for a question.""" + qt = question_type.lower() + if category == 5 or "adversarial" in qt or "abstention" in qt: + return ABSTENTION_JUDGE_PROMPT + if category == 3 or "temporal" in qt: + return TEMPORAL_JUDGE_PROMPT + if "update" in qt or "changing" in qt: + return KNOWLEDGE_UPDATE_JUDGE_PROMPT + return DEFAULT_JUDGE_PROMPT + + +async def judge_answer( + question: str, + ground_truth: str, + hypothesis: str, + question_type: str = "", + category: int = 0, + model: str = "gpt-4o-mini", +) -> dict[str, Any]: + """Use an LLM to judge whether a prediction is correct. + + Returns: {"score": 0|1, "label": "correct"|"incorrect", "explanation": str} + """ + from litellm import acompletion + + prompt_template = _select_prompt(question_type, category) + prompt = prompt_template.format( + question=question, + ground_truth=ground_truth, + hypothesis=hypothesis, + ) + + try: + response = await acompletion( + model=model, + messages=[{"role": "user", "content": prompt}], + temperature=0, + max_tokens=200, + ) + content = response.choices[0].message.content.strip() + + # Parse JSON from response + # Handle cases where response has extra text around JSON + start = content.find("{") + end = content.rfind("}") + 1 + if start >= 0 and end > start: + result = json.loads(content[start:end]) + return { + "score": int(result.get("score", 0)), + "label": result.get("label", "incorrect"), + "explanation": result.get("explanation", ""), + } + + except Exception as exc: + logger.warning("LLM judge failed: %s", exc) + return {"score": 0, "label": "error", "explanation": f"Judge error: {exc}"} + + return {"score": 0, "label": "error", "explanation": "No valid JSON in judge response"} + + +def judge_answer_sync( + question: str, 
+ ground_truth: str, + hypothesis: str, + question_type: str = "", + category: int = 0, + model: str = "gpt-4o-mini", +) -> dict[str, Any]: + """Synchronous wrapper for judge_answer.""" + import asyncio + + return asyncio.run( + judge_answer(question, ground_truth, hypothesis, question_type, category, model) + ) diff --git a/nowledge-mem-bench/src/nmem_bench/evaluation/retrieval.py b/nowledge-mem-bench/src/nmem_bench/evaluation/retrieval.py new file mode 100644 index 000000000..1772b2fe0 --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/evaluation/retrieval.py @@ -0,0 +1,83 @@ +"""Retrieval quality metrics β€” Hit@K, MRR, NDCG.""" + +from __future__ import annotations + +import math + + +def hit_at_k( + retrieved_contents: list[str], + evidence_texts: list[str], + k: int = 10, +) -> float: + """Was any relevant evidence found in top-K results? + + Returns 1.0 if at least one evidence text appears in top-K retrieved contents. + Uses substring matching since evidence IDs may not match exactly. + """ + top_k = retrieved_contents[:k] + for evidence in evidence_texts: + evidence_lower = evidence.lower() + for content in top_k: + if evidence_lower in content.lower(): + return 1.0 + return 0.0 + + +def mrr( + retrieved_contents: list[str], + evidence_texts: list[str], +) -> float: + """Mean Reciprocal Rank β€” position of first relevant result. + + Returns 1/rank of the first retrieved item containing any evidence text. 
+ """ + for i, content in enumerate(retrieved_contents): + content_lower = content.lower() + for evidence in evidence_texts: + if evidence.lower() in content_lower: + return 1.0 / (i + 1) + return 0.0 + + +def ndcg_at_k( + retrieved_contents: list[str], + evidence_texts: list[str], + k: int = 10, +) -> float: + """Normalized Discounted Cumulative Gain at K.""" + top_k = retrieved_contents[:k] + + # Binary relevance: 1 if content contains any evidence + relevances = [] + for content in top_k: + content_lower = content.lower() + rel = 0.0 + for evidence in evidence_texts: + if evidence.lower() in content_lower: + rel = 1.0 + break + relevances.append(rel) + + # DCG + dcg = sum(rel / math.log2(i + 2) for i, rel in enumerate(relevances)) + + # Ideal DCG (all relevant items first) + n_relevant = min(len(evidence_texts), k) + idcg = sum(1.0 / math.log2(i + 2) for i in range(n_relevant)) + + return dcg / idcg if idcg > 0 else 0.0 + + +def compute_retrieval_metrics( + retrieved_contents: list[str], + evidence_texts: list[str], + k: int = 10, +) -> dict[str, float]: + """Compute all retrieval metrics.""" + return { + "hit_at_k": hit_at_k(retrieved_contents, evidence_texts, k), + "mrr": mrr(retrieved_contents, evidence_texts), + "ndcg_at_k": ndcg_at_k(retrieved_contents, evidence_texts, k), + "k": k, + } diff --git a/nowledge-mem-bench/src/nmem_bench/nmem/__init__.py b/nowledge-mem-bench/src/nmem_bench/nmem/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nowledge-mem-bench/src/nmem_bench/nmem/client.py b/nowledge-mem-bench/src/nmem_bench/nmem/client.py new file mode 100644 index 000000000..386ade34e --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/nmem/client.py @@ -0,0 +1,338 @@ +"""nmem CLI client wrapper. + +Executes nmem commands via subprocess with --json output parsing. +Handles the exact JSON output formats documented in nmem-cli. 
+""" + +from __future__ import annotations + +import json +import logging +import shutil +import subprocess +import time +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +logger = logging.getLogger(__name__) + + +@dataclass +class SearchResult: + """A single memory search result.""" + + memory_id: str + title: str + content: str + confidence: float + importance: float + labels: list[str] = field(default_factory=list) + created_at: str = "" + event_start: str = "" + source: str = "" + + +@dataclass +class ThreadInfo: + """Result of thread creation.""" + + thread_id: str + title: str + message_count: int + + +@dataclass +class Stats: + """Database statistics snapshot.""" + + memory_count: int + thread_count: int + entity_count: int + community_count: int + crystal_count: int = 0 + raw: dict[str, Any] = field(default_factory=dict) + + +class NmemClient: + """Wrapper around the nmem CLI tool. + + All commands are executed via subprocess with --json flag. + Field names match the exact nmem CLI JSON output format. + """ + + def __init__(self, nmem_path: str | None = None, api_url: str | None = None): + self._nmem = nmem_path or self._find_nmem() + self._api_url = api_url + self._verify_connection() + + def _find_nmem(self) -> str: + """Find nmem CLI in PATH or common locations.""" + path = shutil.which("nmem") + if path: + return path + candidates = [ + Path.home() / ".local" / "bin" / "nmem", + Path("/usr/local/bin/nmem"), + ] + for c in candidates: + if c.exists(): + return str(c) + raise FileNotFoundError( + "nmem CLI not found. Install it or pass nmem_path= explicitly." 
+ ) + + def _verify_connection(self) -> None: + """Verify nmem server is reachable.""" + try: + self._run(["status"]) + logger.info("nmem connection OK (%s)", self._nmem) + except Exception as e: + logger.warning("nmem status check failed: %s", e) + + def _run( + self, + args: list[str], + *, + json_output: bool = True, + timeout: int = 300, + ) -> Any: + """Execute an nmem command and return parsed output.""" + cmd = [self._nmem] + if json_output: + cmd.append("--json") + if self._api_url: + cmd.extend(["--api-url", self._api_url]) + cmd.extend(args) + + logger.debug("nmem: %s", " ".join(cmd)) + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + ) + + if result.returncode != 0: + raise RuntimeError( + f"nmem failed (exit {result.returncode}): " + f"{' '.join(args)}\nstderr: {result.stderr[:500]}" + ) + + if not json_output: + return result.stdout + + try: + return json.loads(result.stdout) + except json.JSONDecodeError: + logger.warning("Non-JSON output from nmem: %.200s", result.stdout) + return {"raw_output": result.stdout} + + # ── Thread operations ── + + def thread_create( + self, + title: str, + content: str | None = None, + file_path: str | None = None, + messages_json: str | None = None, + source: str = "nmem-bench", + ) -> ThreadInfo: + """Create a thread. 
+ + nmem JSON output: {"success": true, "id": "...", "title": "...", "messages": N} + """ + args = ["t", "create", "-t", title, "-s", source] + if file_path: + args.extend(["-f", file_path]) + elif messages_json: + args.extend(["-m", messages_json]) + elif content: + args.extend(["-c", content]) + + data = self._run(args, timeout=60) + # CLI outputs: id, title, messages (count) + return ThreadInfo( + thread_id=data.get("id", data.get("thread_id", "")), + title=data.get("title", title), + message_count=data.get("messages", data.get("message_count", 0)), + ) + + def thread_create_from_file(self, title: str, file_path: Path) -> ThreadInfo: + """Create a thread from a markdown file.""" + return self.thread_create(title=title, file_path=str(file_path)) + + def thread_distill( + self, + thread_id: str, + extraction_level: str = "guided", + ) -> dict[str, Any]: + """Distill memories from a thread. + + Returns the raw distillation response (memory object + metadata). + Valid extraction levels: "swift" (fast) or "guided" (multi-step, thorough). + """ + args = [ + "t", "distill", thread_id, + "--level", extraction_level, + ] + return self._run(args, timeout=180) + + def thread_list(self, limit: int = 50) -> list[dict[str, Any]]: + """List threads. + + nmem JSON output: {"threads": [...], "total": N} + """ + data = self._run(["t", "-n", str(limit)]) + if isinstance(data, dict): + return data.get("threads", []) + return data + + # ── Memory operations ── + + def memory_search( + self, + query: str, + limit: int = 10, + mode: str = "normal", + ) -> list[SearchResult]: + """Search memories. 
+ + nmem JSON output: {"query": "...", "total": N, "search_mode": "...", + "memories": [{id, title, content, score, importance, ...}]} + """ + args = ["m", "search", query, "-n", str(limit)] + if mode == "deep": + args.extend(["--mode", "deep"]) + + data = self._run(args) + + # Extract memories array from response + if isinstance(data, list): + items = data + elif isinstance(data, dict): + items = data.get("memories", data.get("results", [])) + else: + items = [] + + results = [] + for item in items: + results.append( + SearchResult( + memory_id=item.get("id", ""), + title=item.get("title", ""), + content=item.get("content", ""), + confidence=float(item.get("score", item.get("confidence", 0))), + importance=float(item.get("importance", 0.5)), + labels=item.get("labels", []), + created_at=item.get("created_at", ""), + event_start=item.get("event_start", ""), + source=item.get("source", ""), + ) + ) + return results + + def memory_add( + self, + content: str, + title: str = "", + importance: float = 0.5, + labels: str = "", + unit_type: str = "fact", + ) -> dict[str, Any]: + """Add a memory directly.""" + args = ["m", "add", content] + if title: + args.extend(["-t", title]) + args.extend(["-i", str(importance)]) + if labels: + args.extend(["-l", labels]) + args.extend(["--unit-type", unit_type]) + return self._run(args) + + # ── System operations ── + + def stats(self) -> Stats: + """Get database statistics. 
+ + nmem JSON output: {"memories": N, "threads": N, "entities": N, "labels": N, "communities": N} + """ + data = self._run(["stats"]) + return Stats( + memory_count=data.get("memories", data.get("memory_count", 0)), + thread_count=data.get("threads", data.get("thread_count", 0)), + entity_count=data.get("entities", data.get("entity_count", 0)), + community_count=data.get("communities", data.get("community_count", 0)), + crystal_count=data.get("crystals", data.get("crystal_count", 0)), + raw=data, + ) + + def feed_events( + self, + days: int = 7, + event_type: str | None = None, + ) -> list[dict[str, Any]]: + """Get recent feed events. + + nmem JSON output: {"events": [...], "total": N, "last_n_days": N} + """ + args = ["f", "--days", str(days)] + if event_type: + args.extend(["--type", event_type]) + data = self._run(args) + if isinstance(data, dict): + return data.get("events", []) + return data + + def wait_for_processing( + self, + initial_stats: Stats | None = None, + timeout: int = 300, + poll_interval: int = 10, + min_wait: int = 15, + ) -> Stats: + """Wait for background processing to settle. + + After distillation, the Knowledge Agent needs time to start processing. + We wait at least `min_wait` seconds before polling, then require counts + to stabilize for 2 consecutive intervals. 
+ """ + if initial_stats is None: + initial_stats = self.stats() + + last_memory_count = initial_stats.memory_count + last_entity_count = initial_stats.entity_count + stable_count = 0 + elapsed = 0 + + # Wait before first poll to let background processing start + if min_wait > 0: + logger.debug("Initial wait %ds for processing to begin...", min_wait) + time.sleep(min_wait) + elapsed += min_wait + + while elapsed < timeout: + time.sleep(poll_interval) + elapsed += poll_interval + + current = self.stats() + if ( + current.memory_count == last_memory_count + and current.entity_count == last_entity_count + ): + stable_count += 1 + if stable_count >= 2: + logger.info("Processing settled after %ds", elapsed) + return current + else: + stable_count = 0 + last_memory_count = current.memory_count + last_entity_count = current.entity_count + logger.debug( + "Processing: memories=%d, entities=%d", + current.memory_count, + current.entity_count, + ) + + logger.warning("Processing did not settle within %ds", timeout) + return self.stats() diff --git a/nowledge-mem-bench/src/nmem_bench/pipeline/__init__.py b/nowledge-mem-bench/src/nmem_bench/pipeline/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/nowledge-mem-bench/src/nmem_bench/pipeline/answer.py b/nowledge-mem-bench/src/nmem_bench/pipeline/answer.py new file mode 100644 index 000000000..3407ac56f --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/pipeline/answer.py @@ -0,0 +1,140 @@ +"""Phase 4: Answer β€” generate answers using LLM + retrieved context.""" + +from __future__ import annotations + +import asyncio +import logging +import time + +from nmem_bench.benchmarks.types import UnifiedQuestion +from nmem_bench.pipeline.checkpoint import RunCheckpoint + +logger = logging.getLogger(__name__) + +# ── Answer prompts ── + +QA_PROMPT = """Based on the retrieved memories below, answer the question. 
+ +Retrieved Memories: +{context} + +Question: {question} + +Instructions: +- Answer with a short phrase using information from the memories. +- If the memories don't contain relevant information, say "No information available." +- Be concise β€” answer in a few words or a short sentence. + +Answer:""" + + +def _format_context(search_results: list[dict]) -> str: + """Format search results as context for the LLM.""" + if not search_results: + return "(No relevant memories found)" + + parts = [] + for i, r in enumerate(search_results, 1): + title = r.get("title", "") + content = r.get("content", "") + confidence = r.get("confidence", 0) + header = f"[Memory {i}]" + if title: + header += f" {title}" + header += f" (relevance: {confidence:.2f})" + parts.append(f"{header}\n{content}") + + return "\n\n".join(parts) + + +async def _generate_answer( + question: UnifiedQuestion, + context: str, + model: str, +) -> str: + """Generate an answer using the LLM.""" + from litellm import acompletion + + prompt = QA_PROMPT.format(context=context, question=question.question) + + response = await acompletion( + model=model, + messages=[{"role": "user", "content": prompt}], + temperature=0, + max_tokens=150, + ) + return response.choices[0].message.content.strip() + + +async def _answer_one( + q: UnifiedQuestion, + qstate: "QuestionState", + context: str, + model: str, + sem: asyncio.Semaphore, +) -> None: + """Answer a single question with semaphore-bounded concurrency.""" + from nmem_bench.pipeline.checkpoint import QuestionState + + async with sem: + t0 = time.time() + try: + answer = await _generate_answer(q, context, model) + qstate.answer = answer + qstate.answer_latency_ms = (time.time() - t0) * 1000 + qstate.phase = "answered" + except Exception as e: + qstate.error = f"Answer failed: {e}" + logger.error("Answer generation failed for %s: %s", q.question_id, e) + + +def answer_questions( + questions: list[UnifiedQuestion], + checkpoint: RunCheckpoint, + model: str = 
def answer_questions(
    questions: list[UnifiedQuestion],
    checkpoint: RunCheckpoint,
    model: str = "gpt-4o-mini",
    concurrency: int = 5,
    on_progress: callable = None,
) -> None:
    """Generate answers for all searched questions.

    LLM calls run concurrently under a semaphore; the checkpoint is saved
    after each batch so interrupted runs can resume without losing work.
    """
    total = len(questions)
    answered = 0

    async def _process_batch(batch):
        nonlocal answered
        sem = asyncio.Semaphore(concurrency)
        pending = []

        for question in batch:
            state = checkpoint.get_question(question.question_id)
            if state.phase in ("answered", "evaluated"):
                answered += 1
                continue
            if state.phase != "searched":
                continue
            context = _format_context(state.search_results)
            pending.append(_answer_one(question, state, context, model, sem))

        if pending:
            await asyncio.gather(*pending)
            answered += len(pending)

        if on_progress:
            on_progress(answered, total, "")

        checkpoint.save()

    # Batched so each checkpoint save covers a bounded amount of new work.
    batch_size = concurrency * 4
    for start in range(0, len(questions), batch_size):
        asyncio.run(_process_batch(questions[start : start + batch_size]))
        if answered % 50 == 0:
            logger.info("[%d/%d] Answered", answered, total)

    checkpoint.save()
    logger.info("Answer generation complete: %d questions", total)


logger = logging.getLogger(__name__)


def _filter_fields(cls: type, data: dict) -> dict:
    """Drop keys of ``data`` that are not declared fields of dataclass ``cls``.

    Keeps checkpoint loading forward/backward compatible: checkpoints written
    by other versions may carry extra keys that would otherwise crash the
    dataclass constructor.
    """
    allowed = {f.name for f in fields(cls)}
    return {key: value for key, value in data.items() if key in allowed}


@dataclass
class QuestionState:
    """Per-question checkpoint state."""

    question_id: str
    # pending | ingested | searched | answered | evaluated
    phase: str = "pending"
    search_results: list[dict[str, Any]] = field(default_factory=list)
    search_latency_ms: float = 0
    answer: str = ""
    answer_latency_ms: float = 0
    f1_score: float = 0
    llm_judge_score: int = -1  # -1 = not evaluated
    llm_judge_label: str = ""
    llm_judge_explanation: str = ""
    retrieval_metrics: dict[str, float] = field(default_factory=dict)
    error: str = ""


@dataclass
class ConversationState:
    """Per-conversation checkpoint state (LoCoMo)."""

    sample_id: str
    # pending | ingested | distilled | processed
    phase: str = "pending"
    thread_ids: list[str] = field(default_factory=list)
    memory_count_before: int = 0
    memory_count_after: int = 0


@dataclass
class RunCheckpoint:
    """Full benchmark run state, persisted as JSON for resumability."""

    run_id: str
    benchmark: str  # "locomo" | "longmemeval"
    search_mode: str = "normal"
    answer_model: str = "gpt-4o-mini"
    judge_model: str = "gpt-4o-mini"
    started_at: str = ""
    conversations: dict[str, ConversationState] = field(default_factory=dict)
    questions: dict[str, QuestionState] = field(default_factory=dict)
    save_path: Path | None = field(default=None, repr=False)

    def save(self, path: Path | None = None) -> None:
        """Serialize to JSON at ``path`` (or the stored ``save_path``).

        Raises ValueError when neither is set.
        """
        target = path or self.save_path
        if target is None:
            raise ValueError("No save path set β€” pass path= or set checkpoint.save_path")
        target.parent.mkdir(parents=True, exist_ok=True)
        payload = asdict(self)
        # Path objects are not JSON-serializable and are not run state anyway.
        payload.pop("save_path", None)
        with open(target, "w") as fh:
            json.dump(payload, fh, indent=2)
        logger.debug("Checkpoint saved: %s", target)

    @classmethod
    def load(cls, path: Path) -> "RunCheckpoint":
        """Rehydrate a checkpoint from JSON, tolerating unknown keys."""
        with open(path) as fh:
            data = json.load(fh)

        checkpoint = cls(
            run_id=data["run_id"],
            benchmark=data["benchmark"],
            search_mode=data.get("search_mode", "normal"),
            answer_model=data.get("answer_model", "gpt-4o-mini"),
            judge_model=data.get("judge_model", "gpt-4o-mini"),
            started_at=data.get("started_at", ""),
            save_path=path,
        )
        for sid, cdata in data.get("conversations", {}).items():
            checkpoint.conversations[sid] = ConversationState(
                **_filter_fields(ConversationState, cdata)
            )
        for qid, qdata in data.get("questions", {}).items():
            checkpoint.questions[qid] = QuestionState(
                **_filter_fields(QuestionState, qdata)
            )
        return checkpoint

    def get_question(self, qid: str) -> QuestionState:
        """Return (creating if needed) the state record for ``qid``."""
        if qid not in self.questions:
            self.questions[qid] = QuestionState(question_id=qid)
        return self.questions[qid]

    def get_conversation(self, sid: str) -> ConversationState:
        """Return (creating if needed) the state record for ``sid``."""
        if sid not in self.conversations:
            self.conversations[sid] = ConversationState(sample_id=sid)
        return self.conversations[sid]

    @property
    def completed_questions(self) -> list[QuestionState]:
        """Question states that finished the full pipeline."""
        return [state for state in self.questions.values() if state.phase == "evaluated"]

    @property
    def pending_questions(self) -> list[str]:
        """IDs of questions not yet fully evaluated."""
        return [qid for qid, state in self.questions.items() if state.phase != "evaluated"]
def evaluate_f1(
    questions: list[UnifiedQuestion],
    checkpoint: RunCheckpoint,
    on_progress: callable = None,
) -> None:
    """Score every answered question with token-F1 and mark it evaluated.

    LoCoMo questions (category > 0) use category-specific scoring; otherwise
    LongMemEval type-specific scoring is used. Retrieval metrics are computed
    only when both gold evidence and search results are present.
    """
    total = len(questions)
    evaluated = 0

    for question in questions:
        state = checkpoint.get_question(question.question_id)

        if state.phase == "evaluated":
            evaluated += 1
            continue
        if state.phase != "answered" or not state.answer:
            continue

        if question.category > 0:
            # LoCoMo β€” category-specific scoring.
            state.f1_score = evaluate_question(
                state.answer, question.ground_truth, question.category
            )
        else:
            # LongMemEval β€” question-type-specific scoring.
            state.f1_score = evaluate_longmemeval_question(
                state.answer, question.ground_truth, question.question_type
            )

        if question.evidence and state.search_results:
            retrieved = [r.get("content", "") for r in state.search_results]
            state.retrieval_metrics = compute_retrieval_metrics(
                retrieved, question.evidence
            )

        state.phase = "evaluated"
        evaluated += 1

        if on_progress:
            on_progress(evaluated, total, question.question_id)

    checkpoint.save()
    logger.info("F1 evaluation complete: %d questions", evaluated)
def evaluate_llm_judge(
    questions: list[UnifiedQuestion],
    checkpoint: RunCheckpoint,
    judge_model: str = "gpt-4o-mini",
    on_progress: callable = None,
) -> None:
    """Secondary scoring pass: grade each answer with an LLM judge.

    Judge calls run concurrently (bounded by JUDGE_CONCURRENCY) and the
    checkpoint is saved after every batch. Questions already judged
    (score >= 0) or lacking an answer are counted as done and skipped.
    """
    from nmem_bench.evaluation.llm_judge import judge_answer

    total = len(questions)
    judged = 0

    async def _judge_one(question, state, sem):
        async with sem:
            try:
                verdict = await judge_answer(
                    question=question.question,
                    ground_truth=question.ground_truth,
                    hypothesis=state.answer,
                    question_type=question.question_type,
                    category=question.category,
                    model=judge_model,
                )
                state.llm_judge_score = verdict["score"]
                state.llm_judge_label = verdict["label"]
                state.llm_judge_explanation = verdict.get("explanation", "")
            except Exception as exc:
                # Score stays -1, so a failed judge call is retried on resume.
                logger.error("LLM judge failed for %s: %s", question.question_id, exc)

    async def _judge_batch(batch):
        nonlocal judged
        sem = asyncio.Semaphore(JUDGE_CONCURRENCY)
        pending = []

        for question in batch:
            state = checkpoint.get_question(question.question_id)
            if state.llm_judge_score >= 0 or not state.answer:
                judged += 1
                continue
            pending.append(_judge_one(question, state, sem))

        if pending:
            await asyncio.gather(*pending)
            judged += len(pending)

        if on_progress:
            on_progress(judged, total, "")

    # Batched so each checkpoint save covers a bounded amount of new work.
    batch_size = JUDGE_CONCURRENCY * 4
    for start in range(0, len(questions), batch_size):
        asyncio.run(_judge_batch(questions[start : start + batch_size]))
        checkpoint.save()
        if judged % 50 == 0:
            logger.info("[%d/%d] LLM-judged", judged, total)

    checkpoint.save()
    logger.info("LLM judge evaluation complete: %d questions", judged)
def _write_session_md(session: UnifiedSession, tmp_dir: Path) -> Path:
    """Render one session to markdown and write it under ``tmp_dir``."""
    out_path = tmp_dir / f"{session.session_id}.md"
    out_path.write_text(session.to_markdown())
    return out_path


def ingest_locomo(
    benchmark: LoComoBenchmark,
    client: NmemClient,
    checkpoint: RunCheckpoint,
    sample_ids: list[str] | None = None,
    on_progress: callable = None,
) -> None:
    """Ingest LoCoMo conversations as threads.

    Each session becomes a separate thread, preserving temporal structure.
    Conversations already past the "ingested" phase are skipped, so the
    phase is safe to re-run on resume.
    """
    ids = sample_ids or benchmark.sample_ids
    total = len(ids)

    with tempfile.TemporaryDirectory(prefix="nmem-bench-") as tmp_dir:
        tmp_path = Path(tmp_dir)

        for idx, sample_id in enumerate(ids):
            conv_state = checkpoint.get_conversation(sample_id)
            if conv_state.phase in ("ingested", "distilled", "processed"):
                logger.info("Skipping %s (already %s)", sample_id, conv_state.phase)
                continue

            sessions = benchmark.get_sessions(sample_id)
            logger.info(
                "[%d/%d] Ingesting %s: %d sessions",
                idx + 1, total, sample_id, len(sessions),
            )

            # Baseline memory count, used later to measure growth.
            conv_state.memory_count_before = client.stats().memory_count

            thread_ids = []
            for session in sessions:
                md_path = _write_session_md(session, tmp_path)
                title = (
                    f"LoCoMo {sample_id} Session {session.metadata.get('session_num', '?')}"
                    f" ({session.date})"
                )
                try:
                    info = client.thread_create_from_file(title=title, file_path=md_path)
                    thread_ids.append(info.thread_id)
                    logger.debug("Created thread %s: %s", info.thread_id, title)
                except Exception as exc:
                    # One bad session should not sink the whole conversation.
                    logger.error(
                        "Failed to create thread for %s: %s", session.session_id, exc
                    )

            conv_state.thread_ids = thread_ids
            conv_state.phase = "ingested"
            checkpoint.save()

            if on_progress:
                on_progress(idx + 1, total, sample_id)
def ingest_longmemeval(
    benchmark: LongMemEvalBenchmark,
    client: NmemClient,
    checkpoint: RunCheckpoint,
    question_ids: list[str] | None = None,
    on_progress: callable = None,
) -> None:
    """Ingest LongMemEval haystack sessions as threads.

    Sessions shared by multiple questions are ingested once (deduplicated by
    session_id). All thread IDs are stored in a single ConversationState
    keyed "__longmemeval__", since the haystack is shared across questions.
    """
    conv_state = checkpoint.get_conversation("__longmemeval__")
    if conv_state.phase in ("ingested", "distilled", "processed"):
        logger.info("LongMemEval already ingested (%s)", conv_state.phase)
        return

    questions = benchmark.get_questions()
    if question_ids:
        wanted = set(question_ids)
        questions = [q for q in questions if q.question_id in wanted]

    seen_sessions: set[str] = set()
    thread_ids: list[str] = []
    total = len(questions)

    # Baseline memory count, used later to measure growth.
    conv_state.memory_count_before = client.stats().memory_count

    with tempfile.TemporaryDirectory(prefix="nmem-bench-lme-") as tmp_dir:
        tmp_path = Path(tmp_dir)

        for idx, question in enumerate(questions):
            for session in benchmark.get_sessions(question.question_id):
                if session.session_id in seen_sessions:
                    continue

                md_path = _write_session_md(session, tmp_path)
                title = f"LongMemEval {session.session_id}"
                if session.date:
                    title += f" ({session.date})"

                try:
                    info = client.thread_create_from_file(title=title, file_path=md_path)
                    thread_ids.append(info.thread_id)
                    seen_sessions.add(session.session_id)
                except Exception as exc:
                    logger.error("Failed to ingest session %s: %s", session.session_id, exc)

            if on_progress:
                on_progress(idx + 1, total, question.question_id)

    conv_state.thread_ids = thread_ids
    conv_state.phase = "ingested"
    checkpoint.save()

    logger.info(
        "Ingested %d unique sessions (%d threads) for %d questions",
        len(seen_sessions), len(thread_ids), total,
    )
sessions (%d threads) for %d questions", + len(ingested_sessions), len(thread_ids), total, + ) diff --git a/nowledge-mem-bench/src/nmem_bench/pipeline/orchestrator.py b/nowledge-mem-bench/src/nmem_bench/pipeline/orchestrator.py new file mode 100644 index 000000000..56075969d --- /dev/null +++ b/nowledge-mem-bench/src/nmem_bench/pipeline/orchestrator.py @@ -0,0 +1,373 @@ +"""Pipeline orchestrator β€” coordinates all benchmark phases.""" + +from __future__ import annotations + +import datetime +import json +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +from nmem_bench.benchmarks.locomo import LoComoBenchmark +from nmem_bench.benchmarks.longmemeval import LongMemEvalBenchmark +from nmem_bench.benchmarks.types import UnifiedQuestion +from nmem_bench.nmem.client import NmemClient +from nmem_bench.pipeline.checkpoint import RunCheckpoint + +logger = logging.getLogger(__name__) + + +@dataclass +class RunConfig: + benchmark: str = "locomo" + search_mode: str = "normal" + top_k: int = 10 + answer_model: str = "gpt-4o-mini" + judge_model: str = "gpt-4o-mini" + extraction_level: str = "guided" + sample_ids: list[str] | None = None # LoCoMo: filter to specific conversations + limit: int | None = None # Limit number of questions + skip_distill: bool = False + skip_llm_judge: bool = False + process_timeout: int = 600 + nmem_path: str | None = None + api_url: str | None = None + run_id: str | None = None + results_dir: Path = field(default_factory=lambda: Path("results")) + + +def _generate_run_id(config: RunConfig) -> str: + ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + return f"{config.benchmark}-{config.search_mode}-{ts}" + + +def run_benchmark(config: RunConfig) -> Path: + """Execute a full benchmark run and return the report path.""" + from nmem_bench.pipeline.ingest import ingest_locomo, ingest_longmemeval + from nmem_bench.pipeline.process import distill_threads, wait_for_processing + from 
def run_benchmark(config: RunConfig) -> Path:
    """Execute a full benchmark run and return the JSON report path.

    Phases: ingest -> process (distill + KG) -> search -> answer -> evaluate
    -> report. State is checkpointed under results/<run_id>/ so a re-run
    with the same run_id resumes where it left off.

    FIX: on resume, only `benchmark` was validated against the checkpoint;
    a changed search_mode or model would silently mix configurations within
    one run's results. We now warn loudly on any such drift (non-breaking).
    """
    from nmem_bench.pipeline.answer import answer_questions
    from nmem_bench.pipeline.evaluate import evaluate_f1, evaluate_llm_judge
    from nmem_bench.pipeline.ingest import ingest_locomo, ingest_longmemeval
    from nmem_bench.pipeline.process import distill_threads, wait_for_processing
    from nmem_bench.pipeline.search import search_questions

    run_id = config.run_id or _generate_run_id(config)
    run_dir = config.results_dir / run_id
    run_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_path = run_dir / "checkpoint.json"

    # Load or create checkpoint
    if checkpoint_path.exists():
        checkpoint = RunCheckpoint.load(checkpoint_path)
        if checkpoint.benchmark != config.benchmark:
            raise ValueError(
                f"Checkpoint benchmark mismatch: checkpoint has '{checkpoint.benchmark}' "
                f"but config specifies '{config.benchmark}'. "
                f"Use a different --run-id or delete {checkpoint_path}"
            )
        # Surface silent config drift on resume (see docstring).
        for attr in ("search_mode", "answer_model", "judge_model"):
            previous = getattr(checkpoint, attr)
            requested = getattr(config, attr)
            if previous != requested:
                logger.warning(
                    "Resuming with %s=%r but checkpoint was created with %r; "
                    "results may mix configurations",
                    attr, requested, previous,
                )
        logger.info("Resuming run %s", run_id)
    else:
        checkpoint = RunCheckpoint(
            run_id=run_id,
            benchmark=config.benchmark,
            search_mode=config.search_mode,
            answer_model=config.answer_model,
            judge_model=config.judge_model,
            started_at=datetime.datetime.now().isoformat(),
            save_path=checkpoint_path,
        )
    checkpoint.save_path = checkpoint_path

    # Initialize nmem client
    client = NmemClient(nmem_path=config.nmem_path, api_url=config.api_url)

    def _progress(done, total, label=""):
        pct = done / total * 100 if total else 0
        logger.info(" [%d/%d] %.0f%% %s", done, total, pct, label)

    if config.benchmark == "locomo":
        bench = LoComoBenchmark()
        bench.load()
        logger.info("Dataset stats: %s", bench.stats())

        logger.info("═══ Phase 1: INGEST ═══")
        ingest_locomo(
            bench, client, checkpoint,
            sample_ids=config.sample_ids,
            on_progress=_progress,
        )

        if not config.skip_distill:
            logger.info("═══ Phase 2: PROCESS (distill + KG) ═══")
            distill_threads(
                client, checkpoint,
                extraction_level=config.extraction_level,
                on_progress=_progress,
            )
            wait_for_processing(client, checkpoint, timeout=config.process_timeout)

        # Collect questions from the chosen samples (or all of them).
        if config.sample_ids:
            all_qs: list[UnifiedQuestion] = []
            for sid in config.sample_ids:
                all_qs.extend(bench.get_questions(sample_id=sid))
            questions = all_qs[: config.limit] if config.limit else all_qs
        else:
            questions = bench.get_questions(limit=config.limit)

    elif config.benchmark == "longmemeval":
        bench = LongMemEvalBenchmark()
        bench.load()
        logger.info("Dataset stats: %s", bench.stats())

        logger.info("═══ Phase 1: INGEST ═══")
        ingest_longmemeval(bench, client, checkpoint, on_progress=_progress)

        if not config.skip_distill:
            logger.info("═══ Phase 2: PROCESS (distill + KG) ═══")
            distill_threads(
                client, checkpoint,
                extraction_level=config.extraction_level,
                on_progress=_progress,
            )
            wait_for_processing(client, checkpoint, timeout=config.process_timeout)

        questions = bench.get_questions(limit=config.limit)

    else:
        raise ValueError(f"Unknown benchmark: {config.benchmark}")

    logger.info("Total questions to evaluate: %d", len(questions))

    logger.info("═══ Phase 3: SEARCH ═══")
    search_questions(
        questions, client, checkpoint,
        search_mode=config.search_mode,
        top_k=config.top_k,
        on_progress=_progress,
    )

    logger.info("═══ Phase 4: ANSWER ═══")
    answer_questions(
        questions, checkpoint,
        model=config.answer_model,
        on_progress=_progress,
    )

    logger.info("═══ Phase 5: EVALUATE (F1) ═══")
    evaluate_f1(questions, checkpoint, on_progress=_progress)

    if not config.skip_llm_judge:
        logger.info("═══ Phase 5b: EVALUATE (LLM Judge) ═══")
        evaluate_llm_judge(
            questions, checkpoint,
            judge_model=config.judge_model,
            on_progress=_progress,
        )

    logger.info("═══ Phase 6: REPORT ═══")
    report = generate_report(questions, checkpoint, config)
    report_json_path = run_dir / "report.json"
    report_md_path = run_dir / "report.md"

    with open(report_json_path, "w") as f:
        json.dump(report, f, indent=2)
    with open(report_md_path, "w") as f:
        f.write(render_report_markdown(report))

    logger.info("Report saved: %s", report_json_path)
    logger.info("Report (MD): %s", report_md_path)

    return report_json_path
+ report = generate_report(questions, checkpoint, config) + report_json_path = run_dir / "report.json" + report_md_path = run_dir / "report.md" + + with open(report_json_path, "w") as f: + json.dump(report, f, indent=2) + + md = render_report_markdown(report) + with open(report_md_path, "w") as f: + f.write(md) + + logger.info("Report saved: %s", report_json_path) + logger.info("Report (MD): %s", report_md_path) + + return report_json_path + + +# ── Report Generation ── + + +def _latency_stats(values: list[float]) -> dict[str, float]: + """Compute latency statistics from a list of durations (ms).""" + if not values: + return {"min": 0, "max": 0, "mean": 0, "median": 0, "p95": 0, "p99": 0, "count": 0} + values = sorted(values) + n = len(values) + return { + "min": round(values[0], 1), + "max": round(values[-1], 1), + "mean": round(sum(values) / n, 1), + "median": round(values[n // 2], 1), + "p95": round(values[int(n * 0.95)], 1) if n > 1 else round(values[0], 1), + "p99": round(values[int(n * 0.99)], 1) if n > 1 else round(values[0], 1), + "count": n, + } + + +def generate_report( + questions: list[UnifiedQuestion], + checkpoint: RunCheckpoint, + config: RunConfig, +) -> dict[str, Any]: + """Generate structured benchmark report.""" + q_map = {q.question_id: q for q in questions} + + # Aggregate by question type + by_type: dict[str, dict[str, Any]] = {} + all_f1: list[float] = [] + all_judge: list[int] = [] + search_latencies: list[float] = [] + answer_latencies: list[float] = [] + + for qid, qstate in checkpoint.questions.items(): + q = q_map.get(qid) + if not q or qstate.phase != "evaluated": + continue + + qtype = q.question_type + if qtype not in by_type: + by_type[qtype] = { + "total": 0, "f1_scores": [], "judge_scores": [], + "search_latencies": [], "answer_latencies": [], + } + + entry = by_type[qtype] + entry["total"] += 1 + entry["f1_scores"].append(qstate.f1_score) + if qstate.llm_judge_score >= 0: + entry["judge_scores"].append(qstate.llm_judge_score) + 
entry["search_latencies"].append(qstate.search_latency_ms) + entry["answer_latencies"].append(qstate.answer_latency_ms) + + all_f1.append(qstate.f1_score) + if qstate.llm_judge_score >= 0: + all_judge.append(qstate.llm_judge_score) + search_latencies.append(qstate.search_latency_ms) + answer_latencies.append(qstate.answer_latency_ms) + + # Build per-type summary + type_summary = {} + for qtype, data in sorted(by_type.items()): + avg_f1 = sum(data["f1_scores"]) / len(data["f1_scores"]) if data["f1_scores"] else 0 + avg_judge = sum(data["judge_scores"]) / len(data["judge_scores"]) if data["judge_scores"] else None + type_summary[qtype] = { + "total": data["total"], + "f1_mean": round(avg_f1, 4), + "judge_accuracy": round(avg_judge, 4) if avg_judge is not None else None, + "search_latency": _latency_stats(data["search_latencies"]), + "answer_latency": _latency_stats(data["answer_latencies"]), + } + + overall_f1 = sum(all_f1) / len(all_f1) if all_f1 else 0 + overall_judge = sum(all_judge) / len(all_judge) if all_judge else None + + return { + "run_id": checkpoint.run_id, + "benchmark": config.benchmark, + "search_mode": config.search_mode, + "answer_model": config.answer_model, + "judge_model": config.judge_model, + "extraction_level": config.extraction_level, + "timestamp": datetime.datetime.now().isoformat(), + "summary": { + "total_questions": len(all_f1), + "f1_mean": round(overall_f1, 4), + "judge_accuracy": round(overall_judge, 4) if overall_judge is not None else None, + }, + "by_question_type": type_summary, + "latency": { + "search": _latency_stats(search_latencies), + "answer": _latency_stats(answer_latencies), + "total": _latency_stats( + [s + a for s, a in zip(search_latencies, answer_latencies)] + ), + }, + "pipeline": { + "conversations_ingested": len(checkpoint.conversations), + "threads_created": sum( + len(c.thread_ids) for c in checkpoint.conversations.values() + ), + }, + } + + +def render_report_markdown(report: dict[str, Any]) -> str: + """Render a 
def render_report_markdown(report: dict[str, Any]) -> str:
    """Render a report dict (from generate_report) as a markdown document."""
    out: list[str] = []

    def emit(line: str = "") -> None:
        out.append(line)

    emit(f"# Benchmark Report: {report['benchmark']}")
    emit()
    emit(f"- **Run ID**: {report['run_id']}")
    emit(f"- **Search mode**: {report['search_mode']}")
    emit(f"- **Answer model**: {report['answer_model']}")
    emit(f"- **Extraction level**: {report['extraction_level']}")
    emit(f"- **Timestamp**: {report['timestamp']}")
    emit()

    summary = report["summary"]
    emit("## Overall Results")
    emit()
    emit("| Metric | Value |")
    emit("|--------|-------|")
    emit(f"| Total Questions | {summary['total_questions']} |")
    emit(f"| F1 Score (mean) | {summary['f1_mean']:.4f} |")
    # Judge row only appears when the judge actually ran.
    if summary.get("judge_accuracy") is not None:
        emit(f"| LLM Judge Accuracy | {summary['judge_accuracy']:.4f} |")
    emit()

    emit("## Results by Question Type")
    emit()
    emit("| Type | Count | F1 Mean | Judge Acc | Search p50 (ms) | Answer p50 (ms) |")
    emit("|------|-------|---------|-----------|-----------------|-----------------|")
    for qtype, data in report.get("by_question_type", {}).items():
        judge = (
            f"{data['judge_accuracy']:.2f}"
            if data.get("judge_accuracy") is not None
            else "β€”"
        )
        emit(
            f"| {qtype} | {data['total']} | {data['f1_mean']:.4f} | {judge} "
            f"| {data['search_latency']['median']:.0f} | {data['answer_latency']['median']:.0f} |"
        )
    emit()

    emit("## Latency Summary")
    emit()
    emit("| Phase | p50 (ms) | p95 (ms) | p99 (ms) | Mean (ms) |")
    emit("|-------|----------|----------|----------|-----------|")
    for phase in ("search", "answer", "total"):
        lat = report.get("latency", {}).get(phase, {})
        emit(
            f"| {phase} | {lat.get('median', 0):.0f} | {lat.get('p95', 0):.0f} "
            f"| {lat.get('p99', 0):.0f} | {lat.get('mean', 0):.0f} |"
        )
    emit()

    pipeline = report.get("pipeline", {})
    emit("## Pipeline Stats")
    emit()
    emit(f"- Conversations ingested: {pipeline.get('conversations_ingested', 0)}")
    emit(f"- Threads created: {pipeline.get('threads_created', 0)}")
    emit()

    return "\n".join(out)
def distill_threads(
    client: NmemClient,
    checkpoint: RunCheckpoint,
    extraction_level: str = "guided",
    on_progress: callable = None,
) -> None:
    """Distill memories from all ingested threads.

    Every checkpoint conversation in the "ingested" phase has `nmem t distill`
    run on each of its threads, then is marked "distilled". Individual thread
    failures are logged and counted so one bad thread cannot stall the run.
    Works for both LoCoMo and LongMemEval.
    """
    pending = [
        (sample_id, conv)
        for sample_id, conv in checkpoint.conversations.items()
        if conv.phase == "ingested"
    ]
    if not pending:
        logger.info("No conversations to distill (all already processed)")
        return

    total = sum(len(conv.thread_ids) for _, conv in pending)
    done = 0

    for sample_id, conv in pending:
        logger.info("Distilling %s: %d threads", sample_id, len(conv.thread_ids))

        for tid in conv.thread_ids:
            try:
                client.thread_distill(tid, extraction_level=extraction_level)
                done += 1
                logger.debug("Distilled thread %s (%d/%d)", tid, done, total)
            except Exception as exc:
                logger.error("Distillation failed for thread %s: %s", tid, exc)
                done += 1

            if on_progress:
                on_progress(done, total, f"{sample_id}/{tid}")

        conv.phase = "distilled"
        checkpoint.save()
def wait_for_processing(
    client: NmemClient,
    checkpoint: RunCheckpoint,
    timeout: int = 600,
    poll_interval: int = 15,
) -> None:
    """Wait for the background Knowledge Agent to finish processing.

    After distillation, the Knowledge Agent automatically extracts entities
    and relationships (KG extraction), detects EVOLVES chains, and creates
    crystals. Settling detection is delegated to client.wait_for_processing;
    afterwards every "distilled" conversation is flipped to "processed" and
    its final memory count recorded.
    """
    logger.info("Waiting for background processing (timeout=%ds)...", timeout)

    before = client.stats()
    logger.info(
        "Current: memories=%d, entities=%d, crystals=%d",
        before.memory_count, before.entity_count, before.crystal_count,
    )

    after = client.wait_for_processing(
        initial_stats=before,
        timeout=timeout,
        poll_interval=poll_interval,
    )

    for conv in checkpoint.conversations.values():
        if conv.phase == "distilled":
            conv.memory_count_after = after.memory_count
            conv.phase = "processed"

    checkpoint.save()

    logger.info(
        "Processing complete: memories=%d (+%d), entities=%d, crystals=%d",
        after.memory_count,
        after.memory_count - before.memory_count,
        after.entity_count,
        after.crystal_count,
    )
def search_questions(
    questions: list[UnifiedQuestion],
    client: NmemClient,
    checkpoint: RunCheckpoint,
    search_mode: str = "normal",
    top_k: int = 10,
    max_content_len: int = 2000,
    on_progress: callable = None,
) -> None:
    """Search for context for each question via `nmem m search`.

    Results (content truncated to max_content_len chars to bound checkpoint
    size) and latency are stored on the question state. Already-searched
    questions are skipped; the checkpoint is saved every 20 questions.
    """
    total = len(questions)
    searched = 0

    for question in questions:
        state = checkpoint.get_question(question.question_id)

        if state.phase in ("searched", "answered", "evaluated"):
            searched += 1
            continue

        started = time.time()
        try:
            hits = client.memory_search(
                query=question.question,
                limit=top_k,
                mode=search_mode,
            )
            state.search_latency_ms = (time.time() - started) * 1000
            state.search_results = [
                {
                    "memory_id": hit.memory_id,
                    "title": hit.title,
                    "content": hit.content[:max_content_len],
                    "confidence": hit.confidence,
                }
                for hit in hits
            ]
            state.phase = "searched"
        except Exception as exc:
            state.error = f"Search failed: {exc}"
            logger.error("Search failed for %s: %s", question.question_id, exc)

        searched += 1
        if searched % 20 == 0 or searched == total:
            checkpoint.save()
            logger.info("[%d/%d] Searched", searched, total)

        if on_progress:
            on_progress(searched, total, question.question_id)

    checkpoint.save()
    logger.info("Search complete: %d questions", total)


def get_project_root() -> Path:
    """Get the nmem-bench project root directory."""
    return Path(__file__).resolve().parents[3]


def get_data_dir() -> Path:
    """Get (creating if needed) the data directory for downloaded datasets."""
    data_dir = get_project_root() / "data"
    data_dir.mkdir(parents=True, exist_ok=True)
    return data_dir
get_project_root() / "results" + d.mkdir(parents=True, exist_ok=True) + return d diff --git a/nowledge-mem-claude-code-plugin/.claude-plugin/plugin.json b/nowledge-mem-claude-code-plugin/.claude-plugin/plugin.json index 75d621d7a..74f4fc7a7 100644 --- a/nowledge-mem-claude-code-plugin/.claude-plugin/plugin.json +++ b/nowledge-mem-claude-code-plugin/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "nowledge-mem", - "description": "Personal knowledge graph for Claude Code β€” remembers decisions, searches past work, captures sessions", - "version": "0.7.0", + "description": "Personal knowledge graph for Claude Code \u2014 remembers decisions, searches past work, captures sessions", + "version": "0.7.1", "author": { "name": "Nowledge Labs", "email": "hello@nowledge-labs.ai", @@ -22,4 +22,4 @@ "knowledge-management", "cli" ] -} \ No newline at end of file +} diff --git a/nowledge-mem-claude-code-plugin/CHANGELOG.md b/nowledge-mem-claude-code-plugin/CHANGELOG.md index 43a24ab43..9f766eb30 100644 --- a/nowledge-mem-claude-code-plugin/CHANGELOG.md +++ b/nowledge-mem-claude-code-plugin/CHANGELOG.md @@ -5,6 +5,13 @@ All notable changes to the Nowledge Mem Claude Code plugin will be documented in The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
+## [0.7.1] - 2026-03-09 + +### Changed + +- Clarified the plugin's remote-mode contract so the Stop hook is documented as local client-side transcript capture through `nmem t save --from claude-code`, not server-side filesystem access +- Tightened README wording to match the unified Nowledge memory lifecycle: Working Memory, routed recall, real thread save, distillation, and honest handoff semantics + ## [0.7.0] - 2026-03-04 ### Added @@ -77,4 +84,3 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - N/A (initial release) - diff --git a/nowledge-mem-claude-code-plugin/README.md b/nowledge-mem-claude-code-plugin/README.md index c267bab53..368e2e1ec 100644 --- a/nowledge-mem-claude-code-plugin/README.md +++ b/nowledge-mem-claude-code-plugin/README.md @@ -26,13 +26,13 @@ On Windows/Linux with the Nowledge Mem desktop app, `nmem` is already bundled. **Automatic (no action needed):** - Working Memory briefing loaded at every session start, resume, and clear -- Per-turn behavioral nudge with search/save syntax +- Per-turn behavioral nudge with memory search, thread search, and save syntax - Session conversations captured to your knowledge graph on each response - Context recovered after compaction events **Autonomous skills (Claude acts on its own):** -- **Search Memory** -- searches past work when you reference it +- **Search Memory** -- searches both distilled memories and prior sessions when continuity matters - **Distill Memory** -- suggests saving breakthroughs and decisions - **Read Working Memory** -- loads your daily context briefing @@ -76,6 +76,8 @@ The plugin works transparently in both modes: You can also use environment variables (`NMEM_API_URL`, `NMEM_API_KEY`), which override the config file. +In remote mode, the Stop hook still reads Claude session files locally through `nmem t save --from claude-code` on the machine where Claude Code is running, then uploads the normalized messages to Mem. 
The remote Mem server does not need direct access to your `~/.claude` directory. + ## Update ```bash diff --git a/nowledge-mem-claude-code-plugin/commands/search.md b/nowledge-mem-claude-code-plugin/commands/search.md index 6ca741877..8944b3dc7 100644 --- a/nowledge-mem-claude-code-plugin/commands/search.md +++ b/nowledge-mem-claude-code-plugin/commands/search.md @@ -39,5 +39,7 @@ Returns matching memories with: ## Usage Tips - Use specific keywords that match stored memory titles -- Check `source_thread` to view the full original conversation +- If the user is asking about a prior conversation or session, also try `nmem --json t search "$ARGUMENTS" --limit 5` +- If a result has `source_thread`, inspect that thread progressively with `nmem --json t show --limit 8 --offset 0 --content-limit 1200` +- Page with a higher `--offset` only when more messages are actually needed - Higher scores indicate better semantic matches diff --git a/nowledge-mem-claude-code-plugin/skills/distill-memory/SKILL.md b/nowledge-mem-claude-code-plugin/skills/distill-memory/SKILL.md index 00f3d3a63..c5669b84b 100644 --- a/nowledge-mem-claude-code-plugin/skills/distill-memory/SKILL.md +++ b/nowledge-mem-claude-code-plugin/skills/distill-memory/SKILL.md @@ -38,6 +38,12 @@ nmem m add "Insight + context for future use" \ -i 0.8 ``` +If an existing memory already captures the same decision, workflow, or preference and the new information refines it, update that memory instead of creating a duplicate: + +```bash +nmem m update -t "Updated title" +``` + **Content:** Outcome/insight focus, include "why", enough context **Importance:** 0.8-1.0 major | 0.5-0.7 useful | 0.3-0.4 minor diff --git a/nowledge-mem-claude-code-plugin/skills/save-thread/SKILL.md b/nowledge-mem-claude-code-plugin/skills/save-thread/SKILL.md index c4a7d36a3..c7ffb5b02 100644 --- a/nowledge-mem-claude-code-plugin/skills/save-thread/SKILL.md +++ b/nowledge-mem-claude-code-plugin/skills/save-thread/SKILL.md @@ -1,6 +1,6 @@ --- 
name: Save Thread -description: Save complete conversation as checkpoint. Only when user explicitly requests ("save session", "checkpoint this"). Use nmem t save to automatically import Claude Code sessions. +description: Save the real Claude Code session messages only when the user explicitly requests it. Use nmem t save to import the recorded session, not a summary-only checkpoint. --- # Save Thread @@ -44,7 +44,7 @@ nmem t save --from claude-code -p /path/to/project ## Thread vs Memory -Thread = full history | Memory = distilled insights (different purposes, can do both) +Thread = real session messages | Memory = distilled insights (different purposes, can do both) ## Response diff --git a/nowledge-mem-claude-code-plugin/skills/search-memory/SKILL.md b/nowledge-mem-claude-code-plugin/skills/search-memory/SKILL.md index f50a1a759..848280491 100644 --- a/nowledge-mem-claude-code-plugin/skills/search-memory/SKILL.md +++ b/nowledge-mem-claude-code-plugin/skills/search-memory/SKILL.md @@ -55,6 +55,20 @@ nmem --json m search "meeting notes" -t week **JSON Response:** Parse `memories` array, check `score` field for relevance +Use thread search when the user is really asking about a prior conversation, previous session, or exact discussion: + +```bash +nmem --json t search "query" --limit 5 +``` + +If a memory result includes `source_thread` or thread search finds the likely conversation, inspect it progressively instead of loading the whole thread at once: + +```bash +nmem --json t show --limit 8 --offset 0 --content-limit 1200 +``` + +Increase `--offset` only when more messages are actually needed. 
+ **Scores:** 0.6-1.0 direct | 0.3-0.6 related | <0.3 skip **Examples:** diff --git a/nowledge-mem-codex-prompts/AGENTS.md b/nowledge-mem-codex-prompts/AGENTS.md new file mode 100644 index 000000000..7a11475ea --- /dev/null +++ b/nowledge-mem-codex-prompts/AGENTS.md @@ -0,0 +1,68 @@ +# Nowledge Mem for Codex CLI + +You have access to the user's Nowledge Mem through the `nmem` CLI. + +Use the installed prompts when convenient, but compose direct `nmem` commands whenever that is clearer or more efficient. + +## Working Memory + +At session start, or when recent priorities would help, read Working Memory: + +```bash +nmem --json wm read +``` + +If it returns `exists: false`, say there is no briefing yet and continue normally. + +Only fall back to `~/ai-now/memory.md` for older local-only setups. + +## Search Memory + +Search when: + +- the task connects to previous work +- the user asks why a decision was made +- the bug or design resembles a past issue +- durable context would improve the answer + +Start with: + +```bash +nmem --json m search "query" +``` + +Use `--mode deep` when the need is conceptual, historical, or the first pass is weak. + +When the user is asking about a past conversation or previous session, search threads directly too: + +```bash +nmem --json t search "query" --limit 5 +``` + +If a memory result includes `source_thread`, or thread search finds the likely conversation, inspect it progressively instead of loading everything at once: + +```bash +nmem --json t show --limit 8 --offset 0 --content-limit 1200 +``` + +## Distill Memory + +When the conversation produces a durable insight, decision, lesson, or procedure, save an atomic memory with `nmem --json m add`. + +Prefer high-signal memories over routine chatter. + +## Save Session + +Only save the Codex session when the user explicitly asks. + +Use: + +```bash +nmem --json t save --from codex -p . -s "Brief summary of what was accomplished" +``` + +This saves the real Codex session messages. 
The summary is only metadata. + +## Remote Setup + +For remote Mem, prefer `~/.nowledge-mem/config.json`. Use `NMEM_API_URL` and `NMEM_API_KEY` only for temporary shell-level overrides. diff --git a/nowledge-mem-codex-prompts/README.md b/nowledge-mem-codex-prompts/README.md index 65d3b7f93..d96fc9807 100644 --- a/nowledge-mem-codex-prompts/README.md +++ b/nowledge-mem-codex-prompts/README.md @@ -1,97 +1,125 @@ -# Nowledge Mem Custom Prompts for Codex +# Nowledge Mem for Codex CLI -Custom prompts to save your Codex sessions or create memory entries to Nowledge Mem. +> Memory-aware custom prompts for Codex CLI, with an optional project `AGENTS.md` companion for stronger default behavior. + +Codex does not yet have the same packaged extension surface as Gemini CLI. The stable Codex-native path today is: + +- install reusable custom prompts in `~/.codex/prompts` +- optionally merge this package's `AGENTS.md` into your project root +- let `nmem` handle local and remote memory operations directly + +That keeps the integration sharp, durable, and easy to reason about. + +## Memory Lifecycle + +This package follows the same core flow as the richer native integrations: + +1. read Working Memory for current priorities +2. route recall across memories and threads +3. save the real session when the user asks +4. distill durable knowledge from the work ## Quick Install -> Fresh install: +Fresh install: ```bash curl -fsSL https://raw.githubusercontent.com/nowledge-co/community/main/nowledge-mem-codex-prompts/install.sh | bash ``` -> Update install: +Update install: ```bash curl -fsSL https://raw.githubusercontent.com/nowledge-co/community/main/nowledge-mem-codex-prompts/install.sh -o /tmp/install.sh && bash /tmp/install.sh --force && rm /tmp/install.sh ``` -## Available Commands - -### `/prompts:read_working_memory` - -Load your daily Working Memory briefing β€” focus areas, priorities, flags, and recent activity. Run at session start for context. 
+## What You Get -### `/prompts:save_session` +### Custom prompts -Save your current Codex session to Nowledge. Lists available sessions and lets you choose which one to save. +- `/prompts:read_working_memory` +- `/prompts:search_memory` +- `/prompts:save_session` +- `/prompts:distill` -### `/prompts:distill` +### Project guidance -Analyze your conversation and create structured memory entries with key insights and learnings. +- `AGENTS.md` you can copy or merge into your project root to teach Codex when to read Working Memory, search memory, distill high-value insights, and save real sessions. -## Prerequisites +## Recommended Setup -1. **nmem CLI**: Use `uvx --from nmem-cli nmem` (recommended) or install with `pip install nmem-cli` -2. **jq**: Install with `brew install jq` (macOS) or `sudo apt install jq` (Debian/Ubuntu) +### 1. Make sure `nmem` is available -### nmem CLI Setup +If Nowledge Mem is already running on the same machine through the desktop app, the cleanest setup is **Settings -> Preferences -> Developer Tools -> Install CLI**. -**Option 1: uvx (Recommended - No Installation Required)** - -Use `uvx` to run `nmem` without installing it: +You can also install `nmem` standalone: ```bash -# Install uv if not already installed +# Option 1: uvx curl -LsSf https://astral.sh/uv/install.sh | sh - -# Run nmem directly (downloads automatically on first use) uvx --from nmem-cli nmem --version -``` -**Benefits:** -- No manual installation or updates needed -- Isolated from system Python -- Cached for fast startup (use `uvx --refresh --from nmem-cli nmem` to update) -- Works on macOS, Linux, and Windows +# Option 2: pip +pip install nmem-cli +``` -**Option 2: pip/pipx (Traditional Installation)** +Verify it: ```bash -# Using pip -pip install nmem-cli +nmem status +``` + +### 2. 
Configure remote Mem the durable way when needed -# Or using pipx for isolated installation -pipx install nmem-cli +Preferred long-term remote setup: + +```json +{ + "apiUrl": "https://mem.example.com", + "apiKey": "nmem_your_key" +} ``` -Verify installation: +Save it to: -```bash -nmem --version -# or -uvx --from nmem-cli nmem --version +```text +~/.nowledge-mem/config.json ``` -**Note**: -- On Windows/Linux with Nowledge Mem Desktop app installed, `nmem` is bundled -- On macOS or when using Mem as a remote server, use `uvx` or install manually -- The CLI connects to your Nowledge Mem instance at `http://localhost:14242` by default +`nmem` resolves connection settings in this order: -## Troubleshooting +1. `--api-url` +2. `NMEM_API_URL` / `NMEM_API_KEY` +3. `~/.nowledge-mem/config.json` +4. defaults -- **"Command not found: uvx"** β†’ Install uv with `curl -LsSf https://astral.sh/uv/install.sh | sh` -- **"Command not found: nmem"** β†’ Use `uvx --from nmem-cli nmem` or install with `pip install nmem-cli` -- **"Command not found: jq"** β†’ Install jq using your package manager -- **"Cannot connect to server"** β†’ Ensure Nowledge Mem is running at `http://localhost:14242` -- **Sessions not listing** β†’ Ensure you're in the correct project directory +### 3. Optionally merge `AGENTS.md` into your project + +If your project already has an `AGENTS.md`, merge the Nowledge section into it instead of overwriting the file. 
## Manual Install +Install prompts: + ```bash mkdir -p ~/.codex/prompts cd ~/.codex/prompts curl -O https://raw.githubusercontent.com/nowledge-co/community/main/nowledge-mem-codex-prompts/read_working_memory.md +curl -O https://raw.githubusercontent.com/nowledge-co/community/main/nowledge-mem-codex-prompts/search_memory.md curl -O https://raw.githubusercontent.com/nowledge-co/community/main/nowledge-mem-codex-prompts/save_session.md curl -O https://raw.githubusercontent.com/nowledge-co/community/main/nowledge-mem-codex-prompts/distill.md ``` + +Then copy or merge the project guidance file: + +```bash +curl -O https://raw.githubusercontent.com/nowledge-co/community/main/nowledge-mem-codex-prompts/AGENTS.md +``` + +## Troubleshooting + +- **"Command not found: uvx"** β†’ Install uv with `curl -LsSf https://astral.sh/uv/install.sh | sh` +- **"Command not found: nmem"** β†’ Use `uvx --from nmem-cli nmem ...` or install with `pip install nmem-cli` +- **"Cannot connect to server"** β†’ Check `nmem status` and verify `~/.nowledge-mem/config.json` for remote setups +- **Prompts do not appear in Codex** β†’ Restart Codex CLI after installation +- **Sessions not listing** β†’ Make sure you are saving from the same project directory used in Codex diff --git a/nowledge-mem-codex-prompts/distill.md b/nowledge-mem-codex-prompts/distill.md index 74b226631..10c4321c1 100644 --- a/nowledge-mem-codex-prompts/distill.md +++ b/nowledge-mem-codex-prompts/distill.md @@ -1,64 +1,28 @@ --- -description: Analyze conversation and create structured memory entries +description: Distill durable insights from the current Codex conversation into Nowledge Mem --- -Analyze our current conversation and create structured memory entries for important information. Follow this systematic approach: +Distill the most valuable insights from the current Codex conversation into Nowledge Mem. -**Analysis Process:** +## Workflow -1. 
**Content Review**: Examine our conversation to identify: - - Key insights, decisions, and learnings - - Important details, names, and dates - - Actionable items and outcomes - - Information valuable for future reference +1. Identify 1-3 durable insights, decisions, lessons, procedures, or preferences. +2. Skip routine chatter, unresolved half-ideas, and low-signal implementation noise. +3. If a memory likely already exists, search first instead of creating a duplicate. +4. Use `nmem --json m add` for each selected memory. +5. Use strong titles, a fitting `--unit-type`, and 0-3 labels only when they improve retrieval. +6. Set `-s codex` so the capture path stays auditable. -2. **Memory Structuring**: For each significant piece of information, create: - - **Concise Title** (max 60 characters): Captures the essence and is searchable - - **Structured Summary**: Preserves key details, uses clear language, includes actionable items - - **Importance Score** (0.1-1.0): Based on significance and future value - - **Relevant Labels**: 2-4 labels using lowercase with hyphens (e.g., work, meeting, python, decision) +## Importance Guide -3. **Quality Standards**: - - Write in clear, professional language - - Use specific details (names, dates, technical terms) - - Structure with bullet points or numbered lists when appropriate - - Ensure memories are standalone and understandable out of context - - Avoid redundancy across memories +- `0.6-0.7`: useful but routine durable knowledge +- `0.8-0.9`: major lesson, decision, or breakthrough +- `1.0`: rare, foundational memory -**Memory Creation**: - -Use the `nmem` CLI to create memories for each important piece of information: - -```bash -nmem m add "Bullet list or short paragraph capturing the key insight" \ - --title "Brief, searchable title" \ - --importance 0.8 \ - --unit-type learning \ - -l topic1 -l topic2 -``` - -Available unit types: `fact`, `preference`, `decision`, `plan`, `procedure`, `learning`, `context`, `event`. 
- -**Example:** +## Example ```bash -nmem m add "Implemented JWT-based authentication with refresh tokens. Access tokens expire in 15min, refresh tokens in 7 days. Tokens stored in httpOnly cookies for security." \ - --title "JWT Authentication Implementation" \ - --importance 0.9 \ - --unit-type decision \ - -l auth -l security +nmem --json m add "JWT refresh failures came from gateway and API clock skew. Keep refresh verification in the API layer and validate expiry against remote sessions." --title "JWT refresh failures traced to clock skew" --importance 0.9 --unit-type learning -l auth -l backend -s codex ``` -**For programmatic use**, add `--json` flag to verify creation: - -```bash -nmem --json m add "Content here" --title "Title" --importance 0.8 -``` - -**Guidelines:** - -- **High importance** (0.8-1.0): Critical decisions, major insights, key learnings -- **Medium importance** (0.5-0.7): Useful details, preferences, contextual information -- **Lower importance** (0.1-0.4): Minor notes, exploratory discussions - -Create 2-5 focused memories that capture the essential takeaways from our conversation. +After saving, report what was stored, which unit types were used, and why each memory was worth keeping. diff --git a/nowledge-mem-codex-prompts/install.sh b/nowledge-mem-codex-prompts/install.sh index ec90f8ee5..80c4110d3 100755 --- a/nowledge-mem-codex-prompts/install.sh +++ b/nowledge-mem-codex-prompts/install.sh @@ -71,7 +71,7 @@ install_prompt() { # Install prompts with error handling failed=0 -for prompt in "read_working_memory.md" "save_session.md" "distill.md"; do +for prompt in "read_working_memory.md" "search_memory.md" "save_session.md" "distill.md"; do if ! 
install_prompt "$prompt"; then failed=1 fi @@ -176,3 +176,6 @@ echo "" echo "πŸ“‹ Installed prompts:" ls -lh "$PROMPTS_DIR"/*.md 2>/dev/null | awk '{print " " $9}' | sed "s|$PROMPTS_DIR/| |" || echo " No prompts installed" echo "" + +echo "πŸ’‘ Optional: copy or merge AGENTS.md from the package into your project root for stronger default memory behavior." +echo " https://raw.githubusercontent.com/nowledge-co/community/main/nowledge-mem-codex-prompts/AGENTS.md" diff --git a/nowledge-mem-codex-prompts/read_working_memory.md b/nowledge-mem-codex-prompts/read_working_memory.md index 8e0234d1f..78162a3ff 100644 --- a/nowledge-mem-codex-prompts/read_working_memory.md +++ b/nowledge-mem-codex-prompts/read_working_memory.md @@ -2,31 +2,24 @@ description: Load your daily Working Memory briefing for session context --- -Read your daily Working Memory briefing to understand current context, priorities, and recent activity. +Load the user's Working Memory briefing before continuing. -## Usage +## Workflow + +Use: ```bash -cat ~/ai-now/memory.md 2>/dev/null || echo "Working Memory not available. Ensure Nowledge Mem Background Intelligence is enabled." +nmem --json wm read ``` -## What You'll See - -Working Memory is a daily briefing generated by Nowledge Mem's Background Intelligence. It contains: - -- **Focus areas** β€” topics you've been actively working on -- **Priorities and flags** β€” contradictions, stale knowledge, items needing verification -- **Recent activity** β€” memories created, evolved, or crystallized -- **Deep links** β€” `nowledgemem://memory/` links to referenced memories +If the command succeeds but reports `exists: false`, say there is no Working Memory briefing yet. 
-## When to Use +Only if `nmem` is unavailable in an older local-only setup, fall back to: -- **Session start** β€” read this first to understand what the user has been working on -- **After context compaction** β€” re-read to restore awareness of current priorities -- **Before making decisions** β€” check if there are relevant flags or recent insights +```bash +cat ~/ai-now/memory.md +``` -## Notes +Then summarize the user's active focus areas, priorities, unresolved flags, and the most relevant recent changes when a briefing is actually present. -- Working Memory lives at `~/ai-now/memory.md` β€” it's a plain markdown file -- Updated automatically each morning by Background Intelligence -- If not available, the user may not have Background Intelligence enabled or configured a remote LLM +If remote access is configured through `~/.nowledge-mem/config.json`, let `nmem` use it naturally. diff --git a/nowledge-mem-codex-prompts/save_session.md b/nowledge-mem-codex-prompts/save_session.md index 2b4b2462c..2e9f993f0 100644 --- a/nowledge-mem-codex-prompts/save_session.md +++ b/nowledge-mem-codex-prompts/save_session.md @@ -1,62 +1,38 @@ --- -description: Save current Codex session to Nowledge knowledge base +description: Save the real current Codex session to Nowledge Mem --- -Use the `nmem t save` command to automatically save the current Codex session: +Save the real Codex session to Nowledge Mem with `nmem t save`. ## Basic Usage ```bash -# Save current session -nmem t save --from codex +# Save current session from the current project +nmem --json t save --from codex -p . -# Save with summary -nmem t save --from codex -s "Brief summary of what we accomplished" +# Save with a short summary +nmem --json t save --from codex -p . -s "Brief summary of what we accomplished" ``` -## Advanced Options +## Workflow -If you need to save a specific session or all sessions: +1. Write a concise 1-2 sentence summary of what was accomplished. +2. 
Save the session with: ```bash -# List available sessions for current directory -find ~/.codex/sessions -name "rollout-*.jsonl" -exec sh -c ' - cwd=$(pwd) - meta=$(head -n1 "$1" | jq -r "select(.payload.cwd == \"$cwd\") | .payload") - if [ -n "$meta" ]; then - id=$(echo "$meta" | jq -r ".id") - ts=$(echo "$meta" | jq -r ".timestamp") - preview=$(head -n20 "$1" | jq -r "select(.type == \"event_msg\" and .payload.type == \"user_message\" and .payload.kind == \"plain\") | .payload.message" | head -n1 | cut -c1-80) - echo "$id | $ts | ${preview:-}" - fi -' _ {} \; | sort -r - -# Save specific session by ID -nmem t save --from codex --session-id -s "Summary" - -# Save all sessions for current project -nmem t save --from codex -m all +nmem --json t save --from codex -p . -s "Your summary here" ``` -## Workflow - -1. **Analyze our conversation** and create a concise 1-2 sentence summary of what we accomplished +3. Report whether the thread was created or updated, how many messages were stored, and the thread id. -2. **Save the session** using the command: - ```bash - nmem t save --from codex -s "Your summary here" - ``` - -3. **Confirm the save** - The command will show: - - Thread ID (e.g., `codex-abc123`) - - Number of messages saved - - Whether it was created or appended - -## Example +## Advanced Options ```bash -# After completing work on authentication -nmem t save --from codex -s "Implemented JWT authentication with refresh tokens" +# Save a specific session by ID +nmem --json t save --from codex -p . --session-id -s "Summary" + +# Save all sessions for the current project +nmem --json t save --from codex -p . -m all ``` -**Note:** The command is idempotent - re-running it will only append new messages, preventing duplicates. +This is a real session import. Do not replace it with `t create`. 
diff --git a/nowledge-mem-codex-prompts/search_memory.md b/nowledge-mem-codex-prompts/search_memory.md new file mode 100644 index 000000000..840fc2ae3 --- /dev/null +++ b/nowledge-mem-codex-prompts/search_memory.md @@ -0,0 +1,39 @@ +--- +description: Search Nowledge Mem for relevant prior work before answering +--- + +Search Nowledge Mem for the current task. + +## Workflow + +1. Rewrite the request into a short retrieval query rather than copying a long prompt verbatim. +2. Use: + +```bash +nmem --json m search "best query here" +``` + +3. If the need is conceptual, historical, or the first search is weak, try a second pass with: + +```bash +nmem --json m search "best query here" --mode deep +``` + +4. If the user is asking about a prior conversation, a previous session, or an exact discussion, use thread search too: + +```bash +nmem --json t search "best query here" --limit 5 +``` + +5. If a memory result includes `source_thread` or thread search returns the likely conversation, inspect it progressively: + +```bash +nmem --json t show --limit 8 --offset 0 --content-limit 1200 +``` + +6. Add filters only when the task clearly implies them: + - labels for project or domain scope + - `--importance` for high-signal recall + - `--event-from` / `--recorded-from` when time matters + +Summarize only the strongest matches, avoid dumping huge threads, and clearly say when nothing relevant was found. 
diff --git a/nowledge-mem-cursor-plugin/.cursor-plugin/plugin.json b/nowledge-mem-cursor-plugin/.cursor-plugin/plugin.json new file mode 100644 index 000000000..662bb54bb --- /dev/null +++ b/nowledge-mem-cursor-plugin/.cursor-plugin/plugin.json @@ -0,0 +1,22 @@ +{ + "name": "nowledge-mem", + "description": "Bring Working Memory, memory recall, and handoff summaries into Cursor with Nowledge Mem.", + "version": "0.1.1", + "author": { + "name": "Nowledge Labs", + "email": "hello@nowledge-labs.ai", + "url": "https://nowledge-labs.ai" + }, + "homepage": "https://mem.nowledge.co/docs/integrations/cursor", + "repository": "https://github.com/nowledge-co/community/tree/main/nowledge-mem-cursor-plugin", + "license": "MIT", + "keywords": [ + "cursor", + "memory", + "knowledge-graph", + "mcp", + "nowledge" + ], + "displayName": "Nowledge Mem", + "logo": "assets/logo.png" +} diff --git a/nowledge-mem-cursor-plugin/.mcp.json b/nowledge-mem-cursor-plugin/.mcp.json new file mode 100644 index 000000000..0ebb5d3d0 --- /dev/null +++ b/nowledge-mem-cursor-plugin/.mcp.json @@ -0,0 +1,8 @@ +{ + "mcpServers": { + "nowledge-mem": { + "url": "http://localhost:14242/mcp", + "type": "streamableHttp" + } + } +} diff --git a/nowledge-mem-cursor-plugin/CHANGELOG.md b/nowledge-mem-cursor-plugin/CHANGELOG.md new file mode 100644 index 000000000..92f66bad7 --- /dev/null +++ b/nowledge-mem-cursor-plugin/CHANGELOG.md @@ -0,0 +1,26 @@ +# Changelog + +## [0.1.1] - 2026-03-09 + +### Added + +- Repository-level `.cursor-plugin/marketplace.json` for official Cursor Marketplace submission from the multi-integration `community` repository +- Local validator for plugin structure, MCP config, rule semantics, and marketplace manifest wiring +- Release guide for Cursor Marketplace submission and manual IDE validation + +### Changed + +- Tightened marketplace-facing plugin metadata with a docs-specific homepage, package-specific repository URL, explicit keywords, a display name, and an in-package logo asset 
+- Clarified the rule contract so `save-handoff` is named explicitly and `save-thread` remains unavailable until Cursor has a real Nowledge live session importer +- Clarified the README around local validation and release workflow + +## [0.1.0] - 2026-03-09 + +### Added + +- Initial Cursor plugin package for Nowledge Mem +- Cursor plugin manifest with `.cursor-plugin/plugin.json` +- Bundled `.mcp.json` for local Nowledge Mem MCP connectivity +- Always-on Cursor rule for Working Memory, routed recall, distillation, and handoff semantics +- Four skills: `read-working-memory`, `search-memory`, `distill-memory`, and `save-handoff` +- Explicit design constraint that keeps `save-thread` unavailable until Cursor has a real Nowledge live session importer diff --git a/nowledge-mem-cursor-plugin/README.md b/nowledge-mem-cursor-plugin/README.md new file mode 100644 index 000000000..c0535c93d --- /dev/null +++ b/nowledge-mem-cursor-plugin/README.md @@ -0,0 +1,80 @@ +# Nowledge Mem for Cursor + +> Cursor-native plugin package for Nowledge Mem: MCP-backed recall, Working Memory, distillation, and resumable handoffs. + +This package follows Cursor's plugin format with `.cursor-plugin/plugin.json`, bundled rules, skills, and `.mcp.json` server config. + +## What You Get + +- MCP-backed `read_working_memory`, `memory_search`, `thread_search`, `thread_fetch_messages`, `memory_add`, and `memory_update` +- Cursor rules for Working Memory timing, proactive recall, retrieval routing, and add-vs-update behavior +- Four skills: `read-working-memory`, `search-memory`, `distill-memory`, and `save-handoff` +- A clear lifecycle: Working Memory, routed recall, distillation, and resumable handoffs + +## Important Constraint + +This plugin does **not** expose `save-thread` yet. + +Cursor does not currently have a first-class Nowledge live session importer in this package, so a summary-only action must stay named `save-handoff`, not `save-thread`. 
+ +## Plugin Structure + +```text +.cursor-plugin/plugin.json +rules/nowledge-mem.mdc +skills/*/SKILL.md +.mcp.json +``` + +## MCP Setup + +The plugin ships a local default `.mcp.json`: + +```json +{ + "mcpServers": { + "nowledge-mem": { + "url": "http://localhost:14242/mcp", + "type": "streamableHttp" + } + } +} +``` + +For remote Mem, adjust the MCP server URL and headers using Cursor's MCP configuration flow. + +## Optional CLI For Handoffs + +If Nowledge Mem is running on the same machine through the desktop app, install `nmem` from **Settings -> Preferences -> Developer Tools -> Install CLI**. + +That enables the `save-handoff` skill to create resumable handoff threads with `nmem --json t create`. + +## Why The Rules Matter + +Cursor can see the tools, but the bundled rules and skills tell it when to use them: + +- read Working Memory once near the beginning of a session +- search proactively when the user references previous work or a similar bug +- search threads only when exact prior conversation history matters +- update an existing memory instead of duplicating it when the same decision evolves + +## Validate Locally + +Run the package validator before manual Cursor testing or marketplace submission: + +```bash +cd community/nowledge-mem-cursor-plugin +node scripts/validate-plugin.mjs +``` + +## Install + +This package is prepared for Cursor's plugin format and Cursor Marketplace review. Cursor's public docs currently center the marketplace install path and repository submission flow, not a stable documented local folder-loader. Treat this directory as the source-of-truth package for manual validation before publish, then install through the marketplace once the listing is live. + +Release and submission notes live in [`RELEASING.md`](./RELEASING.md). + +## Why This Design + +- MCP is the strongest native execution layer Cursor exposes today for Nowledge Mem. +- Rules and skills add the lifecycle guidance that plain MCP config lacks. 
+- Handoffs stay separate from real thread save so the product contract remains correct. diff --git a/nowledge-mem-cursor-plugin/RELEASING.md b/nowledge-mem-cursor-plugin/RELEASING.md new file mode 100644 index 000000000..cd77d7a81 --- /dev/null +++ b/nowledge-mem-cursor-plugin/RELEASING.md @@ -0,0 +1,74 @@ +# Releasing the Cursor Plugin + +This package lives inside the `community` repository, so Cursor Marketplace readiness must account for two layers: + +- the plugin package itself at `nowledge-mem-cursor-plugin/` +- the repository-level `.cursor-plugin/marketplace.json` manifest Cursor uses for multi-plugin repositories + +## Why This Release Path + +Cursor's plugin docs support repositories that contain multiple plugins. + +Because `community` contains many integrations and only one Cursor plugin package today, the repository root now carries `.cursor-plugin/marketplace.json` and points `nowledge-mem` at `nowledge-mem-cursor-plugin/`. + +That keeps the package clean while making the repository submission path explicit. 
+ +## Validate Locally + +```bash +cd community/nowledge-mem-cursor-plugin +node scripts/validate-plugin.mjs +``` + +This validator checks: + +- required plugin files exist and are non-empty +- `.cursor-plugin/plugin.json` has the core marketplace metadata +- `.mcp.json` has a valid `nowledge-mem` server entry +- the rule still documents honest `save-handoff` / `save-thread` semantics +- the repository-level `.cursor-plugin/marketplace.json` points to this package + +## Manual Readiness Checks + +These still require a real Cursor IDE validation pass before submission: + +- manually validate the package in Cursor IDE using your current internal development workflow for unpublished plugins +- confirm the rule is applied and all four skills are discovered +- confirm MCP connects locally +- confirm remote MCP configuration works when URL and headers are updated +- confirm `save-handoff` works when `nmem` is present +- confirm the package does not expose or claim `save-thread` + +## Marketplace Submission + +Cursor's docs say to submit the repository link at: + +- `https://cursor.com/marketplace/publish` + +Before submitting, confirm the repository is public and that the root `.cursor-plugin/marketplace.json` still points to `nowledge-mem-cursor-plugin`. + +## Recommended Publish Form Values + +Use stable, package-specific values when filling the submission form: + +- Organization name: `Nowledge Labs` +- Organization handle: `nowledge-labs` +- Contact email: `hello@nowledge-labs.ai` +- GitHub repository: `https://github.com/nowledge-co/community` +- Website URL: `https://mem.nowledge.co/docs/integrations/cursor` +- Logo URL: `https://raw.githubusercontent.com/nowledge-co/community/main/nowledge-mem-cursor-plugin/assets/logo.png` +- Short description: `Bring Working Memory, memory recall, and handoff summaries into Cursor with Nowledge Mem.` + +Prefer the docs page over the site homepage for the website field so reviewers land directly on setup instructions. 
Prefer the raw GitHub logo URL over ephemeral upload links so the asset remains stable during review. + +## Submission Checklist + +- run `node scripts/validate-plugin.mjs` +- review `README.md` for accurate install/configuration instructions +- confirm `plugin.json` metadata is final: name, description, author, homepage, repository, license +- confirm the repository-level `.cursor-plugin/marketplace.json` is valid and committed +- manually test in Cursor IDE with local Mem (do not rely on an undocumented public folder-loader claim) +- manually test in Cursor IDE with remote Mem MCP configuration +- verify that `save-handoff` is present and `save-thread` is not claimed +- submit the repository link through Cursor's marketplace publish page +- verify the marketplace listing points users to the correct docs page diff --git a/nowledge-mem-cursor-plugin/assets/logo.png b/nowledge-mem-cursor-plugin/assets/logo.png new file mode 100644 index 000000000..9b863a26e Binary files /dev/null and b/nowledge-mem-cursor-plugin/assets/logo.png differ diff --git a/nowledge-mem-cursor-plugin/rules/nowledge-mem.mdc b/nowledge-mem-cursor-plugin/rules/nowledge-mem.mdc new file mode 100644 index 000000000..6e18a580c --- /dev/null +++ b/nowledge-mem-cursor-plugin/rules/nowledge-mem.mdc @@ -0,0 +1,43 @@ +--- +description: Nowledge Mem lifecycle for Cursor +globs: +alwaysApply: true +--- +Use Nowledge Mem as the primary external memory system in Cursor. + +## Working Memory + +- Read Working Memory once near session start, resume, or when the user explicitly asks about current priorities. +- After reading it, reuse that context mentally. Do not re-read on every turn unless the session context changed materially, the user asks for a refresh, or a long-running session clearly needs it. 
+ +## When To Recall Proactively + +Call recall tools when past knowledge would materially improve the answer, especially when: + +- the user says "continue", "resume", "what did we decide", "how did we fix this before", or otherwise references previous work +- a bug, refactor, migration, or feature resembles something solved earlier +- the user asks for rationale, preferences, procedures, or recurring workflow details +- the task returns to a named repo area, subsystem, feature, incident, or customer context +- the current result is ambiguous and prior decisions or conversation history would clarify it + +## Retrieval Routing + +- Use `memory_search` first for durable knowledge: decisions, procedures, lessons, preferences, and facts. +- Use `thread_search` when the user is asking about a prior discussion, previous session, or exact conversation history. +- If memory search returns a likely result with `source_thread_id`, or thread search identifies the right discussion, use `thread_fetch_messages` progressively instead of loading a huge conversation all at once. +- Prefer the smallest retrieval surface that answers the question. + +## Writing Rules + +- Use `memory_add` for new durable knowledge worth reusing later. +- If recall already surfaced the same decision, preference, or workflow and the new information refines or corrects it, use `memory_update` instead of creating a duplicate. +- Distill only decisions, procedures, lessons, or preferences that should remain useful after the current session ends. +- Never present a summary-only save as a real transcript import. +- Cursor does not yet expose a real Nowledge live session importer in this plugin, so do not claim `save-thread` exists here. +- Use `save-handoff` only when the user explicitly asks for a resumable checkpoint or handoff summary. + +## Response Contract + +- Be explicit about whether you relied on Working Memory, memory recall, or thread recall. 
+- Avoid repeated Working Memory reads or broad thread fetches when one targeted lookup is enough. +- When `nmem` is available and the user explicitly asks for a resumable handoff, create it through the terminal with `nmem --json t create ...`. diff --git a/nowledge-mem-cursor-plugin/scripts/validate-plugin.mjs b/nowledge-mem-cursor-plugin/scripts/validate-plugin.mjs new file mode 100644 index 000000000..89909b7ec --- /dev/null +++ b/nowledge-mem-cursor-plugin/scripts/validate-plugin.mjs @@ -0,0 +1,119 @@ +import { readFile } from 'node:fs/promises'; +import path from 'node:path'; +import process from 'node:process'; +import { fileURLToPath } from 'node:url'; + +const pluginRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); +const communityRoot = path.resolve(pluginRoot, '..'); + +function fail(message) { + console.error(`ERROR: ${message}`); + process.exit(1); +} + +async function readJson(filePath) { + return JSON.parse(await readFile(filePath, 'utf8')); +} + +function assertString(value, label) { + if (typeof value !== 'string' || value.trim() === '') { + fail(`${label} must be a non-empty string`); + } +} + +async function assertNonEmpty(relPath) { + const absPath = path.join(pluginRoot, relPath); + const text = await readFile(absPath, 'utf8'); + if (text.trim() === '') { + fail(`${relPath} must not be empty`); + } + return text; +} + +async function main() { + const manifest = await readJson(path.join(pluginRoot, '.cursor-plugin', 'plugin.json')); + const mcpConfig = await readJson(path.join(pluginRoot, '.mcp.json')); + const marketplace = await readJson(path.join(communityRoot, '.cursor-plugin', 'marketplace.json')); + + assertString(manifest.name, 'plugin.json name'); + assertString(manifest.version, 'plugin.json version'); + assertString(manifest.displayName ?? 
manifest.name, 'plugin.json displayName or name'); + assertString(manifest.description, 'plugin.json description'); + assertString(manifest.homepage, 'plugin.json homepage'); + assertString(manifest.repository, 'plugin.json repository'); + assertString(manifest.license, 'plugin.json license'); + + if (manifest.name !== manifest.name.toLowerCase() || manifest.name.includes(' ')) { + fail('plugin.json name must be lowercase and space-free'); + } + + if (!manifest.author || typeof manifest.author !== 'object') { + fail('plugin.json author must be an object'); + } + assertString(manifest.author.name, 'plugin.json author.name'); + + if (manifest.logo !== undefined) { + assertString(manifest.logo, 'plugin.json logo'); + if (path.isAbsolute(manifest.logo) || manifest.logo.includes('..')) { + fail('plugin.json logo must be a relative in-package path'); + } + await assertNonEmpty(manifest.logo); + } + + if (manifest.keywords !== undefined) { + if (!Array.isArray(manifest.keywords) || manifest.keywords.length === 0) { + fail('plugin.json keywords must be a non-empty array when present'); + } + } + + const requiredPaths = [ + '.cursor-plugin/plugin.json', + '.mcp.json', + 'README.md', + 'CHANGELOG.md', + 'RELEASING.md', + 'rules/nowledge-mem.mdc', + 'skills/read-working-memory/SKILL.md', + 'skills/search-memory/SKILL.md', + 'skills/distill-memory/SKILL.md', + 'skills/save-handoff/SKILL.md', + 'scripts/validate-plugin.mjs' + ]; + + for (const relPath of requiredPaths) { + await assertNonEmpty(relPath); + } + + const ruleText = await assertNonEmpty('rules/nowledge-mem.mdc'); + if (!ruleText.includes('save-handoff')) { + fail('rules/nowledge-mem.mdc must mention save-handoff'); + } + if (!ruleText.includes('save-thread')) { + fail('rules/nowledge-mem.mdc must explicitly clarify save-thread scope'); + } + + if (!mcpConfig.mcpServers || !mcpConfig.mcpServers['nowledge-mem']) { + fail('.mcp.json must declare mcpServers.nowledge-mem'); + } + const server = 
mcpConfig.mcpServers['nowledge-mem']; + assertString(server.url, '.mcp.json mcpServers.nowledge-mem.url'); + if (server.type !== 'streamableHttp') { + fail('.mcp.json mcpServers.nowledge-mem.type must be streamableHttp'); + } + + if (!marketplace.plugins || !Array.isArray(marketplace.plugins)) { + fail('community .cursor-plugin/marketplace.json must contain a plugins array'); + } + const entry = marketplace.plugins.find((plugin) => plugin.name === manifest.name); + if (!entry) { + fail('community marketplace manifest must include this plugin by name'); + } + const expectedSource = path.basename(pluginRoot); + if (entry.source !== expectedSource && entry.source !== `./${expectedSource}`) { + fail(`marketplace source for ${manifest.name} must be ${expectedSource} or ./${expectedSource}`); + } + + console.log('Validated Cursor plugin manifest, package structure, MCP config, and community marketplace manifest.'); +} + +await main(); diff --git a/nowledge-mem-cursor-plugin/skills/distill-memory/SKILL.md b/nowledge-mem-cursor-plugin/skills/distill-memory/SKILL.md new file mode 100644 index 000000000..be9d9cd57 --- /dev/null +++ b/nowledge-mem-cursor-plugin/skills/distill-memory/SKILL.md @@ -0,0 +1,24 @@ +--- +name: distill-memory +description: Capture durable decisions, lessons, and procedures from Cursor work into atomic memories. +--- + +# Distill Memory + +Capture only durable knowledge that should remain useful after the current session ends. + +## When To Save + +Use memory storage for: + +- decisions with rationale +- repeatable procedures +- lessons from debugging or incident work +- durable preferences or constraints + +## Add vs Update + +- Use `memory_add` when the insight is genuinely new. +- If recall already surfaced the same decision, workflow, or preference and the new information refines it, use `memory_update` instead of creating a duplicate. + +Prefer atomic, standalone memories with strong titles and clear meaning. 
Focus on what was learned or decided, not routine chatter. diff --git a/nowledge-mem-cursor-plugin/skills/read-working-memory/SKILL.md b/nowledge-mem-cursor-plugin/skills/read-working-memory/SKILL.md new file mode 100644 index 000000000..e1746cfb7 --- /dev/null +++ b/nowledge-mem-cursor-plugin/skills/read-working-memory/SKILL.md @@ -0,0 +1,23 @@ +--- +name: read-working-memory +description: Read the user's Working Memory briefing when current priorities, recent activity, or active focus areas matter. +--- + +# Read Working Memory + +Use the `read_working_memory` MCP tool to load the user's current focus, priorities, and unresolved context. + +## When To Use + +- At session start +- When resuming work after a break +- When the user asks what they are focused on now +- When the current task clearly depends on recent priorities or active initiatives + +## Usage Pattern + +- Read it once near the start of the session. +- Reuse that context mentally instead of re-reading on every turn. +- Only refresh if the session context changed materially, the user asks, or the work has gone on long enough that a fresh briefing is clearly useful. + +Summarize only the parts relevant to the current task. diff --git a/nowledge-mem-cursor-plugin/skills/save-handoff/SKILL.md b/nowledge-mem-cursor-plugin/skills/save-handoff/SKILL.md new file mode 100644 index 000000000..2f0ce60f3 --- /dev/null +++ b/nowledge-mem-cursor-plugin/skills/save-handoff/SKILL.md @@ -0,0 +1,18 @@ +--- +name: save-handoff +description: Save a resumable handoff summary only when the user explicitly asks. This is not a real transcript import. +--- + +# Save Handoff + +Use this only when the user explicitly asks for a checkpoint, resumable summary, or handoff. + +Cursor does not yet have a first-class Nowledge live session importer in this plugin, so do not claim a real thread save. 
+
+When `nmem` is available, create a structured handoff summary through the terminal:
+
+```bash
+nmem --json t create -t "Cursor Session - <topic>" -c "Goal: ... Decisions: ... Files: ... Risks: ... Next: ..." -s cursor
+```
+
+If `nmem` is unavailable, explain that plainly instead of pretending the handoff was saved.
diff --git a/nowledge-mem-cursor-plugin/skills/search-memory/SKILL.md b/nowledge-mem-cursor-plugin/skills/search-memory/SKILL.md
new file mode 100644
index 000000000..a01f24319
--- /dev/null
+++ b/nowledge-mem-cursor-plugin/skills/search-memory/SKILL.md
@@ -0,0 +1,27 @@
+---
+name: search-memory
+description: Route recall across memories and threads when past work would improve the response.
+---
+
+# Search Memory
+
+Use Nowledge Mem proactively when prior knowledge would materially improve the answer.
+
+## Strong Triggers
+
+Search when:
+
+- the user references previous work, a prior fix, or an earlier decision
+- the task resumes a named feature, bug, refactor, incident, or subsystem
+- a debugging pattern resembles something solved earlier
+- the user asks for rationale, preferences, procedures, or "how we usually do this"
+- the current result is ambiguous and past context would make the answer sharper
+
+## Retrieval Routing
+
+1. Start with `memory_search` for durable knowledge.
+2. Use `thread_search` for prior discussions, previous sessions, or exact conversation history.
+3. If a memory result includes `source_thread_id`, or thread search finds the likely conversation, use `thread_fetch_messages` progressively.
+4. Prefer the smallest retrieval surface that answers the question.
+
+Avoid over-reading long conversations when one page of messages is enough.
diff --git a/nowledge-mem-gemini-cli-extension/.gitignore b/nowledge-mem-gemini-cli-extension/.gitignore new file mode 100644 index 000000000..849ddff3b --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/.gitignore @@ -0,0 +1 @@ +dist/ diff --git a/nowledge-mem-gemini-cli-extension/CHANGELOG.md b/nowledge-mem-gemini-cli-extension/CHANGELOG.md new file mode 100644 index 000000000..6b07625a9 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/CHANGELOG.md @@ -0,0 +1,43 @@ +# Changelog + +All notable changes to the Nowledge Mem Gemini CLI extension will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.1.1] - 2026-03-09 + +### Changed + +- Clarified the remote-mode contract so real thread save is explicitly described as client-side transcript capture through `nmem`, not server-side filesystem access +- Sharpened release readiness around hook validation, release packaging, and user-facing documentation for the marketplace path +- Polished command, skill, README, website, and lifecycle wording so Working Memory, routed recall, real thread save, distillation, and handoff summaries present one coherent product model + +### Fixed + +- Corrected the Gemini hook schema to use the top-level `hooks` object Gemini expects during extension linking +- Ensured `save-handoff` remains the resumable-summary surface while `save-thread` remains reserved for the real Gemini transcript importer + +## [0.1.0] - 2026-03-07 + +### Added + +- Initial Gemini CLI extension for Nowledge Mem +- Persistent `GEMINI.md` context for working memory, search, distillation, thread-save, and handoff behavior +- Six custom commands: read working memory, search memory, distill memory, save thread, save handoff, and status +- Five agent skills: `read-working-memory`, `search-memory`, `distill-memory`, `save-thread`, and `save-handoff` +- Native 
Gemini thread-save guidance built around `nmem t save --from gemini-cli` +- Support for `NMEM_API_URL` and `NMEM_API_KEY` shell-level overrides alongside durable `nmem` config-file setup +- CLI-first remote setup guidance built around `~/.nowledge-mem/config.json` +- Release packaging, checksum generation, and GitHub Actions automation for tagged Gemini extension releases +- Release documentation and versioned release notes for reproducible Gemini marketplace publishing +- Native Gemini extension hooks for Working Memory bootstrap at session start and best-effort thread import at session end + +### Changed + +- Tightened the Gemini manifest to better match Gemini CLI extension reference guidance, including a gallery-facing description field +- Improved search, distillation, thread-save, and handoff prompts to prefer stronger `nmem` patterns like JSON mode, `--mode deep`, `--unit-type`, labels, and `-s gemini-cli` +- Corrected Gemini save semantics by restoring `save-thread` for real session import and keeping `save-handoff` as the distinct resumable-summary action +- Refined handoff guidance around a structured summary format with Goal, Decisions, Files, Risks, and Next +- Fixed collapsed multiline command examples in the Gemini skills so the shipped prompts and docs stay clear and copyable +- Clarified same-machine desktop setup, local-default status expectations, Working Memory empty-state behavior, Gemini install UX for local users, and the cross-agent memory lifecycle contract diff --git a/nowledge-mem-gemini-cli-extension/GEMINI.md b/nowledge-mem-gemini-cli-extension/GEMINI.md new file mode 100644 index 000000000..a9e2aacfa --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/GEMINI.md @@ -0,0 +1,152 @@ +# Nowledge Mem for Gemini CLI + +You have access to the user's Nowledge Mem through the `nmem` CLI. + +This integration is intentionally CLI-first. 
Use the bundled commands when convenient, but compose direct `nmem` commands whenever that is clearer, more precise, or more efficient. + +## Core Memory Lifecycle + +Treat Nowledge Mem as four linked surfaces: + +1. Working Memory for current focus and active priorities +2. Distilled memories for durable knowledge +3. Threads for full searchable conversation history +4. Handoff summaries for compact resumability when the user wants a manual handoff + +Prefer the smallest surface that answers the user's need, then move upward only when more context is necessary. + +## Connection Model + +`nmem` resolves remote access in this order: + +1. `--api-url` flag +2. `NMEM_API_URL` / `NMEM_API_KEY` +3. `~/.nowledge-mem/config.json` +4. local defaults + +Preferred persistent remote setup: + +```json +{ + "apiUrl": "https://mem.example.com", + "apiKey": "nmem_your_key" +} +``` + +Save it to: + +```text +~/.nowledge-mem/config.json +``` + +## Working Memory + +At the start of a session, or when recent priorities would help, read Working Memory with: + +```bash +nmem --json wm read +``` + +If the command succeeds but returns `exists: false`, there is no Working Memory briefing yet. Say that clearly instead of pretending a briefing exists. + +Only fall back to the legacy file below for older local-only setups where the user still keeps Working Memory there: + +```bash +test -f ~/ai-now/memory.md && cat ~/ai-now/memory.md +``` + +Read Working Memory once near the start of a session, then reuse that context mentally. Do not re-read on every turn unless the user asks, the session context changed materially, or a long-running session clearly needs a refresh. 
+
+## Search Memory
+
+Search past knowledge when:
+
+- the user references previous work, a prior fix, or an earlier decision
+- the task resumes a named feature, bug, refactor, incident, or subsystem
+- a debugging pattern resembles something solved earlier
+- the user asks for rationale, preferences, procedures, or recurring workflow details
+- the current result is ambiguous and prior context would make the answer sharper
+
+Start with durable recall:
+
+```bash
+nmem --json m search "query"
+```
+
+If the recall need is conceptual or the first pass is weak, use deep search:
+
+```bash
+nmem --json m search "query" --mode deep
+```
+
+If the user is really asking about a previous conversation or session, search threads directly:
+
+```bash
+nmem --json t search "query" --limit 5
+```
+
+If a memory search result includes `source_thread`, or thread search finds the likely conversation, inspect it progressively instead of loading the whole thread at once:
+
+```bash
+nmem --json t show <thread-id> --limit 8 --offset 0 --content-limit 1200
+```
+
+Prefer the smallest retrieval surface that answers the question.
+
+## Distill Memory
+
+Distill only durable knowledge worth keeping after the current session ends.
+
+Use `memory_add` for genuinely new decisions, procedures, lessons, preferences, or plans:
+
+```bash
+nmem --json m add "Insight with enough context to stand on its own." -t "Searchable title" -i 0.8 --unit-type decision -l project-name -s gemini-cli
+```
+
+If an existing memory already captures the same decision, workflow, or preference and the new information refines it, update that memory instead of creating a duplicate:
+
+```bash
+nmem m update <memory-id> -t "Updated title"
+```
+
+## Save Thread
+
+Only save a thread when the user explicitly asks to persist the real Gemini session. The extension also performs a best-effort automatic thread import on session end, so this command is mainly for explicit mid-session capture or immediate confirmation.
+ +This is a real session import, not a summary fallback. Use: + +```bash +nmem --json t save --from gemini-cli -p . -s "Brief summary of what was accomplished" +``` + +The summary is metadata only. The saved thread should come from Gemini's recorded session transcript. + +If the user wants a specific older Gemini session, add `--session-id`. + +## Save Handoff + +Only save a handoff when the user explicitly asks for a resumable summary rather than a full session import. Think of this as a handoff summary, not a transcript save. + +Structure the checkpoint around: + +- Goal +- Major decisions +- Files or surfaces touched +- Open questions or risks +- Next steps + +Then store it with: + +```bash +nmem --json t create -t "Gemini CLI Session - topic" -c "Goal: ... Decisions: ... Files: ... Risks: ... Next: ..." -s gemini-cli +``` + +## Status + +When setup seems broken, run: + +```bash +nmem status +``` + +Be concise, use memory tools naturally, and avoid saving routine or low-value chatter. diff --git a/nowledge-mem-gemini-cli-extension/README.md b/nowledge-mem-gemini-cli-extension/README.md new file mode 100644 index 000000000..a7b2a78c1 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/README.md @@ -0,0 +1,156 @@ +# Nowledge Mem -- Gemini CLI Extension + +> Bring your Nowledge Mem knowledge base into Gemini CLI with persistent context, reusable slash commands, and agent skills. + +This package is the **Gemini-native product surface** for Nowledge Mem. + +It is deliberately **CLI-first**: + +- Gemini CLI loads `GEMINI.md` plus extension hooks for Working Memory bootstrap and session capture +- bundled commands wrap common `nmem` workflows +- bundled skills teach Gemini when to recall, distill, save real threads, and create handoff summaries +- Gemini can still call `nmem` directly whenever it needs a more flexible path + +The recommended Gemini setup is deliberately simple and stable: Gemini CLI on top, `nmem` underneath. 
That keeps auth, debugging, and command composition in one place. + +## Requirements + +- [Gemini CLI](https://github.com/google-gemini/gemini-cli) +- [Nowledge Mem](https://mem.nowledge.co) running locally, or a reachable remote Nowledge Mem server +- `nmem` CLI in your `PATH` + +If Nowledge Mem is already running on the same machine through the desktop app, the cleanest setup is to install the bundled CLI from **Settings -> Preferences -> Developer Tools -> Install CLI**. That gives Gemini direct access to the local Mem instance without any remote configuration. + +You can also install `nmem` standalone: + +```bash +# Option 1: pip +pip install nmem-cli + +# Option 2: uvx +curl -LsSf https://astral.sh/uv/install.sh | sh +uvx --from nmem-cli nmem --version +``` + +Verify the connection: + +```bash +nmem status +``` + +For the default same-machine setup, `nmem status` should show `http://127.0.0.1:14242 (default)`. No `~/.nowledge-mem/config.json` is required unless you are connecting to a remote Mem server. + +## Install + +Until this extension ships as a standalone Gemini extension package, install it from a local checkout: + +```bash +git clone https://github.com/nowledge-co/community.git +cd community/nowledge-mem-gemini-cli-extension +gemini extensions link . +``` + +Restart Gemini CLI after linking. + +Gemini's extension release flow expects `gemini-extension.json` at the root of the installable package. This directory already follows that layout, so it can later be published cleanly as its own release artifact without changing the extension design. + +Release packaging and marketplace notes live in [`RELEASING.md`](./RELEASING.md). 
+ +## What You Get + +**Automatic lifecycle hooks** + +- Session start loads Working Memory into Gemini when a briefing exists +- Session end performs a best-effort real Gemini thread import through `nmem t save --from gemini-cli` + +**Persistent context** + +- `GEMINI.md` tells Gemini how to route recall across Working Memory, distilled memories, conversation threads, thread save, distillation, and handoff summaries + +**Custom commands** + +- `/nowledge:read-working-memory` -- Load today's Working Memory briefing +- `/nowledge:search-memory ` -- Search your knowledge base before answering +- `/nowledge:distill-memory` -- Save the high-value insights from the current conversation +- `/nowledge:save-thread` -- Save the real Gemini CLI session through `nmem`'s native Gemini importer +- `/nowledge:save-handoff` -- Save a concise resumable handoff summary for the current Gemini session +- `/nowledge:status` -- Check `nmem` and server connectivity + +**Agent skills** + +- `read-working-memory` +- `search-memory` +- `distill-memory` +- `save-thread` +- `save-handoff` + +## Local vs Remote + +By default, `nmem` connects to the local Mem server at `http://127.0.0.1:14242`. + +For remote Mem, the preferred long-term setup is: + +```json +{ + "apiUrl": "https://mem.example.com", + "apiKey": "nmem_your_key" +} +``` + +Save that to: + +```text +~/.nowledge-mem/config.json +``` + +`nmem` loads connection settings with this priority: + +- `--api-url` flag +- `NMEM_API_URL` / `NMEM_API_KEY` +- `~/.nowledge-mem/config.json` +- defaults + +If you need a temporary override for one Gemini session, launch Gemini from a shell where `NMEM_API_URL` and `NMEM_API_KEY` are already exported. For durable setup, keep using `~/.nowledge-mem/config.json`. + +For thread save in remote mode, the important detail is that `nmem t save --from gemini-cli` reads Gemini's local session files on the machine running Gemini, then uploads the normalized thread messages to Mem. 
The remote Mem server does not need direct access to your `~/.gemini` directory. + +## Direct `nmem` Use Is Always Allowed + +The bundled commands are convenience paths, not a cage. Gemini should freely compose direct `nmem` commands when that is clearer or more flexible. + +Examples: + +```bash +nmem --json wm read +nmem --json m search "auth token rotation" --mode deep --importance 0.7 +nmem --json m add "JWT refresh failures came from clock skew between the gateway and API nodes." -t "JWT refresh failures traced to clock skew" -i 0.9 --unit-type learning -l auth -l backend -s gemini-cli +nmem --json t save --from gemini-cli -p . -s "Finished the auth refactor and verified the new refresh-token flow." +nmem --json t create -t "Gemini CLI Session - auth refactor" -c "Goal: finish the auth refactor. Decisions: keep refresh verification in the API layer and treat gateway skew as the root cause. Files: api/auth.ts, auth.test.ts. Next: validate expiry behavior against remote sessions." -s gemini-cli +nmem status +``` + +## Thread Save vs Handoff + +Gemini now supports two separate save paths, and they should stay distinct: + +- `/nowledge:save-thread` imports the **real Gemini session messages** into Nowledge Mem with `nmem t save --from gemini-cli`. The summary is only metadata; the stored thread is based on Gemini's recorded transcript. The extension also performs this import automatically at session end as a best-effort, idempotent lifecycle hook. +- `/nowledge:save-handoff` creates a **compact resumable handoff summary** with Goal, Decisions, Files, Risks, and Next. Use this when the user wants a lightweight restart point rather than the full transcript. + +Use `/nowledge:distill-memory` for durable atomic knowledge, `/nowledge:save-thread` for the full session, and `/nowledge:save-handoff` for a resumable handoff. 
+ +## Architecture Choice + +This integration keeps the control plane simple: + +- Gemini provides the extension surface: `GEMINI.md`, commands, and skills +- `nmem` provides the execution path: memory search, Working Memory, capture, thread import, and remote auth +- direct `nmem` composition stays available whenever Gemini needs a more flexible command path + +The result is a setup that is easier to reason about, easier to support, and easier for advanced users to extend. + +## Links + +- [Documentation](https://mem.nowledge.co/docs/integrations/gemini-cli) +- [Nowledge Mem](https://mem.nowledge.co) +- [Discord](https://nowled.ge/discord) +- [GitHub](https://github.com/nowledge-co/community) diff --git a/nowledge-mem-gemini-cli-extension/RELEASING.md b/nowledge-mem-gemini-cli-extension/RELEASING.md new file mode 100644 index 000000000..c7afed4a7 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/RELEASING.md @@ -0,0 +1,99 @@ +# Releasing the Gemini CLI Extension + +This extension lives inside the `community` repository, so Gemini release readiness should use the **release archive** path from Gemini's extension docs rather than assuming the repository root is the extension root. + +## Why This Release Path + +Gemini's release docs require `gemini-extension.json` to be at the root of the repository **or the release archive**. + +In this repo, the extension lives at `nowledge-mem-gemini-cli-extension/`, not the repository root. 
That means: + +- local development should keep using `gemini extensions link .` +- public release and marketplace discovery should use a packaged archive whose root is this extension directory + +## Manual Prerequisites + +These are required for Gemini's gallery crawler, but cannot be enforced purely by files in this directory: + +- the GitHub repository must be public +- the repository About section must include the `gemini-cli-extension` topic +- the release must be tagged and published on GitHub +- the attached archive must contain `gemini-extension.json` at the archive root + +## Validate Locally + +```bash +cd community/nowledge-mem-gemini-cli-extension +npm run validate +``` + +## Build The Release Artifact + +```bash +cd community/nowledge-mem-gemini-cli-extension +npm run package:release +``` + +Or run the full pre-release check: + +```bash +cd community/nowledge-mem-gemini-cli-extension +npm run verify:release +``` + +This produces: + +- `dist/nowledge-mem-gemini-cli-extension.tar.gz` +- `dist/nowledge-mem-gemini-cli-extension.tar.gz.sha256` + +The archive is intentionally flat at the root so Gemini can inspect it as an installable extension package. + +## CI Verification + +Pull requests and relevant pushes run the `Validate Gemini Extension` workflow. That workflow validates the manifest and also rebuilds the release archive so packaging drift is caught before tagging. 
+ +## Tagging Convention + +The GitHub Actions workflow watches tags in this form: + +```text +nowledge-mem-gemini-cli-extension-v* +``` + +Example: + +```bash +git tag nowledge-mem-gemini-cli-extension-v0.1.1 +git push origin nowledge-mem-gemini-cli-extension-v0.1.1 +``` + +## Initial Public Release + +For the first public release, use: + +- tag: `nowledge-mem-gemini-cli-extension-v0.1.1` +- release title: `Nowledge Mem Gemini CLI Extension v0.1.1` +- release notes source: `release-notes/0.1.1.md` +- workflow behavior: the release workflow verifies that the pushed tag matches `package.json` and publishes the matching `release-notes/<version>.md` file as the GitHub Release body + +## Installation After Release + +Once the tagged GitHub Release exists, Gemini users can install from the repository and ref: + +```bash +gemini extensions install github.com/nowledge-co/community --ref nowledge-mem-gemini-cli-extension-v0.1.1 +``` + +Gemini's own release docs say GitHub Releases are supported as install sources, and the workflow-created archive is shaped specifically for that path. 
+ +## Release Checklist + +- bump `version` in `package.json` and `gemini-extension.json` +- update `CHANGELOG.md` +- add `release-notes/<version>.md` +- run `npm run verify:release` +- confirm the archive root contains `gemini-extension.json`, `package.json`, `GEMINI.md`, `commands/`, `hooks/`, and `skills/` +- create and push a matching tag +- publish the GitHub Release with the generated `.tar.gz` asset and checksum +- verify the repo still has the `gemini-cli-extension` topic +- verify discovery on `geminicli.com/extensions` after the crawler runs diff --git a/nowledge-mem-gemini-cli-extension/commands/nowledge/distill-memory.toml b/nowledge-mem-gemini-cli-extension/commands/nowledge/distill-memory.toml new file mode 100644 index 000000000..91ee6fcd5 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/commands/nowledge/distill-memory.toml @@ -0,0 +1,19 @@ +prompt = """ +Distill the most valuable insights from the current Gemini CLI conversation into Nowledge Mem. + +Workflow: + +1. Identify 1-3 durable insights, decisions, lessons, procedures, or preferences. +2. Skip routine chatter, unresolved half-ideas, and low-signal implementation noise. +3. If a memory looks likely to already exist, search first instead of creating a duplicate. +4. Use the shell tool to run `nmem --json m add` for each selected memory. +5. Use strong titles, a fitting `--unit-type`, and 0-3 labels only when they improve retrieval. +6. Set `-s gemini-cli` so the capture path stays auditable. + +Importance guide: +- `0.6-0.7`: useful but routine durable knowledge +- `0.8-0.9`: major lesson, decision, or breakthrough +- `1.0`: rare, foundational memory + +After saving, report what was stored, which unit types were used, and why each memory was worth keeping. 
+""" diff --git a/nowledge-mem-gemini-cli-extension/commands/nowledge/read-working-memory.toml b/nowledge-mem-gemini-cli-extension/commands/nowledge/read-working-memory.toml new file mode 100644 index 000000000..f2aab6633 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/commands/nowledge/read-working-memory.toml @@ -0,0 +1,21 @@ +prompt = """ +Load the user's Working Memory briefing before continuing. + +Use the shell tool to try: + +```bash +nmem --json wm read +``` + +If the command succeeds but reports that no Working Memory exists yet, say that clearly. + +Only if `nmem` is unavailable in an older local-only setup, fall back to: + +```bash +cat ~/ai-now/memory.md +``` + +Then summarize the user's active focus areas, priorities, unresolved flags, and the most relevant recent changes when a briefing is actually present. + +If remote access is configured through `~/.nowledge-mem/config.json`, let `nmem` use it naturally. Do not assume environment variables are the only auth path. +""" diff --git a/nowledge-mem-gemini-cli-extension/commands/nowledge/save-handoff.toml b/nowledge-mem-gemini-cli-extension/commands/nowledge/save-handoff.toml new file mode 100644 index 000000000..4f7dccc5a --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/commands/nowledge/save-handoff.toml @@ -0,0 +1,16 @@ +prompt = """ +Save a concise handoff summary of the current Gemini CLI session to Nowledge Mem. + +This command is intentionally a handoff-summary action, not a full thread importer. + +If the user wants the real Gemini session messages, use `/nowledge:save-thread` instead. + +Workflow: + +1. Write a short structured handoff with these fields: Goal, Decisions, Files, Risks, Next. +2. Use the shell tool to create a handoff thread with `nmem --json t create -s gemini-cli`. +3. Use a title in the form `Gemini CLI Session - `. +4. Return the created handoff title and the summary that was stored. + +Keep the handoff concise, honest, and useful for resuming work later. 
+""" diff --git a/nowledge-mem-gemini-cli-extension/commands/nowledge/save-thread.toml b/nowledge-mem-gemini-cli-extension/commands/nowledge/save-thread.toml new file mode 100644 index 000000000..67134e1d3 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/commands/nowledge/save-thread.toml @@ -0,0 +1,15 @@ +prompt = """ +Save the real Gemini CLI session to Nowledge Mem. + +This command should persist the actual Gemini session messages, not a summary-only checkpoint. + +Workflow: + +1. Write a short 1-2 sentence summary of what was accomplished. +2. Use the shell tool to run `nmem --json t save --from gemini-cli -p . -s ""`. +3. If the user clearly wants a specific older Gemini session, add `--session-id`. +4. Report whether the thread was created or updated, how many messages were stored, and the thread id. +5. If no Gemini session is found, explain that plainly and suggest `/nowledge:save-handoff` only when the user wants a manual resumable summary instead. + +Do not replace thread-save with `t create`. `save-thread` means importing the recorded Gemini session. +""" diff --git a/nowledge-mem-gemini-cli-extension/commands/nowledge/search-memory.toml b/nowledge-mem-gemini-cli-extension/commands/nowledge/search-memory.toml new file mode 100644 index 000000000..df1f1a992 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/commands/nowledge/search-memory.toml @@ -0,0 +1,18 @@ +prompt = """ +Search Nowledge Mem for the user's query: `{{args}}`. + +Workflow: + +1. Rewrite the request into a short retrieval query rather than copying a long prompt verbatim. +2. Start with `nmem --json m search` for distilled knowledge. +3. If the user is asking about a prior conversation, previous session, or exact discussion, or if memory search is weak, use `nmem --json t search` too. +4. 
If a memory result includes `source_thread` or thread search returns a strong hit, inspect that conversation with `nmem --json t show --limit 8 --offset 0 --content-limit 1200` and page further only when needed. +5. If the recall need is conceptual, historical, or the first memory search is weak, run a second pass with `--mode deep`. +6. Add filters only when the task clearly implies them: + - labels for project or domain scoping + - `--importance` for high-signal recall + - `--event-from` / `--recorded-from` when time matters +7. Summarize only the strongest matches, mention memory ids or source threads when helpful, and clearly say when nothing relevant was found. + +Prefer precise retrieval queries over raw prompts. Use direct `nmem` composition freely when the bundled command shape is not enough. +""" diff --git a/nowledge-mem-gemini-cli-extension/commands/nowledge/status.toml b/nowledge-mem-gemini-cli-extension/commands/nowledge/status.toml new file mode 100644 index 000000000..cbe6bef5d --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/commands/nowledge/status.toml @@ -0,0 +1,17 @@ +prompt = """ +Check whether Nowledge Mem is reachable from Gemini CLI. 
+ +Use the shell tool to run: + +```bash +nmem status +``` + +Summarize: +- whether the CLI can reach Mem +- the effective API URL +- the config source reported by `nmem` (`env`, `config`, or `default`) +- whether a config file is present or malformed +- whether remote auth appears to be set when the API URL is remote +- the next concrete fix if setup is broken +""" diff --git a/nowledge-mem-gemini-cli-extension/gemini-extension.json b/nowledge-mem-gemini-cli-extension/gemini-extension.json new file mode 100644 index 000000000..690c3790e --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/gemini-extension.json @@ -0,0 +1,6 @@ +{ + "name": "nowledge-mem-gemini-cli-extension", + "version": "0.1.1", + "description": "Gemini CLI extension for Nowledge Mem with persistent context, memory commands, and agent skills.", + "contextFileName": "GEMINI.md" +} diff --git a/nowledge-mem-gemini-cli-extension/hooks/hooks.json b/nowledge-mem-gemini-cli-extension/hooks/hooks.json new file mode 100644 index 000000000..3220ebbe8 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/hooks/hooks.json @@ -0,0 +1,30 @@ +{ + "hooks": { + "SessionStart": [ + { + "hooks": [ + { + "name": "load-working-memory", + "type": "command", + "command": "node \"${extensionPath}${/}hooks${/}session-start.mjs\"", + "description": "Load Working Memory into Gemini at session start.", + "timeout": 10000 + } + ] + } + ], + "SessionEnd": [ + { + "hooks": [ + { + "name": "auto-save-thread", + "type": "command", + "command": "node \"${extensionPath}${/}hooks${/}session-end.mjs\"", + "description": "Best-effort Gemini session import on shutdown.", + "timeout": 20000 + } + ] + } + ] + } +} diff --git a/nowledge-mem-gemini-cli-extension/hooks/session-end.mjs b/nowledge-mem-gemini-cli-extension/hooks/session-end.mjs new file mode 100644 index 000000000..6f9d1b1e0 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/hooks/session-end.mjs @@ -0,0 +1,31 @@ +import { spawnSync } from 'node:child_process'; +import { 
readFileSync } from 'node:fs'; + +function readHookInput() { + try { + const raw = readFileSync(0, 'utf8'); + return raw.trim() ? JSON.parse(raw) : {}; + } catch { + return {}; + } +} + +const hookInput = readHookInput(); +const cwd = typeof hookInput.cwd === 'string' && hookInput.cwd.trim() + ? hookInput.cwd + : process.cwd(); +const sessionId = typeof hookInput.session_id === 'string' && hookInput.session_id.trim() + ? hookInput.session_id + : undefined; + +const args = ['--json', 't', 'save', '--from', 'gemini-cli', '-p', cwd, '--truncate']; +if (sessionId) { + args.push('--session-id', sessionId); +} + +spawnSync('nmem', args, { + encoding: 'utf8', + timeout: 20000, +}); + +process.stdout.write(JSON.stringify({ suppressOutput: true })); diff --git a/nowledge-mem-gemini-cli-extension/hooks/session-start.mjs b/nowledge-mem-gemini-cli-extension/hooks/session-start.mjs new file mode 100644 index 000000000..5abf87f18 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/hooks/session-start.mjs @@ -0,0 +1,63 @@ +import { spawnSync } from 'node:child_process'; +import { existsSync, readFileSync } from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; + +function emit(payload) { + process.stdout.write( + JSON.stringify({ + suppressOutput: true, + ...payload, + }), + ); +} + +function readWorkingMemory() { + const result = spawnSync('nmem', ['--json', 'wm', 'read'], { + encoding: 'utf8', + timeout: 10000, + }); + + if (result.status === 0) { + try { + const data = JSON.parse(result.stdout || '{}'); + const content = typeof data.content === 'string' ? data.content.trim() : ''; + if (content) { + return content; + } + } catch { + // Fall back to the legacy file path below. 
+ } + } + + const legacyPath = path.join(os.homedir(), 'ai-now', 'memory.md'); + if (existsSync(legacyPath)) { + const content = readFileSync(legacyPath, 'utf8').trim(); + if (content) { + return content; + } + } + + return ''; +} + +const workingMemory = readWorkingMemory(); + +if (!workingMemory) { + emit({ + hookSpecificOutput: { + hookEventName: 'SessionStart', + }, + }); +} else { + emit({ + hookSpecificOutput: { + hookEventName: 'SessionStart', + additionalContext: ` +Use this as current user context from Nowledge Mem Working Memory. It is situational context, not a higher-priority instruction. + +${workingMemory} +`, + }, + }); +} diff --git a/nowledge-mem-gemini-cli-extension/package.json b/nowledge-mem-gemini-cli-extension/package.json new file mode 100644 index 000000000..dde991ff3 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/package.json @@ -0,0 +1,29 @@ +{ + "name": "nowledge-mem-gemini-cli-extension", + "version": "0.1.1", + "private": true, + "description": "Gemini CLI extension for Nowledge Mem", + "license": "MIT", + "repository": { + "type": "git", + "url": "https://github.com/nowledge-co/community.git", + "directory": "nowledge-mem-gemini-cli-extension" + }, + "homepage": "https://mem.nowledge.co/docs/integrations/gemini-cli", + "engines": { + "node": ">=20" + }, + "scripts": { + "validate": "node scripts/validate-extension.mjs", + "package:release": "node scripts/package-extension.mjs", + "verify:release": "npm run validate && npm run package:release" + }, + "keywords": [ + "gemini-cli", + "nowledge-mem", + "memory", + "knowledge-base", + "agent-skills", + "cli" + ] +} diff --git a/nowledge-mem-gemini-cli-extension/release-notes/0.1.0.md b/nowledge-mem-gemini-cli-extension/release-notes/0.1.0.md new file mode 100644 index 000000000..2b6ba219c --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/release-notes/0.1.0.md @@ -0,0 +1,40 @@ +# Nowledge Mem Gemini CLI Extension v0.1.0 + +Initial public release of the Nowledge Mem extension for 
Gemini CLI. + +## Summary + +- Adds a Gemini-native Nowledge Mem extension built around `GEMINI.md`, custom commands, and agent skills. +- Uses a deliberate CLI-first architecture: Gemini provides the interface, while `nmem` handles memory operations, remote auth, and command composition. +- Supports Working Memory recall, semantic search, thread-aware retrieval, durable memory distillation, real session thread saving, concise handoff summaries, and connection diagnostics. +- Ships Gemini lifecycle hooks for Working Memory bootstrap at session start and best-effort real session import at session end. + +## Included Commands + +- `/nowledge:read-working-memory` +- `/nowledge:search-memory <query>` +- `/nowledge:distill-memory` +- `/nowledge:save-thread` +- `/nowledge:save-handoff` +- `/nowledge:status` + +## Included Skills + +- `read-working-memory` +- `search-memory` +- `distill-memory` +- `save-thread` +- `save-handoff` + +## Setup Notes + +- Preferred persistent remote setup uses `~/.nowledge-mem/config.json` through `nmem`. +- `NMEM_API_URL` and `NMEM_API_KEY` can still be used as shell-level overrides when you want to launch Gemini with a temporary remote target. +- `/nowledge:save-thread` now uses `nmem t save --from gemini-cli` for real Gemini session import. +- `/nowledge:save-handoff` remains available for structured resumable summaries created with `nmem t create`. + +## Install + +```bash +gemini extensions install github.com/nowledge-co/community --ref nowledge-mem-gemini-cli-extension-v0.1.0 +``` diff --git a/nowledge-mem-gemini-cli-extension/release-notes/0.1.1.md b/nowledge-mem-gemini-cli-extension/release-notes/0.1.1.md new file mode 100644 index 000000000..394c7257f --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/release-notes/0.1.1.md @@ -0,0 +1,21 @@ +# Nowledge Mem Gemini CLI Extension v0.1.1 + +Release hardening update for the Gemini CLI extension. 
+ +## Summary + +- Clarifies that `save-thread` remains the real Gemini transcript-backed capture path and that remote Mem setups still read transcript files locally through `nmem` on the Gemini machine. +- Fixes the hook schema to match Gemini's expected top-level `hooks` object. +- Sharpens the package, docs, and release assets so the extension is safer to ship and easier to review in the Gemini marketplace flow. + +## Operational Notes + +- Preferred persistent remote setup remains `~/.nowledge-mem/config.json` through `nmem`. +- `save-handoff` remains the honest resumable-summary surface. +- Release validation and packaging continue to run through `npm run verify:release`. + +## Install + +```bash +gemini extensions install github.com/nowledge-co/community --ref nowledge-mem-gemini-cli-extension-v0.1.1 +``` diff --git a/nowledge-mem-gemini-cli-extension/scripts/package-extension.mjs b/nowledge-mem-gemini-cli-extension/scripts/package-extension.mjs new file mode 100644 index 000000000..b2b3e5386 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/scripts/package-extension.mjs @@ -0,0 +1,109 @@ +import { cp, mkdir, readFile, rm, writeFile } from 'node:fs/promises'; +import { createHash } from 'node:crypto'; +import { spawnSync } from 'node:child_process'; +import path from 'node:path'; +import process from 'node:process'; +import { fileURLToPath } from 'node:url'; + +const extensionRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); +const distDir = path.join(extensionRoot, 'dist'); +const archiveName = 'nowledge-mem-gemini-cli-extension.tar.gz'; +const archivePath = path.join(distDir, archiveName); +const stageDir = path.join(distDir, 'release-root'); +const filesToShip = [ + 'gemini-extension.json', + 'package.json', + 'README.md', + 'CHANGELOG.md', + 'GEMINI.md', + 'commands', + 'skills', + 'hooks' +]; +const requiredArchiveEntries = new Set([ + './gemini-extension.json', + './package.json', + './README.md', + './CHANGELOG.md', + 
'./GEMINI.md', + './commands/nowledge/read-working-memory.toml', + './commands/nowledge/search-memory.toml', + './commands/nowledge/distill-memory.toml', + './commands/nowledge/save-thread.toml', + './commands/nowledge/save-handoff.toml', + './commands/nowledge/status.toml', + './hooks/hooks.json', + './hooks/session-start.mjs', + './hooks/session-end.mjs', + './skills/read-working-memory/SKILL.md', + './skills/search-memory/SKILL.md', + './skills/distill-memory/SKILL.md', + './skills/save-thread/SKILL.md', + './skills/save-handoff/SKILL.md' +]); + +function run(command, args, cwd = extensionRoot, options = {}) { + const result = spawnSync(command, args, { cwd, stdio: 'inherit', ...options }); + if (result.status !== 0) { + process.exit(result.status ?? 1); + } + return result; +} + +async function fileSha256(filePath) { + const contents = await readFile(filePath); + return createHash('sha256').update(contents).digest('hex'); +} + +function verifyArchive(filePath) { + const result = spawnSync('tar', ['-tzf', filePath], { + cwd: extensionRoot, + encoding: 'utf8' + }); + if (result.status !== 0) { + process.exit(result.status ?? 
1); + } + + const entries = new Set( + result.stdout + .split('\n') + .map((line) => line.trim()) + .filter(Boolean) + ); + + for (const requiredEntry of requiredArchiveEntries) { + if (!entries.has(requiredEntry)) { + console.error(`ERROR: release archive is missing ${requiredEntry}`); + process.exit(1); + } + } +} + +async function main() { + run('node', ['scripts/validate-extension.mjs']); + + await rm(distDir, { recursive: true, force: true }); + await mkdir(stageDir, { recursive: true }); + + for (const relPath of filesToShip) { + await cp(path.join(extensionRoot, relPath), path.join(stageDir, relPath), { + recursive: true, + force: true + }); + } + + run('tar', ['-czf', archivePath, '-C', stageDir, '.']); + verifyArchive(archivePath); + + const checksum = await fileSha256(archivePath); + await writeFile( + path.join(distDir, `${archiveName}.sha256`), + `${checksum} ${archiveName}\n`, + 'utf8' + ); + + console.log(`Created ${archivePath}`); + console.log(`Created ${path.join(distDir, `${archiveName}.sha256`)}`); +} + +await main(); diff --git a/nowledge-mem-gemini-cli-extension/scripts/validate-extension.mjs b/nowledge-mem-gemini-cli-extension/scripts/validate-extension.mjs new file mode 100644 index 000000000..5ab8616db --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/scripts/validate-extension.mjs @@ -0,0 +1,106 @@ +import { readFile } from 'node:fs/promises'; +import path from 'node:path'; +import process from 'node:process'; +import { fileURLToPath } from 'node:url'; + +const extensionRoot = path.resolve(path.dirname(fileURLToPath(import.meta.url)), '..'); +const manifestPath = path.join(extensionRoot, 'gemini-extension.json'); +const packageJsonPath = path.join(extensionRoot, 'package.json'); + +function fail(message) { + console.error(`ERROR: ${message}`); + process.exit(1); +} + +async function readJson(filePath) { + const text = await readFile(filePath, 'utf8'); + return JSON.parse(text); +} + +function assertString(value, label) { + if (typeof 
value !== 'string' || value.trim() === '') { + fail(`${label} must be a non-empty string`); + } +} + +async function main() { + const manifest = await readJson(manifestPath); + const packageJson = await readJson(packageJsonPath); + const extensionDirName = path.basename(extensionRoot); + + assertString(manifest.name, 'manifest.name'); + assertString(manifest.version, 'manifest.version'); + assertString(manifest.description, 'manifest.description'); + + if (manifest.name !== extensionDirName) { + fail(`manifest.name must match directory name (${extensionDirName})`); + } + + if (manifest.version !== packageJson.version) { + fail(`manifest.version (${manifest.version}) must match package.json version (${packageJson.version})`); + } + + if (manifest.contextFileName !== 'GEMINI.md') { + fail('manifest.contextFileName must be GEMINI.md'); + } + + if (manifest.settings !== undefined && !Array.isArray(manifest.settings)) { + fail('manifest.settings must be an array when present'); + } + + const requiredPaths = [ + 'GEMINI.md', + 'README.md', + 'CHANGELOG.md', + 'RELEASING.md', + 'commands/nowledge/read-working-memory.toml', + 'commands/nowledge/search-memory.toml', + 'commands/nowledge/distill-memory.toml', + 'commands/nowledge/save-thread.toml', + 'commands/nowledge/save-handoff.toml', + 'commands/nowledge/status.toml', + 'hooks/hooks.json', + 'hooks/session-start.mjs', + 'hooks/session-end.mjs', + 'skills/read-working-memory/SKILL.md', + 'skills/search-memory/SKILL.md', + 'skills/distill-memory/SKILL.md', + 'skills/save-thread/SKILL.md', + 'skills/save-handoff/SKILL.md', + 'scripts/validate-extension.mjs', + 'scripts/package-extension.mjs', + `release-notes/${manifest.version}.md` + ]; + + for (const relPath of requiredPaths) { + const absPath = path.join(extensionRoot, relPath); + try { + const text = await readFile(absPath, 'utf8'); + if (text.trim() === '') { + fail(`${relPath} must not be empty`); + } + + if (relPath === 'hooks/hooks.json') { + const hooksConfig = 
JSON.parse(text); + if ( + !hooksConfig || + typeof hooksConfig !== 'object' || + typeof hooksConfig.hooks !== 'object' || + hooksConfig.hooks === null || + Array.isArray(hooksConfig.hooks) + ) { + fail('hooks/hooks.json must contain a top-level "hooks" object'); + } + } + } catch (error) { + if (error instanceof SyntaxError) { + fail(`${relPath} must contain valid JSON`); + } + fail(`missing required file: ${relPath}`); + } + } + + console.log('Validated Gemini extension manifest, version alignment, and required release files.'); +} + +await main(); diff --git a/nowledge-mem-gemini-cli-extension/skills/distill-memory/SKILL.md b/nowledge-mem-gemini-cli-extension/skills/distill-memory/SKILL.md new file mode 100644 index 000000000..1889fe57d --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/skills/distill-memory/SKILL.md @@ -0,0 +1,23 @@ +--- +name: distill-memory +description: Detect breakthrough moments, durable lessons, and decisions worth preserving. Suggest distillation sparingly, then store high-value knowledge as atomic memories. +--- + +# Distill Memory + +Store only knowledge that should remain useful after the current session ends. + +## Good Candidates + +- decisions with rationale +- repeatable procedures +- lessons from debugging or incident work +- durable preferences or constraints +- plans that future sessions will need to resume cleanly + +## Add vs Update + +- Use `nmem --json m add` when the insight is genuinely new. +- If an existing memory already captures the same decision, workflow, or preference and the new information refines it, use `nmem m update ...` instead of creating a duplicate. + +Prefer atomic, standalone memories with strong titles and structured meaning. Focus on what was learned or decided, not routine chatter. 
diff --git a/nowledge-mem-gemini-cli-extension/skills/read-working-memory/SKILL.md b/nowledge-mem-gemini-cli-extension/skills/read-working-memory/SKILL.md new file mode 100644 index 000000000..8bd1f9a33 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/skills/read-working-memory/SKILL.md @@ -0,0 +1,21 @@ +--- +name: read-working-memory +description: Read the user's daily Working Memory briefing at session start or when recent priorities matter. This gives Gemini CLI cross-tool continuity without bloating the main prompt. +--- + +# Read Working Memory + +Use `nmem --json wm read` for the user's current priorities, unresolved flags, and recent context. + +## When to Use + +- At session start +- When resuming work after a break +- When the user asks what they are focused on now +- When the current task clearly depends on recent priorities or active initiatives + +## Usage Pattern + +- Read once near the start of a session. +- Reuse that context mentally instead of re-reading on every turn. +- Refresh only if the user asks, the session context changed materially, or a long-running session clearly needs it. diff --git a/nowledge-mem-gemini-cli-extension/skills/save-handoff/SKILL.md b/nowledge-mem-gemini-cli-extension/skills/save-handoff/SKILL.md new file mode 100644 index 000000000..b496b9e3d --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/skills/save-handoff/SKILL.md @@ -0,0 +1,29 @@ +--- +name: save-handoff +description: Save a concise Gemini CLI handoff summary only when the user explicitly asks. This is intentionally separate from full thread-save, which should use the native Gemini session importer. +--- + +# Save Handoff + +Only use this skill when the user explicitly asks to save progress as a handoff, leave a resumable summary, or create a lightweight restart point. + +## Why This Is A Handoff + +`save-thread` should mean saving the real session messages through the native Gemini importer. 
+ +For Gemini, this skill intentionally creates a structured handoff summary thread instead of importing the full session. + +## Workflow + +1. Write a short but useful handoff summary. +2. Include Goal, Decisions, Files, Risks, and Next. +3. Create a thread with `nmem t create` and `-s gemini-cli`. +4. If the user wants the full session instead, use `save-thread`. + +Example: + +```bash +nmem --json t create -t "Gemini CLI Session - auth refactor" -c "Goal: finish the auth refactor. Decisions: keep refresh verification in the API layer and treat gateway clock skew as the root cause. Files: api/auth.ts, auth.test.ts. Risks: expiry behavior may still differ in remote sessions. Next: validate remote session expiry end to end." -s gemini-cli +``` + +Never present this as a lossless thread save. Never auto-save without an explicit user request. diff --git a/nowledge-mem-gemini-cli-extension/skills/save-thread/SKILL.md b/nowledge-mem-gemini-cli-extension/skills/save-thread/SKILL.md new file mode 100644 index 000000000..acca6b100 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/skills/save-thread/SKILL.md @@ -0,0 +1,31 @@ +--- +name: save-thread +description: Save the real Gemini CLI session to Nowledge Mem only when the user explicitly asks. This uses Gemini's native transcript-backed importer rather than a summary-only fallback. +--- + +# Save Thread + +Only use this skill when the user explicitly asks to save the session, persist the thread, or store the actual Gemini conversation. + +## Contract + +`save-thread` means saving the real Gemini session messages. + +Use `nmem t save --from gemini-cli` for that path. A short summary may be attached as metadata, but the stored thread should come from Gemini's recorded transcript. + +Use `save-handoff` instead only when the user wants a lightweight resumable summary rather than the full session. + +## Workflow + +1. Write a concise 1-2 sentence summary. +2. Run `nmem --json t save --from gemini-cli -p . -s "..."`. +3. 
If the user names a different Gemini session, add `--session-id`. +4. Report whether the thread was created or appended and how many messages were stored. + +Example: + +```bash +nmem --json t save --from gemini-cli -p . -s "Finished the auth refactor and verified the new refresh-token flow." +``` + +Never claim a checkpoint summary is a thread save. Never auto-save without an explicit user request. diff --git a/nowledge-mem-gemini-cli-extension/skills/search-memory/SKILL.md b/nowledge-mem-gemini-cli-extension/skills/search-memory/SKILL.md new file mode 100644 index 000000000..50ed97846 --- /dev/null +++ b/nowledge-mem-gemini-cli-extension/skills/search-memory/SKILL.md @@ -0,0 +1,28 @@ +--- +name: search-memory +description: Search the user's personal knowledge base when past insights would improve the response. Trigger proactively for continuity, recurring bugs, design rationale, and remembered workflows. +--- + +# Search Memory + +Use Nowledge Mem proactively when prior knowledge would materially improve the answer. + +## Strong Triggers + +Search when: + +- the user references previous work, a prior fix, or an earlier decision +- the task resumes a named feature, bug, refactor, incident, or subsystem +- a debugging pattern resembles something solved earlier +- the user asks for rationale, preferences, procedures, or recurring workflow details +- the current result is ambiguous and past context would make the answer sharper + +## Retrieval Routing + +1. Start with `nmem --json m search` for durable knowledge. +2. Use `--mode deep` when the first pass is weak or the recall need is conceptual. +3. Use `nmem --json t search` for prior discussions, previous sessions, or exact conversation history. +4. If a memory result includes a `source_thread` or thread search returns a strong hit, inspect the conversation progressively with `nmem --json t show`. +5. Prefer the smallest retrieval surface that answers the question. 
+ +Mention source threads when they add useful historical context. diff --git a/nowledge-mem-npx-skills/CHANGELOG.md b/nowledge-mem-npx-skills/CHANGELOG.md index b1d5a3e51..95f66b13d 100644 --- a/nowledge-mem-npx-skills/CHANGELOG.md +++ b/nowledge-mem-npx-skills/CHANGELOG.md @@ -2,6 +2,13 @@ All notable changes to the Nowledge Mem npx Skills will be documented in this file. +## [0.4.0] - 2026-03-09 + +### Changed + +- **save-handoff** replaces the previous generic `save-thread` skill so the reusable package stays honest across agent runtimes that do not have a real transcript importer +- **read-working-memory** now prefers `nmem --json wm read` and keeps `~/ai-now/memory.md` only as a legacy fallback + ## [0.3.0] - 2026-02-26 ### Changed @@ -21,7 +28,7 @@ All notable changes to the Nowledge Mem npx Skills will be documented in this fi - Initial release of npx skills format for Vercel `add-skill` CLI - **search-memory** skill - Semantic search across personal knowledge base -- **save-thread** skill - Persist coding sessions as checkpoints +- **save-handoff** skill - Leave resumable handoff summaries in generic agent environments - **distill-memory** skill - Capture breakthrough moments as memories - Support for Claude Code, Cursor, OpenCode, Codex, and 20+ agents - Comprehensive README with installation and usage instructions diff --git a/nowledge-mem-npx-skills/README.md b/nowledge-mem-npx-skills/README.md index c8d99c90d..c00b4b50e 100644 --- a/nowledge-mem-npx-skills/README.md +++ b/nowledge-mem-npx-skills/README.md @@ -10,9 +10,9 @@ These skills extend your AI coding agent with persistent memory capabilities powered by [Nowledge Mem](https://mem.nowledge.co): -- **Search Memory** - Automatically surface relevant context from your knowledge base +- **Search Memory** - Automatically route recall across distilled memories and prior discussion threads - **Read Working Memory** - Load your daily briefing at session start for cross-tool continuity -- **Save Thread** - 
Persist complete coding sessions for future reference +- **Save Handoff** - Leave resumable handoff summaries in generic agent environments - **Distill Memory** - Capture breakthrough moments as searchable insights ## Installation @@ -101,9 +101,9 @@ Agent: [Automatically searches knowledge base] Found it! We solved this 2 weeks ago. The issue was token expiration... ``` -### Save Thread (`save-thread`) +### Save Handoff (`save-handoff`) -Saves complete conversations as checkpoints for future reference. +Saves a structured resumable handoff for future reference. **Activates when you say:** - "Save this session" @@ -114,10 +114,10 @@ Saves complete conversations as checkpoints for future reference. ``` You: Save this session - implemented JWT authentication -Agent: βœ“ Thread saved -Summary: Implemented JWT authentication -Messages: 23 -Thread ID: claude-code-abc123 +Agent: βœ“ Handoff saved +Title: Session Handoff - JWT authentication +Summary: Goal, Decisions, Files, Risks, Next +Thread ID: generic-agent-abc123 ``` ### Read Working Memory (`read-working-memory`) @@ -130,7 +130,7 @@ Loads your daily Working Memory briefing at session start so the agent knows you **Example:** ``` -[Agent reads ~/ai-now/memory.md at session start] +[Agent reads Working Memory at session start] Agent: I see you're focused on the auth migration and have an unresolved flag about the session handling approach. Want me to @@ -168,14 +168,74 @@ nmem --json m search "React patterns" "What do I know about database optimization?" ``` -### Save a Session +### Memory Lifecycle + +The reusable skills follow the same core flow as the richer native integrations: read Working Memory, route recall across memories and threads, save a resumable handoff when asked, and distill durable knowledge. 
+ +## Make Agents Use Memory Proactively + +Native integrations like Claude Code, Gemini CLI, Cursor, OpenClaw, and Alma already bundle the behavioral guidance that teaches the agent when to read context, search, or save. + +For less common agents, custom harnesses, or environments that only see `nmem`, skills, or MCP tools, you should add explicit intent guidance in `AGENTS.md`, `CLAUDE.md`, `GEMINI.md`, or the system prompt. + +### Step 1: Give The Agent A Memory Surface + +Use one of these: + +- `npx skills` for shared skill-based behavior +- `nmem` CLI for terminal-visible commands +- MCP when the client can call tools directly + +### Step 2: Add An Intent Policy + +For CLI or skill-driven agents, paste a policy like this into `AGENTS.md` or your system prompt: + +```markdown +## Nowledge Mem + +Use Nowledge Mem as your external memory system. + +At session start: +- Run `nmem --json wm read` once to load current priorities and recent context. +- Do not re-read it on every turn unless the user asks or the session context changed materially. + +Search proactively when: +- the user references previous work, a prior fix, or an earlier decision +- the task resumes a named feature, bug, refactor, or subsystem +- a debugging pattern resembles something solved earlier +- the user asks for rationale, preferences, procedures, or recurring workflow details + +Retrieval routing: +- Start with `nmem --json m search` for durable knowledge. +- Use `nmem --json t search` when the user is asking about a prior discussion or exact conversation history. +- If a memory result includes `source_thread`, inspect that conversation progressively with `nmem --json t show --limit 8 --offset 0 --content-limit 1200`. + +When preserving knowledge: +- Use `nmem --json m add` for genuinely new durable knowledge. +- If an existing memory already captures the same decision, preference, or workflow and the new information refines it, use `nmem m update ...` instead of creating a duplicate. 
+- Use a handoff save only when the user explicitly asks for a resumable checkpoint or handoff summary. +``` + +For MCP-only agents, use the same policy but replace the commands with the tool names `read_working_memory`, `memory_search`, `thread_search`, `thread_fetch_messages`, `memory_add`, and `memory_update`. + +### Step 3: Keep The Prompt Direct + +The best intent prompts are short and operational. Tell the agent exactly: + +- when to read Working Memory +- when to search proactively +- when to use thread tools instead of memory search +- when to add a new memory versus update an existing one +- when handoff save is explicit-only + +### Save a Handoff ```bash # Via CLI -nmem t save --from claude-code -s "Implemented user auth" +nmem --json t create -t "Session Handoff - auth refactor" -c "Goal: finish auth refactor. Decisions: keep refresh verification in the API layer. Files: auth.ts, auth.test.ts. Risks: remote expiry path still unverified. Next: run the remote flow." -s generic-agent # In conversation -"Save this session - fixed memory leak in event listeners" +"Save a handoff for this debugging session" ``` ### Create a Memory @@ -190,7 +250,7 @@ nmem m add "PostgreSQL over MongoDB: ACID needed for transactions" \ ### Claude Code Plugin (Full Featured) -For Claude Code users, the full plugin with slash commands is also available: +For Claude Code users, the full plugin with slash commands and real session import is also available: ```bash claude plugin marketplace add nowledge-co/community diff --git a/nowledge-mem-npx-skills/skills/distill-memory/SKILL.md b/nowledge-mem-npx-skills/skills/distill-memory/SKILL.md index e27ef4650..b815957d8 100644 --- a/nowledge-mem-npx-skills/skills/distill-memory/SKILL.md +++ b/nowledge-mem-npx-skills/skills/distill-memory/SKILL.md @@ -1,148 +1,25 @@ --- name: distill-memory -description: Recognize breakthrough moments, blocking resolutions, and design decisions worth preserving. 
Detect high-value insights that save future time. Suggest distillation at valuable moments, not routine work. +description: Capture breakthrough moments and valuable insights as searchable memories in your knowledge base. --- # Distill Memory -> Capture breakthrough moments and valuable insights as searchable memories in your knowledge base. +Store only knowledge that should remain useful after the current session ends. -## When to Suggest +## When to Save -**Breakthrough moments:** -- Extended debugging finally resolves -- User relief signals ("Finally!", "Aha!", "That was it!") -- Root cause discovered after investigation +Good candidates include: -**Important decisions:** -- Compared multiple options -- Chose with clear rationale -- Trade-off resolved with reasoning +- decisions with rationale +- repeatable procedures +- lessons from debugging or incident work +- durable preferences or constraints +- plans that future sessions will need to resume cleanly -**Research conclusions:** -- Investigated multiple approaches -- Reached definitive conclusion -- Optimal path determined +## Add vs Update -**Unexpected discoveries:** -- Counterintuitive solution found -- Assumption challenged and corrected -- Surprising cause-effect relationship +- Use `nmem --json m add` when the insight is genuinely new. +- If an existing memory already captures the same decision, workflow, or preference and the new information refines it, use `nmem m update ...` instead of creating a duplicate. 
-**Lessons learned:** -- "Next time do X instead" -- Preventive measure identified -- Pattern recognized for future use - -**Skip these (not worth preserving):** -- Routine fixes -- Work in progress -- Simple Q&A -- Generic information - -## Prerequisites - -**nmem CLI** - Choose one option: - -**Option 1: uvx (Recommended)** -```bash -curl -LsSf https://astral.sh/uv/install.sh | sh -uvx --from nmem-cli nmem --version -``` - -**Option 2: pip** -```bash -pip install nmem-cli -nmem --version -``` - -Ensure Nowledge Mem server is running at `http://localhost:14242` - -## Usage - -Use `nmem m add` to create memories: - -```bash -nmem m add "Insight content with context for future use" \ - -t "Searchable title (50-60 chars)" \ - -i 0.8 -``` - -### Memory Quality Guidelines - -**Good memories (atomic + actionable):** - -- "React hooks cleanup must return function. Missing return caused memory leaks in event listeners." -- "PostgreSQL over MongoDB: ACID compliance needed for financial transactions." -- "Docker build cache invalidation: COPY package*.json before source files." 
- -**Poor memories (avoid):** - -- Vague: "Fixed bugs in the code" -- Too long: Full conversation transcripts -- No context: "Use useState" - -### Content Guidelines - -- Focus on outcome/insight, not process -- Include "why" not just "what" -- Add enough context for future understanding -- Be specific and actionable - -### Importance Scores - -| Score | Use for | -|-------|---------| -| 0.8-1.0 | Major breakthroughs, critical decisions | -| 0.5-0.7 | Useful patterns, good practices | -| 0.3-0.4 | Minor tips, nice-to-know | - -### Options - -| Flag | Description | Example | -|------|-------------|---------| -| `-t, --title` | Searchable title | `-t "React Hooks Cleanup"` | -| `-i, --importance` | Score 0.0-1.0 | `-i 0.9` | -| `--unit-type TYPE` | Memory type | `--unit-type decision` | -| `-l, --label LABEL` | Topic label (repeatable) | `-l react -l frontend` | -| `--json` | JSON response | `--json` | - -Available unit types: `fact`, `preference`, `decision`, `plan`, `procedure`, `learning`, `context`, `event`. - -## Suggestion Approach - -**Timing:** After resolution/decision, when user pauses - -**Pattern:** "This [type] seems valuable - [essence]. Distill into memory?" - -**Frequency:** 1-3 per session typical. Quality over quantity. - -## Examples - -```bash -# High-value debugging insight -nmem m add "React hooks cleanup must return function. Missing return caused memory leaks in event listeners when component unmounted." \ - -t "React Hooks Cleanup Pattern" \ - -i 0.9 --unit-type learning -l react -l frontend - -# Architecture decision -nmem m add "Chose PostgreSQL over MongoDB: needed ACID compliance for financial transactions and complex JOIN queries for reporting." \ - -t "Database Choice: PostgreSQL for ACID" \ - -i 0.9 --unit-type decision -l database -l architecture - -# Development workflow tip -nmem m add "Docker build cache: COPY package*.json and run npm install BEFORE copying source files. Saves rebuild time on code changes." 
\ - -t "Docker Build Cache Optimization" \ - -i 0.7 --unit-type procedure -l docker -l devops - -# Debugging lesson -nmem m add "CORS preflight fails silently in fetch. Check Network tab for OPTIONS request, not just the main request." \ - -t "CORS Debugging: Check OPTIONS Request" \ - -i 0.8 --unit-type learning -l debugging -l web -``` - -## Links - -- [Documentation](https://mem.nowledge.co/docs) -- [Nowledge Mem](https://mem.nowledge.co) -- [Discord Community](https://nowled.ge/discord) +Prefer atomic, standalone memories with strong titles and clear meaning. Focus on what was learned or decided, not routine chatter. diff --git a/nowledge-mem-npx-skills/skills/read-working-memory/SKILL.md b/nowledge-mem-npx-skills/skills/read-working-memory/SKILL.md index bdafec1c3..767bd21b4 100644 --- a/nowledge-mem-npx-skills/skills/read-working-memory/SKILL.md +++ b/nowledge-mem-npx-skills/skills/read-working-memory/SKILL.md @@ -29,12 +29,14 @@ description: Read your daily Working Memory briefing to understand current conte ## Usage -Read the Working Memory file directly: +Read Working Memory with `nmem` first: ```bash -cat ~/ai-now/memory.md +nmem --json wm read ``` +If it succeeds but reports `exists: false`, say there is no Working Memory briefing yet. Only fall back to `~/ai-now/memory.md` for older local-only setups. + ### What You'll Find The Working Memory briefing contains: @@ -56,9 +58,9 @@ The Working Memory briefing contains: ```bash # Read today's briefing -cat ~/ai-now/memory.md +nmem --json wm read -# Check if the file exists first +# Legacy local-only fallback test -f ~/ai-now/memory.md && cat ~/ai-now/memory.md || echo "No Working Memory found. Ensure Nowledge Mem is running with Background Intelligence enabled." 
``` diff --git a/nowledge-mem-npx-skills/skills/save-handoff/SKILL.md b/nowledge-mem-npx-skills/skills/save-handoff/SKILL.md new file mode 100644 index 000000000..e07c9ba44 --- /dev/null +++ b/nowledge-mem-npx-skills/skills/save-handoff/SKILL.md @@ -0,0 +1,58 @@ +--- +name: save-handoff +description: Save a concise handoff summary only when the user explicitly requests it. Use this for resumable progress notes in generic agent environments where a real session importer is not guaranteed. +--- + +# Save Handoff + +> Persist a compact resumable handoff when the user wants a restart point, summary, or checkpoint. + +## When to Use + +**Only activate when user explicitly says:** + +- "Save a handoff" +- "Checkpoint this" +- "Leave me a summary" +- "Remember where we are" + +**Never auto-save or suggest saving.** This is always user-initiated. + +## Why This Is A Handoff + +This reusable package works across many agents. It must not pretend to have a real transcript importer when the runtime may not provide one. + +So this skill creates a structured handoff summary with `nmem t create` instead of claiming a lossless thread save. + +If the user's tool has a dedicated Nowledge integration with real thread import, prefer that native package instead. + +## Usage + +Create a structured handoff thread: + +```bash +nmem --json t create -t "Session Handoff - " -c "Goal: ... Decisions: ... Files: ... Risks: ... Next: ..." -s generic-agent +``` + +### Handoff Format + +Include these fields: + +- Goal +- Decisions +- Files +- Risks +- Next + +### Response Format + +After successful save: + +``` +βœ“ Handoff saved +Title: {title} +Summary: {content} +Thread ID: {thread_id} +``` + +Never present this as a real transcript import. 
diff --git a/nowledge-mem-npx-skills/skills/save-thread/SKILL.md b/nowledge-mem-npx-skills/skills/save-thread/SKILL.md deleted file mode 100644 index 2d19170f3..000000000 --- a/nowledge-mem-npx-skills/skills/save-thread/SKILL.md +++ /dev/null @@ -1,114 +0,0 @@ ---- -name: save-thread -description: Save complete conversation as checkpoint. Only when user explicitly requests ("save session", "checkpoint this"). Use nmem t save to automatically import coding sessions. ---- - -# Save Thread - -> Persist complete coding sessions to your personal knowledge base for future reference. - -## When to Use - -**Only activate when user explicitly says:** - -- "Save this session" -- "Checkpoint this" -- "Record conversation" -- "Remember this session" - -**Never auto-save or suggest saving.** This is always user-initiated. - -## Prerequisites - -**nmem CLI** - Choose one option: - -**Option 1: uvx (Recommended)** -```bash -curl -LsSf https://astral.sh/uv/install.sh | sh -uvx --from nmem-cli nmem --version -``` - -**Option 2: pip** -```bash -pip install nmem-cli -nmem --version -``` - -Ensure Nowledge Mem server is running at `http://localhost:14242` - -## Usage - -Use `nmem t save` to automatically import the current coding session: - -```bash -# Save current session for current project -nmem t save --from claude-code - -# Save with custom summary (recommended) -nmem t save --from claude-code -s "Brief summary of what was accomplished" - -# Save all sessions for current project -nmem t save --from claude-code -m all - -# Save for specific project path -nmem t save --from claude-code -p /path/to/project -``` - -### Available Options - -| Flag | Description | Example | -|------|-------------|---------| -| `--from` | Source app | `--from claude-code` | -| `-s, --summary` | Brief summary | `-s "Fixed auth bug"` | -| `-m, --mode` | `current` or `all` | `-m all` | -| `-p, --project` | Project path | `-p /path/to/project` | -| `--truncate` | Truncate large results | `--truncate` | - 
-### Behavior - -- **Auto-detects sessions** from `~/.claude/projects/` -- **Idempotent**: Re-running appends only new messages -- **Thread ID**: Auto-generated as `claude-code-{session_id}` - -## Thread vs Memory - -| Thread | Memory | -|--------|--------| -| Full conversation history | Distilled insights | -| Complete context | Atomic, searchable facts | -| Session checkpoint | Actionable knowledge | - -Both serve different purposes - you can save a thread AND distill key memories. - -## Response Format - -After successful save: - -``` -βœ“ Thread saved -Summary: {summary} -Messages: {count} -Thread ID: claude-code-{session_id} -``` - -## Examples - -```bash -# Basic save -nmem t save --from claude-code - -# Save with descriptive summary -nmem t save --from claude-code -s "Implemented JWT authentication with refresh tokens" - -# Save all sessions -nmem t save --from claude-code -m all - -# Save specific project -nmem t save --from claude-code -p ~/projects/my-app -s "API refactoring complete" -``` - -## Links - -- [Documentation](https://mem.nowledge.co/docs) -- [Nowledge Mem](https://mem.nowledge.co) -- [Discord Community](https://nowled.ge/discord) diff --git a/nowledge-mem-npx-skills/skills/search-memory/SKILL.md b/nowledge-mem-npx-skills/skills/search-memory/SKILL.md index 8c76055df..f8c73da4d 100644 --- a/nowledge-mem-npx-skills/skills/search-memory/SKILL.md +++ b/nowledge-mem-npx-skills/skills/search-memory/SKILL.md @@ -5,122 +5,21 @@ description: Search your personal knowledge base when past insights would improv # Search Memory -> AI-powered semantic search across your personal knowledge base using Nowledge Mem. +> AI-powered search across your personal knowledge base using Nowledge Mem. 
## When to Use -**Strong signals to search:** +Search when: -- Continuity: Current topic connects to prior work -- Pattern match: Problem resembles past solved issue -- Decision context: "Why/how we chose X" implies documented rationale -- Recurring theme: Topic discussed in past sessions -- Implicit recall: "that approach", "like before" +- the user references previous work, a prior fix, or an earlier decision +- the task resumes a named feature, bug, refactor, incident, or subsystem +- a debugging pattern resembles something solved earlier +- the user asks for rationale, preferences, procedures, or recurring workflow details +- the current result is ambiguous and past context would make the answer sharper -**Contextual signals:** +## Retrieval Routing -- Complex debugging (may match past root causes) -- Architecture discussion (choices may be documented) -- Domain-specific question (conventions likely stored) - -**Skip when:** - -- Fundamentally new topic -- Generic syntax questions -- Fresh perspective explicitly requested - -## Prerequisites - -**nmem CLI** - Choose one option: - -**Option 1: uvx (Recommended)** -```bash -# Install uv if needed -curl -LsSf https://astral.sh/uv/install.sh | sh - -# Run nmem directly (auto-downloads) -uvx --from nmem-cli nmem --version -``` - -**Option 2: pip** -```bash -pip install nmem-cli -nmem --version -``` - -Ensure Nowledge Mem server is running at `http://localhost:14242` - -## Usage - -Use `nmem` CLI with `--json` flag for programmatic search: - -```bash -# Basic search -nmem --json m search "your query here" - -# With importance filter -nmem --json m search "API design" --importance 0.8 - -# With labels (multiple labels use AND logic) -nmem --json m search "authentication" -l backend -l security - -# With time filter -nmem --json m search "meeting notes" -t week - -# Limit results -nmem --json m search "debugging tips" -n 5 -``` - -### Query Guidelines - -- Extract semantic core from user's request -- Preserve domain 
terminology -- Multi-language aware (works with any language) -- Use 3-7 core concepts for best results - -### Available Filters - -| Flag | Description | Example | -|------|-------------|---------| -| `--importance MIN` | Minimum importance (0.0-1.0) | `--importance 0.7` | -| `-l, --label LABEL` | Filter by label (repeatable) | `-l frontend -l react` | -| `-t, --time RANGE` | Time filter | `-t today`, `-t week`, `-t month` | -| `-n NUM` | Limit results | `-n 5` | -| `--unit-type TYPE` | Filter by memory type | `--unit-type decision` | - -Available unit types: `fact`, `preference`, `decision`, `plan`, `procedure`, `learning`, `context`, `event`. - -### Understanding Results - -Parse the `memories` array from JSON response. Check `score` field: - -- **0.6-1.0**: Directly relevant - include in response -- **0.3-0.6**: Related context - may be useful -- **< 0.3**: Skip - not relevant enough - -Results may include a `source_thread` field linking the memory to the conversation it was distilled from. Use `nmem --json t show ` to fetch the full conversation for deeper context. - -## Response Guidelines - -**Found relevant memories:** Synthesize insights, cite when helpful - -**No results:** State clearly, suggest distilling current discussion if valuable - -## Examples - -```bash -# Search for React patterns -nmem --json m search "React hooks patterns" -l frontend - -# Find debugging solutions -nmem --json m search "memory leak debugging" --importance 0.6 - -# Recent project decisions -nmem --json m search "architecture decision" -t month -n 10 -``` - -## Links - -- [Documentation](https://mem.nowledge.co/docs) -- [Nowledge Mem](https://mem.nowledge.co) -- [Discord Community](https://nowled.ge/discord) +1. Start with `nmem --json m search` for durable knowledge. +2. Use `nmem --json t search` when the user is really asking about a prior conversation or exact session history. +3. 
If a result includes `source_thread`, inspect it progressively with `nmem --json t show --limit 8 --offset 0 --content-limit 1200`. +4. Prefer the smallest retrieval surface that answers the question. diff --git a/nowledge-mem-openclaw-plugin/CHANGELOG.md b/nowledge-mem-openclaw-plugin/CHANGELOG.md index 54e4e5f71..8d327890d 100644 --- a/nowledge-mem-openclaw-plugin/CHANGELOG.md +++ b/nowledge-mem-openclaw-plugin/CHANGELOG.md @@ -2,6 +2,16 @@ All notable changes to the Nowledge Mem OpenClaw plugin will be documented in this file. +## [0.6.9] - 2026-03-05 + +### Fixed + +- **Auto-recall searched with full prompt instead of user message**: `event.prompt` contains the entire conversation history (system prompt + all prior turns), making recalled memories irrelevant as conversation grows. Now uses a tiered query strategy: substantial messages (>=40 chars) search alone; short messages (3-39 chars, likely follow-ups like "explain that more") include the last 3 messages as context for topic grounding; tiny messages (<3 chars) skip recall. Falls back to truncated `event.prompt` (500 chars) only when `event.messages` is unavailable. + +### Added + +- **`recallMinScore` config option** (0-100, default 0): Minimum relevance score threshold for auto-recalled memories. Set to e.g. 30 to filter out low-confidence results. Configurable via OpenClaw Config UI, config file, or `NMEM_RECALL_MIN_SCORE` env var. + ## [0.6.8] - 2026-02-27 ### Changed diff --git a/nowledge-mem-openclaw-plugin/CLAUDE.md b/nowledge-mem-openclaw-plugin/CLAUDE.md index d4f09b7bc..88e42506c 100644 --- a/nowledge-mem-openclaw-plugin/CLAUDE.md +++ b/nowledge-mem-openclaw-plugin/CLAUDE.md @@ -62,7 +62,7 @@ openclaw.plugin.json - manifest + config schema (version, uiHints, configSchema) ### Thread Tools (progressive conversation retrieval) - `nowledge_mem_thread_search` - search past conversations by keyword. Returns threads with matched message snippets, relevance scores, and message counts. 
Supports `source` filter. -- `nowledge_mem_thread_fetch` - fetch full messages from a specific thread. Supports pagination via `offset` + `limit` for progressive retrieval of long conversations. +- `nowledge_mem_thread_fetch` - fetch messages from a specific thread. Start with a small page, then use `offset` + `limit` for progressive retrieval only when more context is needed. ### Diagnostics - `nowledge_mem_status` - show effective config (mode, apiUrl, apiKey set, sessionContext, sessionDigest, etc.), backend connectivity, and version. No parameters. @@ -91,6 +91,7 @@ Optional config file at `~/.nowledge-mem/openclaw.json`. Falls through to OpenCl | `sessionDigest` | boolean | `true` | `NMEM_SESSION_DIGEST` | Thread capture + LLM distillation at session end | | `digestMinInterval` | integer 0-86400 | `300` | `NMEM_DIGEST_MIN_INTERVAL` | Minimum seconds between session digests | | `maxContextResults` | integer 1-20 | `5` | `NMEM_MAX_CONTEXT_RESULTS` | How many memories to inject at prompt time | +| `recallMinScore` | integer 0-100 | `0` | `NMEM_RECALL_MIN_SCORE` | Min relevance score (%) to include in auto-recall | | `apiUrl` | string | `""` | `NMEM_API_URL` | Remote server URL. Empty = local (127.0.0.1:14242) | | `apiKey` | string | `""` | `NMEM_API_KEY` | API key. Never logged. | diff --git a/nowledge-mem-openclaw-plugin/README.md b/nowledge-mem-openclaw-plugin/README.md index 88deab0ab..5267f9675 100644 --- a/nowledge-mem-openclaw-plugin/README.md +++ b/nowledge-mem-openclaw-plugin/README.md @@ -337,6 +337,7 @@ To change settings, use the OpenClaw plugin settings UI. 
Changes take effect on | `sessionDigest` | boolean | `true` | Thread capture + LLM distillation at session end | | `digestMinInterval` | integer | `300` | Minimum seconds between session digests for the same thread (0-86400) | | `maxContextResults` | integer | `5` | Max memories to inject at prompt time (1-20, only used when sessionContext is enabled) | +| `recallMinScore` | integer | `0` | Min relevance score (0-100%) to include in auto-recall. 0 = include all | | `apiUrl` | string | `""` | Remote server URL. Empty = local (`http://127.0.0.1:14242`) | | `apiKey` | string | `""` | API key for remote access. Injected as `NMEM_API_KEY` env var, never logged | @@ -350,6 +351,7 @@ For persistent or scripted config, create `~/.nowledge-mem/openclaw.json`: "sessionDigest": true, "digestMinInterval": 300, "maxContextResults": 5, + "recallMinScore": 0, "apiUrl": "", "apiKey": "" } @@ -366,6 +368,7 @@ NMEM_SESSION_CONTEXT=true NMEM_SESSION_DIGEST=true NMEM_DIGEST_MIN_INTERVAL=300 NMEM_MAX_CONTEXT_RESULTS=5 +NMEM_RECALL_MIN_SCORE=0 NMEM_API_URL=https://... NMEM_API_KEY=your-key ``` diff --git a/nowledge-mem-openclaw-plugin/openclaw.plugin.json b/nowledge-mem-openclaw-plugin/openclaw.plugin.json index 1c2a4afa2..cd9da15a1 100644 --- a/nowledge-mem-openclaw-plugin/openclaw.plugin.json +++ b/nowledge-mem-openclaw-plugin/openclaw.plugin.json @@ -1,15 +1,15 @@ { "id": "openclaw-nowledge-mem", - "version": "0.6.8", + "version": "0.6.9", "kind": "memory", "uiHints": { "sessionContext": { "label": "Session context injection", - "help": "Inject Working Memory and relevant memories at prompt time. Off by default \u2014 the agent can call memory_search on demand instead." + "help": "Inject Working Memory and relevant memories at prompt time. Off by default; the agent can call memory_search on demand instead." }, "sessionDigest": { "label": "Session digest at end", - "help": "Capture conversation threads and distill key memories via LLM at session end. 
On by default \u2014 negligible cost, one triage call per session." + "help": "Capture conversation threads and distill key memories via LLM at session end. On by default. Negligible cost, one triage call per session." }, "digestMinInterval": { "label": "Minimum digest interval (seconds)", @@ -17,7 +17,11 @@ }, "maxContextResults": { "label": "Max context results", - "help": "How many memories to inject for each context cycle (1\u201320)" + "help": "How many memories to inject for each context cycle (1-20)" + }, + "recallMinScore": { + "label": "Min recall score (%)", + "help": "Only inject memories with relevance score above this threshold (0-100). Set to 0 to include all results." }, "apiUrl": { "label": "Server URL (remote mode)", @@ -25,7 +29,7 @@ }, "apiKey": { "label": "API key (remote mode)", - "help": "Required when connecting to a remote server. Never logged or passed as a CLI argument \u2014 injected as NMEM_API_KEY env var only.", + "help": "Required when connecting to a remote server. Never logged or passed as a CLI argument. Injected as NMEM_API_KEY env var only.", "secret": true } }, @@ -35,7 +39,7 @@ "sessionContext": { "type": "boolean", "default": false, - "description": "Inject Working Memory and recalled memories at prompt time. Off by default \u2014 the agent queries memory on demand via tools." + "description": "Inject Working Memory and recalled memories at prompt time. Off by default; the agent queries memory on demand via tools." }, "sessionDigest": { "type": "boolean", @@ -56,6 +60,13 @@ "maximum": 20, "description": "Maximum memories to inject at prompt time (only used when sessionContext is enabled)" }, + "recallMinScore": { + "type": "integer", + "default": 0, + "minimum": 0, + "maximum": 100, + "description": "Minimum relevance score (0-100%) to include a memory in auto-recall. Set to 0 to include all results." 
+ }, "apiUrl": { "type": "string", "default": "", diff --git a/nowledge-mem-openclaw-plugin/package.json b/nowledge-mem-openclaw-plugin/package.json index 13e09573c..822a1726c 100644 --- a/nowledge-mem-openclaw-plugin/package.json +++ b/nowledge-mem-openclaw-plugin/package.json @@ -1,6 +1,6 @@ { "name": "@nowledge/openclaw-nowledge-mem", - "version": "0.6.8", + "version": "0.6.9", "type": "module", "description": "Nowledge Mem memory plugin for OpenClaw, local-first personal knowledge base", "author": { diff --git a/nowledge-mem-openclaw-plugin/src/config.js b/nowledge-mem-openclaw-plugin/src/config.js index 2e7d9f837..6f7b723d1 100644 --- a/nowledge-mem-openclaw-plugin/src/config.js +++ b/nowledge-mem-openclaw-plugin/src/config.js @@ -19,6 +19,7 @@ const ALLOWED_KEYS = new Set([ "sessionDigest", "digestMinInterval", "maxContextResults", + "recallMinScore", "apiUrl", "apiKey", // Legacy aliases β€” accepted but not advertised @@ -135,7 +136,7 @@ function firstDefined(...options) { * will configure via OpenClaw's plugin settings UI (pluginConfig). 
* * Canonical keys: sessionContext, sessionDigest, digestMinInterval, - * maxContextResults, apiUrl, apiKey + * maxContextResults, recallMinScore, apiUrl, apiKey * * Legacy aliases (accepted from all sources; never shown in docs): * autoRecall β†’ sessionContext @@ -148,6 +149,7 @@ function firstDefined(...options) { * NMEM_SESSION_DIGEST β€” true/1/yes to enable (alias: NMEM_AUTO_CAPTURE) * NMEM_DIGEST_MIN_INTERVAL β€” seconds (0-86400) * NMEM_MAX_CONTEXT_RESULTS β€” integer (1-20) + * NMEM_RECALL_MIN_SCORE β€” integer (0-100) * NMEM_API_URL β€” remote server URL * NMEM_API_KEY β€” API key (never logged) */ @@ -234,6 +236,20 @@ export function parseConfig(raw, logger) { const maxContextResults = Math.min(20, Math.max(1, Math.trunc(mcr.value))); _sources.maxContextResults = mcr.source; + // --- recallMinScore: file > pluginConfig > env > default --- + const rmsEnv = envInt("NMEM_RECALL_MIN_SCORE"); + const rms = firstDefined( + { value: pickNum(resolvedFile, "recallMinScore"), source: "file" }, + { + value: pickNum(resolvedPlugin, "recallMinScore"), + source: "pluginConfig", + }, + { value: rmsEnv, source: "env" }, + { value: 0, source: "default" }, + ); + const recallMinScore = Math.min(100, Math.max(0, Math.trunc(rms.value))); + _sources.recallMinScore = rms.source; + // --- apiUrl: file > pluginConfig > env > "" --- const fileUrl = typeof resolvedFile.apiUrl === "string" && resolvedFile.apiUrl.trim(); @@ -267,6 +283,7 @@ export function parseConfig(raw, logger) { sessionDigest, digestMinInterval, maxContextResults, + recallMinScore, apiUrl, apiKey, _sources, diff --git a/nowledge-mem-openclaw-plugin/src/hooks/recall.js b/nowledge-mem-openclaw-plugin/src/hooks/recall.js index 04f899b8f..35cf742df 100644 --- a/nowledge-mem-openclaw-plugin/src/hooks/recall.js +++ b/nowledge-mem-openclaw-plugin/src/hooks/recall.js @@ -13,20 +13,148 @@ function escapeForPrompt(text) { ); } +/** Max query length sent to search β€” longer messages get truncated. 
*/ +const MAX_QUERY_LENGTH = 500; + +/** + * Messages shorter than this get augmented with recent conversational + * context. Below this threshold, a message is likely referential + * ("explain that more", "what about the second point?") and needs + * surrounding context to produce useful search results. + * + * Messages at or above this threshold are substantial enough to + * search on their own ("openviking 不ε₯½η”¨", "how do I deploy to k8s?"). + */ +const SHORT_QUERY_THRESHOLD = 40; + +/** How many recent messages to include for short-query context. */ +const CONTEXT_MESSAGES = 3; + +/** Per-message cap when building context window. */ +const CONTEXT_MSG_MAX_CHARS = 150; + +/** + * Extract text from message content (string or structured blocks). + * Mirrors the pattern from capture.js. + */ +function extractText(content) { + if (typeof content === "string") return content.trim(); + if (!Array.isArray(content)) return ""; + const parts = []; + for (const block of content) { + if (!block || typeof block !== "object") continue; + if (block.type === "text" && typeof block.text === "string") { + const text = block.text.trim(); + if (text) parts.push(text); + } + } + return parts.join("\n").trim(); +} + +/** + * Normalize a raw message entry to { role, text }. + * Handles both flat { role, content } and nested { message: { role, content } }. + * Returns null for non-user/assistant messages, empty content, or slash commands. + */ +function normalizeMessage(raw) { + if (!raw || typeof raw !== "object") return null; + const msg = + raw.message && typeof raw.message === "object" ? raw.message : raw; + const role = typeof msg.role === "string" ? 
 msg.role : ""; + if (role !== "user" && role !== "assistant") return null; + const text = extractText(msg.content); + if (!text) return null; + // Skip slash commands — they're plugin directives, not conversational + if (role === "user" && text.startsWith("/")) return null; + return { role, text }; +} + +/** + * Build the search query from the hook event. + * + * Three tiers: + * >= 40 chars : latest user message alone (self-contained) + * 3-39 chars : latest user message + recent context (likely referential) + * < 3 chars : skip recall (too short — "ok", emoji, etc.) + * + * The event object provides: + * - event.messages: structured array of {role, content} messages (preferred) + * - event.prompt: the full formatted prompt (fallback, truncated) + */ +function buildSearchQuery(event) { + const messages = event?.messages; + + if (Array.isArray(messages) && messages.length > 0) { + // Find the latest user message (walk backwards) + let latestUserIdx = -1; + let latestUserText = ""; + for (let i = messages.length - 1; i >= 0; i--) { + const norm = normalizeMessage(messages[i]); + if (norm && norm.role === "user") { + latestUserIdx = i; + latestUserText = norm.text; + break; + } + } + + if (!latestUserText || latestUserText.length < 3) return ""; + + // Substantial message — use it alone + if (latestUserText.length >= SHORT_QUERY_THRESHOLD) { + return latestUserText.slice(0, MAX_QUERY_LENGTH); + } + + // Short message — likely a follow-up. Add recent context so + // the search engine sees what topic the conversation is about. + // Collect up to CONTEXT_MESSAGES messages preceding the latest user + // message (both user and assistant, for topic grounding). + const contextParts = []; + const scanFrom = Math.max(0, latestUserIdx - CONTEXT_MESSAGES); + for (let i = scanFrom; i < latestUserIdx; i++) { + const norm = normalizeMessage(messages[i]); + if (!norm) continue; + const truncated = + norm.text.length > CONTEXT_MSG_MAX_CHARS + ? 
 `${norm.text.slice(0, CONTEXT_MSG_MAX_CHARS)}…` + : norm.text; + contextParts.push(truncated); + } + + if (contextParts.length > 0) { + // Latest message first (primary intent), then context + const combined = `${latestUserText}\n\n${contextParts.join("\n")}`; + return combined.slice(0, MAX_QUERY_LENGTH); + } + + // No prior context available (first message) — use as-is + return latestUserText; + } + + // Fallback: use event.prompt but truncate aggressively. + // This path only fires when OpenClaw doesn't provide event.messages. + if (typeof event?.prompt === "string" && event.prompt.length >= 5) { + return event.prompt.slice(0, MAX_QUERY_LENGTH); + } + + return ""; +} + /** - * Builds the before_agent_start hook handler. + * Builds the before_prompt_build hook handler. * - * Injects two layers of context at session start: + * Injects two layers of context at prompt time: * 1. Working Memory — today's focus, priorities, unresolved flags - * 2. Relevant memories — with types, labels, and source provenance + * 2. Relevant memories — searched using the user's latest message * * Tool guidance is minimal — the agent already sees full tool descriptions * in its tool list. We only add a brief behavioral note. */ export function buildRecallHandler(client, cfg, logger) { + const minScore = (cfg.recallMinScore ?? 0) / 100; // config is 0-100, API is 0-1 + return async (event) => { - const prompt = event.prompt; - if (!prompt || prompt.length < 5) return; + const searchQuery = buildSearchQuery(event); + if (!searchQuery) return; const sections = []; @@ -44,9 +172,17 @@ export function buildRecallHandler(client, cfg, logger) { // 2. 
Relevant memories — enriched with scoring signals and labels try { - const results = await client.searchRich(prompt, cfg.maxContextResults); - if (results.length > 0) { - const lines = results.map((r) => { + const results = await client.searchRich( + searchQuery, + cfg.maxContextResults, + ); + // Filter by minimum score if configured + const filtered = + minScore > 0 + ? results.filter((r) => (r.score ?? 0) >= minScore) + : results; + if (filtered.length > 0) { + const lines = filtered.map((r) => { const title = r.title || "(untitled)"; const score = `${(r.score * 100).toFixed(0)}%`; const labels = @@ -82,7 +218,9 @@ export function buildRecallHandler(client, cfg, logger) { "", ].join("\n"); - logger.debug?.(`recall: injecting ${context.length} chars`); + logger.debug?.( + `recall: injecting ${context.length} chars (query: ${searchQuery.slice(0, 80)}…)`, + ); return { prependContext: context }; }; } diff --git a/nowledge-mem-openclaw-plugin/src/tools/status.js b/nowledge-mem-openclaw-plugin/src/tools/status.js index 321e0b421..bec1ca6cc 100644 --- a/nowledge-mem-openclaw-plugin/src/tools/status.js +++ b/nowledge-mem-openclaw-plugin/src/tools/status.js @@ -106,6 +106,9 @@ export function createStatusTool(client, _logger, cfg) { lines.push( ` maxContextResults: ${cfg.maxContextResults} (${sources.maxContextResults || "?"})`, ); + lines.push( + ` recallMinScore: ${cfg.recallMinScore}% (${sources.recallMinScore || "?"})`, + ); details.config = { sessionContext: { @@ -124,6 +127,10 @@ export function createStatusTool(client, _logger, cfg) { value: cfg.maxContextResults, source: sources.maxContextResults, }, + recallMinScore: { + value: cfg.recallMinScore, + source: sources.recallMinScore, + }, apiUrl: { value: cfg.apiUrl || "(local)", source: sources.apiUrl }, apiKey: { value: cfg.apiKey ? 
"(set)" : "(not set)", diff --git a/static/nowledge-mem-icon-512x512.png b/static/nowledge-mem-icon-512x512.png new file mode 100644 index 000000000..9b863a26e Binary files /dev/null and b/static/nowledge-mem-icon-512x512.png differ