Commit 8fbbcc3

fixes readme
1 parent 0785ffc commit 8fbbcc3

8 files changed · 123 additions & 82 deletions

File tree:

- agent/context/README.md
- agent/context/checkLimit.ts
- agent/context/compact.ts
- agent/context/compress.ts
- agent/context/index.ts
- agent/context/package.json
- agent/context/summarize.ts
- agent/context/usage.ts

agent/context/README.md

Lines changed: 5 additions & 5 deletions
````diff
@@ -37,9 +37,9 @@ Both [Anthropic](https://docs.anthropic.com/en/docs/build-with-claude/compaction
 
 - **Keep summaries intact** — every summarization pass discards detail. We never re-summarize existing summaries; they're preserved as-is and new summaries are added alongside them ([LangChain `moving_summary_buffer`](https://langchain-doc.readthedocs.io/en/latest/modules/memory/types/summary_buffer.html)).
 - **Keep recent turns verbatim** — the most recent exchanges carry the highest signal. Summarize the older prefix, never the tail ([Anthropic](https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents), [LangChain summary-buffer](https://langchain-doc.readthedocs.io/en/latest/modules/memory/types/summary_buffer.html)). Anthropic's Claude Code uses the same shape: compressed context + the N most recently accessed items ([source](https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents)).
-- **Use cost as a secondary signal** — large cache write costs indicate a bloated context even if token counts look fine. We use cost thresholds alongside token thresholds ([Anthropic prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching)).
-- **Compact before hitting the hard limit** — model recall drops well before the hard context window. Compact proactively at a soft threshold, not at the edge ([OpenAI cookbook](https://cookbook.openai.com/examples/context_summarization_with_realtime_api), [Anthropic post on compaction](https://docs.anthropic.com/en/docs/build-with-claude/compaction), [Anthropic post on context engineering](https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents)).
+- **Compact before hitting the hard limit** — model recall drops well before the hard context window. We compact at a configurable soft token limit (defaults to 60% of the context window), matching the industry-standard approach used by [Claude Code](https://docs.anthropic.com/en/docs/build-with-claude/compaction), [OpenAI](https://developers.openai.com/api/docs/guides/context-management), and [LangChain](https://langchain-doc.readthedocs.io/en/latest/modules/memory/types/summary_buffer.html).
 - **Don't summarize old tool call results** — raw tool output is useful when fresh but redundant once acted on. Clearing old results is the lightest-touch compaction step ([Anthropic](https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents)).
+- **Compress tool results in-place** — whether or not we compact the messages, we always run tool-result compression, which truncates tool results older than the last 3 user turns.
 
 ## What you should know
 
@@ -49,7 +49,7 @@ Both [Anthropic](https://docs.anthropic.com/en/docs/build-with-claude/compaction
 
 When you call `compact()`:
 
-1. **`checkLimit()`** — checks thresholds (cache write cost, cache write tokens, soft token limit). First match fires a `reached: true`. This mirrors Anthropic's token-threshold trigger and OpenAI's `compact_threshold`.
+1. **`checkLimit()`** — checks whether context tokens exceed the soft token limit (defaults to 60% of the context window). Returns `reached: true` when the limit is crossed.
 2. **`split()`** — finds the boundary between "prefix to summarize" and "tail to preserve." We try to keep 5 recent user turns, then 3, then 1, each checked against a token budget (40% of the context window). Falls back to the largest token-bounded suffix that fits. Token-bounded retention follows the LangChain `max_token_limit` pattern.
 3. **`summarize()`** — sends the messages selected for compaction to the LLM with a summarization prompt. Custom instructions replace the default prompt (when `instructions.strategy` is set to `replace`), matching Anthropic's `instructions` parameter behavior.
 4. **Reassemble** — we combine the preserved messages from `split()` with the new summary into a single message array.
@@ -58,7 +58,7 @@ When you call `compact()`:
 
 ```ts
 import type { Model, Api } from '@mariozechner/pi-ai';
-import { checkLimit, split, summarize } from '@kvendrik/compact';
+import { checkLimit, split, summarize, compressToolResults } from '@kvendrik/compact';
 
 const model: {model: Model<Api>, key: string} = {...};
 const messages: AgentMessage[] = [];
@@ -79,5 +79,5 @@ if (limit.reached) {
   return [...compacted, ...preserve];
 }
 
-return messages;
+return compressToolResults(messages);
 ```
````
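The in-place compression the README now describes is easiest to see on a concrete transcript. A minimal sketch — the message literals are simplified stand-ins for the `AgentMessage` shapes from `@mariozechner/pi-agent-core` (hence the cast), showing only the `role` and `content` fields that `compressToolResults` reads:

```ts
import type { AgentMessage } from '@mariozechner/pi-agent-core';
import { compressToolResults } from '@kvendrik/compact';

// The tool result sits before the 3rd-from-last user turn, so it is
// eligible for truncation; everything from that turn onward stays verbatim.
const messages = [
  { role: 'user', content: 'Run the test suite.' },
  { role: 'toolResult', content: [{ type: 'text', text: 'x'.repeat(5000) }] },
  { role: 'user', content: 'Why did the split test fail?' },
  { role: 'user', content: 'Fix it.' },
  { role: 'user', content: 'Re-run the tests.' },
] as unknown as AgentMessage[];

const compressed = compressToolResults(messages);
// The toolResult's text block is now 200 chars plus a '\n[truncated]' marker.
```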

agent/context/checkLimit.ts

Lines changed: 2 additions & 39 deletions
```diff
@@ -1,29 +1,16 @@
 import type { Model, Api } from '@mariozechner/pi-ai';
 import type { AgentMessage } from '@mariozechner/pi-agent-core';
-import { getLatestAssistantUsage, usage } from './usage';
+import { usage } from './usage';
 
 export interface Limits {
   /** Percentage of the context window to compact at (default is 60%). Model recall degrades well
    * before the hard limit, so we compact proactively.
    * @default 60 */
   softTokenLimit?: number;
-  /** High cache-write cost (in USD) signals a bloated context even when token
-   * counts look fine. Assumes Sonnet-class pricing (~$3/MTok input);
-   * adjust if the default model tier changes significantly.
-   * @default 0.05 */
-  cacheWriteCostLimit?: number;
-  /** If cache-write tokens exceed this share of the context window,
-   * trigger compaction. Catches uncached context growth that the
-   * token-based soft limit might miss on the first turn after a
-   * cache bust.
-   * @default 40 */
-  cacheWriteTokenLimit?: number;
 }
 
-export const DEFAULT_LIMITS: Required<Limits> = {
+const DEFAULT_LIMITS: Required<Limits> = {
   softTokenLimit: 60,
-  cacheWriteCostLimit: 0.05,
-  cacheWriteTokenLimit: 40,
 };
 
 interface CheckOptions {
@@ -44,31 +31,7 @@ export function checkLimit(
   const currentUsage = usage(messages);
   const contextWindow = model.contextWindow;
   const softLimit = Math.floor(contextWindow * (limits.softTokenLimit / 100));
-
-  const latestTurnTokenLimit = Math.floor(
-    contextWindow * (limits.cacheWriteTokenLimit / 100),
-  );
-
   const currentTokens = currentUsage.tokens.used;
-  const latestUsage = getLatestAssistantUsage(messages);
-  const latestTurnCacheWriteCost =
-    currentUsage.cost.cacheWrite === 0
-      ? undefined
-      : currentUsage.cost.cacheWrite;
-  const latestTurnCacheWriteTokens = latestUsage?.cacheWrite ?? 0;
-
-  if (
-    typeof latestTurnCacheWriteCost === 'number' &&
-    latestTurnCacheWriteCost >= limits.cacheWriteCostLimit
-  ) {
-    const reason = `Cache write cost $${latestTurnCacheWriteCost.toFixed(4)} exceeded $${limits.cacheWriteCostLimit.toFixed(2)} threshold`;
-    return { reached: true, reason };
-  }
-
-  if (latestTurnCacheWriteTokens >= latestTurnTokenLimit) {
-    const reason = `Cache write tokens ${latestTurnCacheWriteTokens} exceeded ${latestTurnTokenLimit} token threshold`;
-    return { reached: true, reason };
-  }
 
   if (currentTokens <= softLimit) {
     const reason = `Context is within budget (${currentTokens}/${softLimit} tokens, ${Math.round((currentTokens / softLimit) * 100)}%)`;
```
agent/context/compact.ts

Lines changed: 11 additions & 9 deletions
```diff
@@ -1,8 +1,9 @@
 import type { Model, Api } from '@mariozechner/pi-ai';
 import type { AgentMessage } from '@mariozechner/pi-agent-core';
-import { checkLimit, type Limits, DEFAULT_LIMITS } from './checkLimit';
+import { checkLimit, type Limits } from './checkLimit';
 import { split } from './split';
 import { summarize, type Instructions } from './summarize';
+import { compressToolResults } from './compress';
 
 export interface CompactResult {
   messages: AgentMessage[];
@@ -20,7 +21,7 @@ interface CompactOptions {
 
 export async function compact(
   messages: AgentMessage[],
-  { signal, model, instructions, force, limits }: CompactOptions,
+  { signal, model, instructions, force, limits }: CompactOptions
 ): Promise<CompactResult> {
   const effectiveSignal = signal ?? new AbortController().signal;
 
@@ -34,22 +35,23 @@ export async function compact(
 
   const { reached, reason } = checkLimit(messages, {
     model: model.model,
-    limits: {
-      ...DEFAULT_LIMITS,
-      ...limits,
-    },
+    limits,
   });
 
   if (!reached) {
-    return { messages, didCompact: false, reason };
+    return {
+      messages: compressToolResults(messages),
+      didCompact: false,
+      reason,
+    };
   }
 
   return doCompact(reason);
 
   async function doCompact(trigger: string): Promise<CompactResult> {
     const { compact: messagesToCompact, preserve } = split(
       messages,
-      model.model,
+      model.model
     );
 
     if (messagesToCompact === null) {
@@ -63,7 +65,7 @@ export async function compact(
     });
 
     return {
-      messages: [...compactedMessages, ...preserve],
+      messages: [...compactedMessages, ...compressToolResults(preserve)],
       didCompact: true,
       reason: trigger,
     };
```
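The caller-visible effect of these changes: both return paths of `compact()` now pass through `compressToolResults`. A sketch against the `CompactOptions` shape above (placeholder values via `declare`):

```ts
import type { Model, Api } from '@mariozechner/pi-ai';
import type { AgentMessage } from '@mariozechner/pi-agent-core';
import { compact } from '@kvendrik/compact';

declare const model: { model: Model<Api>; key: string };
declare const history: AgentMessage[];

const result = await compact(history, { model });

// didCompact === false → history with old tool results truncated in place.
// didCompact === true  → [summary, ...compressed preserved tail].
console.log(result.didCompact ? `Compacted: ${result.reason}` : result.reason);
```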

agent/context/compress.ts

Lines changed: 93 additions & 0 deletions
```diff
@@ -0,0 +1,93 @@
+import type { AgentMessage } from '@mariozechner/pi-agent-core';
+import type {
+  TextContent,
+  ImageContent,
+  ToolResultMessage,
+} from '@mariozechner/pi-ai';
+
+/** Number of recent user turns whose tool results are preserved verbatim. */
+const PRESERVE_RECENT_TURNS = 3;
+
+/** Max characters kept per text block in a compressed tool result. */
+const MAX_COMPRESSED_CHARS = 200;
+
+const TRUNCATION_MARKER = '\n[truncated]';
+
+export function compressToolResults(
+  messages: AgentMessage[],
+  {
+    preserveRecentTurns = PRESERVE_RECENT_TURNS,
+    maxCompressedChars = MAX_COMPRESSED_CHARS,
+  }: { preserveRecentTurns?: number; maxCompressedChars?: number } = {}
+): AgentMessage[] {
+  const boundary = findPreserveBoundary(messages);
+
+  if (boundary === 0) {
+    return messages;
+  }
+
+  const result: AgentMessage[] = [];
+
+  for (let idx = 0; idx < messages.length; idx++) {
+    const msg = messages[idx];
+    if (idx < boundary && isToolResult(msg)) {
+      result.push(compressedToolResult(msg));
+    } else {
+      result.push(msg);
+    }
+  }
+
+  return result;
+
+  /** Index of the Nth-from-last user message. Messages before this index are
+   * eligible for compression. Returns 0 when the conversation is too short. */
+  function findPreserveBoundary(messages: AgentMessage[]): number {
+    let userTurnsSeen = 0;
+
+    for (let idx = messages.length - 1; idx >= 0; idx--) {
+      if (messages[idx].role === 'user') {
+        userTurnsSeen++;
+        if (userTurnsSeen === preserveRecentTurns) {
+          return idx;
+        }
+      }
+    }
+
+    return 0;
+  }
+
+  function compressedToolResult(msg: ToolResultMessage): AgentMessage {
+    const compressed: (TextContent | ImageContent)[] = [];
+    let hadImage = false;
+
+    for (const block of msg.content) {
+      if (block.type === 'image') {
+        hadImage = true;
+        continue;
+      }
+
+      if (block.text.length <= maxCompressedChars) {
+        compressed.push(block);
+      } else {
+        compressed.push({
+          type: 'text',
+          text: block.text.slice(0, maxCompressedChars) + TRUNCATION_MARKER,
+        });
+      }
+    }
+
+    if (hadImage) {
+      compressed.push({ type: 'text', text: '[image omitted]' });
+    }
+
+    if (compressed.length === 0) {
+      compressed.push({ type: 'text', text: '[result omitted]' });
+    }
+
+    return { ...msg, content: compressed };
+  }
+}
+
+function isToolResult(msg: AgentMessage): msg is ToolResultMessage {
+  return msg.role === 'toolResult';
+}
```
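Both constants are per-call overrides via the options bag; a brief usage sketch:

```ts
import type { AgentMessage } from '@mariozechner/pi-agent-core';
import { compressToolResults } from '@kvendrik/compact';

declare const messages: AgentMessage[];

// Keep tool results verbatim for the last 5 user turns, and allow up to
// 1000 characters per text block before the '\n[truncated]' marker is added.
const trimmed = compressToolResults(messages, {
  preserveRecentTurns: 5,
  maxCompressedChars: 1000,
});
```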

agent/context/index.ts

Lines changed: 1 addition & 0 deletions
```diff
@@ -3,3 +3,4 @@ export { summarize } from './summarize';
 export { compact, type CompactResult } from './compact';
 export { split } from './split';
 export { checkLimit } from './checkLimit';
+export { compressToolResults } from './compress';
```

agent/context/package.json

Lines changed: 3 additions & 3 deletions
```diff
@@ -2,11 +2,11 @@
   "name": "@kvendrik/compact",
   "version": "0.1.0",
   "description": "A context compaction toolkit for pi-ai",
-  "module": "src/index.ts",
+  "module": "./index.ts",
   "type": "module",
   "license": "MIT",
   "files": [
-    "src",
+    "*.ts",
     "README.md"
   ],
   "repository": {
@@ -22,7 +22,7 @@
   ],
   "scripts": {
     "test": "bun test",
-    "lint": "eslint src/",
+    "lint": "eslint .",
     "typecheck": "tsc --noEmit",
     "test:all": "bun test && bun lint && bun typecheck"
   },
```

agent/context/summarize.ts

Lines changed: 8 additions & 14 deletions
```diff
@@ -1,5 +1,6 @@
 import { completeSimple, type Model, type Api } from '@mariozechner/pi-ai';
 import type { AgentMessage } from '@mariozechner/pi-agent-core';
+import { compressToolResults } from './compress';
 
 const SUMMARIZE_SYSTEM = `You are a summarizer. Given a conversation history, produce a concise summary that preserves key facts, decisions, topics, and context needed to continue the conversation. Output only the summary, no preamble.`;
 
@@ -61,24 +62,15 @@ export async function summarize(
   return [summaryMessage];
 }
 
-function messagesToTranscript(messages: AgentMessage[]): string {
+function messagesToTranscript(allMessages: AgentMessage[]): string {
   const lines: string[] = [];
-  for (const m of messages) {
-    const msg = m as {
-      role: string;
-      content?: string | { type?: string; text?: string }[];
-    };
-
-    if (msg.role === 'tool') {
-      lines.push('tool: [tool output omitted]');
-      continue;
-    }
 
+  const messages = compressToolResults(allMessages);
+
+  for (const msg of messages) {
     const content = msg.content;
-    if (content === undefined) {
-      continue;
-    }
     let text = '';
+
     if (typeof content === 'string') {
       text = content;
     } else if (Array.isArray(content)) {
@@ -93,10 +85,12 @@ function messagesToTranscript(messages: AgentMessage[]): string {
         .map((b) => (b as { text: string }).text)
         .join('');
     }
+
     if (text.trim() !== '') {
       lines.push(`${msg.role}: ${text.trim()}`);
     }
   }
+
   return lines.join('\n\n');
 }
 
```
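The practical difference for the summarizer's input: tool messages used to collapse to `tool: [tool output omitted]`, while a truncated excerpt now survives into the transcript. A hypothetical example of the resulting text:

```ts
// Hypothetical output of messagesToTranscript() after this change — roles and
// failure text are invented; entries are joined with blank lines.
const transcript = `user: run the test suite

toolResult: FAIL split.test.ts — expected boundary at turn 5, got 3
[truncated]

assistant: One test failed; the split boundary looks off by two turns.`;
```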
agent/context/usage.ts

Lines changed: 0 additions & 12 deletions
```diff
@@ -45,18 +45,6 @@ function hasMeaningfulUsage(msgUsage: UsageWithCost): boolean {
   );
 }
 
-export function getLatestAssistantUsage(
-  messages: AgentMessage[],
-): UsageWithCost | null {
-  for (let i = messages.length - 1; i >= 0; i--) {
-    const message = messages[i];
-    if (hasUsage(message)) {
-      return message.usage;
-    }
-  }
-
-  return null;
-}
 
 function getLatestAssistantMessageWithUsage(
   messages: AgentMessage[],
```
