From 99a5cd707bcb452964353e7b9b87deccdb25dc00 Mon Sep 17 00:00:00 2001 From: Chad Woolley Date: Sat, 21 Feb 2026 22:47:24 -0800 Subject: [PATCH] llm/anthropic, agent, attractor: configurable max_tokens with 64K default Make max_tokens configurable at three levels (node attr > stylesheet > provider default) following the existing reasoning_effort pattern. Bump the Anthropic adapter default from 4096 to 65536 so large tool calls (e.g. write_file with 80KB+ content) are not truncated mid-JSON. Co-Authored-By: Claude Opus 4.6 --- .../2026-02-21-configurable-max-tokens.md | 178 ++++++++++++++++++ internal/agent/session.go | 7 + internal/attractor/engine/codergen_router.go | 12 ++ internal/attractor/style/stylesheet.go | 4 +- internal/llm/providers/anthropic/adapter.go | 4 +- .../llm/providers/anthropic/adapter_test.go | 6 +- 6 files changed, 204 insertions(+), 7 deletions(-) create mode 100644 docs/plans/2026-02-21-configurable-max-tokens.md diff --git a/docs/plans/2026-02-21-configurable-max-tokens.md b/docs/plans/2026-02-21-configurable-max-tokens.md new file mode 100644 index 00000000..cbe1adbc --- /dev/null +++ b/docs/plans/2026-02-21-configurable-max-tokens.md @@ -0,0 +1,178 @@ +# Configurable `max_tokens` for LLM API Calls + +**Date:** 2026-02-21 +**Status:** Proposed +**Problem:** The Anthropic provider adapter hardcodes `max_tokens: 4096`. When an agent +emits a large tool call (e.g., `write_file` with 80KB+ content), the output is truncated +mid-JSON, causing schema validation failures (`missing properties: 'content'`). This +creates unrecoverable error loops. + +## Goal + +Make `max_tokens` configurable at three levels (highest priority wins): + +1. **Node attribute** in the pipeline DOT file (per-node override) +2. **Model stylesheet** in the DOT graph attributes (per-class/selector) +3. **Provider adapter default** (bumped from 4096 to 65536) + +This follows the exact same pattern as `reasoning_effort`, which is already configurable +at levels 1 and 2. 
+ +## Design + +### Level of effort: Small + +All plumbing already exists. `llm.Request.MaxTokens` is defined and respected by every +provider adapter. The only missing piece is reading it from node attributes and the +stylesheet, then passing it through to the request. + +## Changes + +### 1. Add `max_tokens` to stylesheet whitelist + +**File:** `internal/attractor/style/stylesheet.go` ~lines 51 and 148 + +The `props` slice in `applyToNode` (~line 51) controls which properties are applied from the stylesheet to nodes; +the property-name check in `parseRule` (~line 148) must accept it too. Add `"max_tokens"` to both: + +```go +// Before +props := []string{"llm_model", "llm_provider", "reasoning_effort"} + +// After +props := []string{"llm_model", "llm_provider", "reasoning_effort", "max_tokens"} +``` + +### 2. Read `max_tokens` from node attrs in CoderGen router + +**File:** `internal/attractor/engine/codergen_router.go` ~line 172 + +After the existing `reasoning_effort` extraction, add `max_tokens` extraction: + +```go +// Existing pattern for reasoning_effort (around line 172): +reasoning := node.Attr("reasoning_effort", "") + +// Add after: +maxTokensStr := node.Attr("max_tokens", "") +var maxTokensPtr *int +if maxTokensStr != "" { + if v, err := strconv.Atoi(maxTokensStr); err == nil && v > 0 { + maxTokensPtr = &v + } +} +``` + +Then pass it to the request in the **one_shot path** (~line 180): + +```go +req := llm.Request{ + Provider: prov, + Model: mid, + Messages: []llm.Message{llm.User(prompt)}, + ReasoningEffort: reasoningPtr, + MaxTokens: maxTokensPtr, // <-- add this +} +``` + +And in the **agent_loop path** — find where `SessionConfig` is built (~line 235) and +add `MaxTokens` there too. This requires a small addition to `SessionConfig` (see step 3). + +### 3. Add `MaxTokens` to agent SessionConfig + +**File:** `internal/agent/session.go` + +Add a `MaxTokens *int` field to the session config struct (find the struct definition +that holds `ReasoningEffort`). 
Then apply it when building the `llm.Request` (~line 466): + +```go +// Existing pattern: +if strings.TrimSpace(s.cfg.ReasoningEffort) != "" { + v := strings.TrimSpace(s.cfg.ReasoningEffort) + req.ReasoningEffort = &v +} + +// Add after: +if s.cfg.MaxTokens != nil { + req.MaxTokens = s.cfg.MaxTokens +} +``` + +### 4. Wire it up in CoderGen agent_loop session creation + +**File:** `internal/attractor/engine/codergen_router.go` + +Where `SessionConfig` is populated for agent_loop execution (~line 235): + +```go +sessCfg.ReasoningEffort = reasoning +sessCfg.MaxTokens = maxTokensPtr // <-- add this +``` + +### 5. Bump Anthropic provider adapter default to 64K + +**File:** `internal/llm/providers/anthropic/adapter.go` lines 97 and 270 + +Both the synchronous and streaming paths hardcode `maxTokens := 4096`. Change both to +`maxTokens := 65536` so that all Anthropic API calls get a generous default, eliminating +truncation for large tool calls even without per-node configuration: + +```go +// Before (lines 97 and 270) +maxTokens := 4096 + +// After +maxTokens := 65536 +``` + +Both paths already check `req.MaxTokens` and use it when non-nil, so per-node overrides +still take priority. Google's adapter (`adapter.go:97-101`) is left unchanged (its 2048 +default is appropriate for its usage pattern). + +## Usage + +### Per-node in DOT file + +```dot +expand_spec [ + shape=box, + max_tokens=32768 +] +``` + +### Via model stylesheet (per-class) + +```dot +graph [ + model_stylesheet=" + * { llm_model: claude-sonnet-4-6; llm_provider: anthropic; } + .hard { llm_model: claude-opus-4-6; max_tokens: 32768; } + " +] +``` + +### Priority order + +1. Explicit node attribute (`max_tokens=32768` on the node) — wins +2. Stylesheet match (`.hard { max_tokens: 32768; }`) — applied only if node attr missing +3. 
Provider adapter default (65536 for Anthropic) — fallback + +This matches the existing stylesheet semantics: `ApplyStylesheet` only sets properties +that are **missing** from node attrs (see `stylesheet.go` line 50). + +## Files to modify + +| File | Change | +|------|--------| +| `internal/attractor/style/stylesheet.go` | Add `"max_tokens"` to props whitelist | +| `internal/attractor/engine/codergen_router.go` | Read `max_tokens` attr, pass to Request and SessionConfig | +| `internal/agent/session.go` (or wherever SessionConfig is defined) | Add `MaxTokens *int` field, apply to Request | +| `internal/llm/providers/anthropic/adapter.go` | Bump default `maxTokens` from 4096 to 65536 (lines 97, 270) | + +## Testing + +1. Unit test: stylesheet with `max_tokens` property is parsed and applied to nodes +2. Unit test: `max_tokens` node attr is read and converted to `*int` correctly +3. Integration: run a pipeline with `max_tokens=32768` on a node and verify the API + request includes the correct value (check CXDB turn data or provider logs) + diff --git a/internal/agent/session.go b/internal/agent/session.go index ebcf358b..f15fb251 100644 --- a/internal/agent/session.go +++ b/internal/agent/session.go @@ -33,6 +33,10 @@ type SessionConfig struct { // Valid values are provider-dependent but typically include: low|medium|high. ReasoningEffort string + // MaxTokens overrides the provider adapter's default max_tokens when non-nil. + // Use this to allow larger outputs (e.g., large write_file tool calls). + MaxTokens *int + // ProviderOptions is merged into every LLM request as provider_options. // Use this for provider-specific parameters (e.g., Cerebras clear_thinking). 
ProviderOptions map[string]any @@ -473,6 +477,9 @@ func (s *Session) processOneInput(ctx context.Context, input string) (string, er v := strings.TrimSpace(s.cfg.ReasoningEffort) req.ReasoningEffort = &v } + if s.cfg.MaxTokens != nil { + req.MaxTokens = s.cfg.MaxTokens + } if len(s.cfg.ProviderOptions) > 0 { req.ProviderOptions = s.cfg.ProviderOptions } diff --git a/internal/attractor/engine/codergen_router.go b/internal/attractor/engine/codergen_router.go index 094b0b82..b11f72a3 100644 --- a/internal/attractor/engine/codergen_router.go +++ b/internal/attractor/engine/codergen_router.go @@ -175,6 +175,14 @@ func (r *CodergenRouter) runAPI(ctx context.Context, execCtx *Execution, node *m reasoningPtr = &reasoning } + maxTokensStr := strings.TrimSpace(node.Attr("max_tokens", "")) + var maxTokensPtr *int + if maxTokensStr != "" { + if v, err := strconv.Atoi(maxTokensStr); err == nil && v > 0 { + maxTokensPtr = &v + } + } + switch mode { case "one_shot": text, used, err := r.withFailoverText(ctx, execCtx, node, client, provider, modelID, func(prov string, mid string) (string, error) { @@ -183,6 +191,7 @@ func (r *CodergenRouter) runAPI(ctx context.Context, execCtx *Execution, node *m Model: mid, Messages: []llm.Message{llm.User(prompt)}, ReasoningEffort: reasoningPtr, + MaxTokens: maxTokensPtr, } if err := writeJSON(filepath.Join(stageDir, "api_request.json"), req); err != nil { warnEngine(execCtx, fmt.Sprintf("write api_request.json: %v", err)) @@ -234,6 +243,9 @@ func (r *CodergenRouter) runAPI(ctx context.Context, execCtx *Execution, node *m if reasoning != "" { sessCfg.ReasoningEffort = reasoning } + if maxTokensPtr != nil { + sessCfg.MaxTokens = maxTokensPtr + } // Cerebras GLM 4.7: preserve reasoning across agent-loop turns. // clear_thinking defaults to true on the API, which strips prior // reasoning context — counterproductive for multi-step agentic work. 
diff --git a/internal/attractor/style/stylesheet.go b/internal/attractor/style/stylesheet.go index f1f16dc5..04689647 100644 --- a/internal/attractor/style/stylesheet.go +++ b/internal/attractor/style/stylesheet.go @@ -48,7 +48,7 @@ func ApplyStylesheet(g *model.Graph, rules []Rule) error { func applyToNode(g *model.Graph, n *model.Node, rules []Rule) { // Only set properties that are missing. - props := []string{"llm_model", "llm_provider", "reasoning_effort"} + props := []string{"llm_model", "llm_provider", "reasoning_effort", "max_tokens"} for _, prop := range props { if _, ok := n.Attrs[prop]; ok { continue @@ -145,7 +145,7 @@ func (p *ssParser) parseRule() (Rule, error) { if err != nil { return Rule{}, err } - if prop != "llm_model" && prop != "llm_provider" && prop != "reasoning_effort" { + if prop != "llm_model" && prop != "llm_provider" && prop != "reasoning_effort" && prop != "max_tokens" { return Rule{}, p.errf("unknown property %q", prop) } p.skipSpace() diff --git a/internal/llm/providers/anthropic/adapter.go b/internal/llm/providers/anthropic/adapter.go index 88614bdc..9170b65a 100644 --- a/internal/llm/providers/anthropic/adapter.go +++ b/internal/llm/providers/anthropic/adapter.go @@ -94,7 +94,7 @@ func (a *Adapter) Complete(ctx context.Context, req llm.Request) (llm.Response, } autoCache := anthropicAutoCacheEnabled(a.Name(), req.ProviderOptions) - maxTokens := 4096 + maxTokens := 65536 if req.MaxTokens != nil && *req.MaxTokens > 0 { maxTokens = *req.MaxTokens } @@ -267,7 +267,7 @@ func (a *Adapter) Stream(ctx context.Context, req llm.Request) (llm.Stream, erro } autoCache := anthropicAutoCacheEnabled(a.Name(), req.ProviderOptions) - maxTokens := 4096 + maxTokens := 65536 if req.MaxTokens != nil && *req.MaxTokens > 0 { maxTokens = *req.MaxTokens } diff --git a/internal/llm/providers/anthropic/adapter_test.go b/internal/llm/providers/anthropic/adapter_test.go index 4b090e0b..4a58c64c 100644 --- a/internal/llm/providers/anthropic/adapter_test.go +++ 
b/internal/llm/providers/anthropic/adapter_test.go @@ -1583,7 +1583,7 @@ func TestAdapter_UsageCacheTokens_Mapped(t *testing.T) { } } -func TestAdapter_Complete_DefaultMaxTokens_Is4096(t *testing.T) { +func TestAdapter_Complete_DefaultMaxTokens_Is65536(t *testing.T) { var gotBody map[string]any srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -1620,8 +1620,8 @@ func TestAdapter_Complete_DefaultMaxTokens_Is4096(t *testing.T) { if !ok { t.Fatalf("max_tokens not found or not a number: %#v", gotBody["max_tokens"]) } - if int(mt) != 4096 { - t.Fatalf("max_tokens: got %d want 4096", int(mt)) + if int(mt) != 65536 { + t.Fatalf("max_tokens: got %d want 65536", int(mt)) } }