From 99a5cd707bcb452964353e7b9b87deccdb25dc00 Mon Sep 17 00:00:00 2001
From: Chad Woolley <cwoolley@gitlab.com>
Date: Sat, 21 Feb 2026 22:47:24 -0800
Subject: [PATCH] llm/anthropic, agent, attractor: configurable max_tokens with
 64K default

Make max_tokens configurable at three levels (node attr > stylesheet >
provider default) following the existing reasoning_effort pattern. Bump
the Anthropic adapter default from 4096 to 65536 so large tool calls
(e.g. write_file with 80KB+ content) are not truncated mid-JSON.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../2026-02-21-configurable-max-tokens.md     | 178 ++++++++++++++++++
 internal/agent/session.go                     |   7 +
 internal/attractor/engine/codergen_router.go  |  12 ++
 internal/attractor/style/stylesheet.go        |   4 +-
 internal/llm/providers/anthropic/adapter.go   |   4 +-
 .../llm/providers/anthropic/adapter_test.go   |   6 +-
 6 files changed, 204 insertions(+), 7 deletions(-)
 create mode 100644 docs/plans/2026-02-21-configurable-max-tokens.md

diff --git a/docs/plans/2026-02-21-configurable-max-tokens.md b/docs/plans/2026-02-21-configurable-max-tokens.md
new file mode 100644
index 00000000..cbe1adbc
--- /dev/null
+++ b/docs/plans/2026-02-21-configurable-max-tokens.md
@@ -0,0 +1,178 @@
+# Configurable `max_tokens` for LLM API Calls
+
+**Date:** 2026-02-21
+**Status:** Proposed
+**Problem:** The Anthropic provider adapter hardcodes `max_tokens: 4096`. When an agent
+emits a large tool call (e.g., `write_file` with 80KB+ content), the output is truncated
+mid-JSON, causing schema validation failures (`missing properties: 'content'`). This
+creates unrecoverable error loops.
+
+## Goal
+
+Make `max_tokens` configurable at three levels (highest priority wins):
+
+1. **Node attribute** in the pipeline DOT file (per-node override)
+2. **Model stylesheet** in the DOT graph attributes (per-class/selector)
+3. **Provider adapter default** (bumped from 4096 to 65536)
+
+This follows the exact same pattern as `reasoning_effort`, which is already configurable
+at levels 1 and 2.
+
+## Design
+
+### Level of effort: Small
+
+All plumbing already exists. `llm.Request.MaxTokens` is defined and respected by every
+provider adapter. The only missing piece is reading it from node attributes and the
+stylesheet, then passing it through to the request.
+
+## Changes
+
+### 1. Add `max_tokens` to stylesheet whitelist
+
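+**File:** `internal/attractor/style/stylesheet.go` ~lines 51 and 148
+
+The `props` slice in `applyToNode` (~line 51) controls which properties are applied from the stylesheet to nodes, and `parseRule` (~line 148) rejects any property name it does not recognize.
+Add `"max_tokens"` to both; the `props` change looks like this: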
+
+```go
+// Before
+props := []string{"llm_model", "llm_provider", "reasoning_effort"}
+
+// After
+props := []string{"llm_model", "llm_provider", "reasoning_effort", "max_tokens"}
+```
+
+### 2. Read `max_tokens` from node attrs in CoderGen router
+
+**File:** `internal/attractor/engine/codergen_router.go` ~line 172
+
+After the existing `reasoning_effort` extraction, add `max_tokens` extraction (a value that is not a positive integer is ignored, leaving the provider default in effect):
+
+```go
+// Existing pattern for reasoning_effort (around line 172):
+reasoning := node.Attr("reasoning_effort", "")
+
+// Add after:
+maxTokensStr := node.Attr("max_tokens", "")
+var maxTokensPtr *int
+if maxTokensStr != "" {
+    if v, err := strconv.Atoi(maxTokensStr); err == nil && v > 0 {
+        maxTokensPtr = &v
+    }
+}
+```
+
+Then pass it to the request in the **one_shot path** (~line 180):
+
+```go
+req := llm.Request{
+    Provider:        prov,
+    Model:           mid,
+    Messages:        []llm.Message{llm.User(prompt)},
+    ReasoningEffort: reasoningPtr,
+    MaxTokens:       maxTokensPtr,  // <-- add this
+}
+```
+
+And in the **agent_loop path** — find where `SessionConfig` is built (~line 235) and
+add `MaxTokens` there too. This requires a small addition to `SessionConfig` (see step 3).
+
+### 3. Add `MaxTokens` to agent SessionConfig
+
+**File:** `internal/agent/session.go`
+
+Add a `MaxTokens *int` field to the session config struct (find the struct definition
+that holds `ReasoningEffort`). Then apply it when building the `llm.Request` (~line 466):
+
+```go
+// Existing pattern:
+if strings.TrimSpace(s.cfg.ReasoningEffort) != "" {
+    v := strings.TrimSpace(s.cfg.ReasoningEffort)
+    req.ReasoningEffort = &v
+}
+
+// Add after:
+if s.cfg.MaxTokens != nil {
+    req.MaxTokens = s.cfg.MaxTokens
+}
+```
+
+### 4. Wire it up in CoderGen agent_loop session creation
+
+**File:** `internal/attractor/engine/codergen_router.go`
+
+Where `SessionConfig` is populated for agent_loop execution (~line 235):
+
+```go
+sessCfg.ReasoningEffort = reasoning
+sessCfg.MaxTokens = maxTokensPtr  // <-- add this
+```
+
+### 5. Bump Anthropic provider adapter default to 64K
+
+**File:** `internal/llm/providers/anthropic/adapter.go` lines 97 and 270
+
+Both the synchronous and streaming paths hardcode `maxTokens := 4096`. Change both to
+`maxTokens := 65536` so that all Anthropic API calls get a generous default, eliminating
+truncation for large tool calls even without per-node configuration:
+
+```go
+// Before (lines 97 and 270)
+maxTokens := 4096
+
+// After
+maxTokens := 65536
+```
+
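+Both paths already check `req.MaxTokens` and use it when non-nil and positive, so per-node overrides
+still take priority. Google's adapter (`adapter.go:97-101`) is left unchanged (its 2048
+default is appropriate for its usage pattern). Caveat: the Anthropic API rejects a `max_tokens` above the selected model's output limit, so models capped below 65536 need an explicit `max_tokens` override (node attr or stylesheet).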
+
+## Usage
+
+### Per-node in DOT file
+
+```dot
+expand_spec [
+    shape=box,
+    max_tokens=32768
+]
+```
+
+### Via model stylesheet (per-class)
+
+```dot
+graph [
+    model_stylesheet="
+        * { llm_model: claude-sonnet-4-6; llm_provider: anthropic; }
+        .hard { llm_model: claude-opus-4-6; max_tokens: 32768; }
+    "
+]
+```
+
+### Priority order
+
+1. Explicit node attribute (`max_tokens=32768` on the node) — wins
+2. Stylesheet match (`.hard { max_tokens: 32768; }`) — applied only if node attr missing
+3. Provider adapter default (65536 for Anthropic) — fallback
+
+This matches the existing stylesheet semantics: `ApplyStylesheet` only sets properties
+that are **missing** from node attrs (see `stylesheet.go` line 50).
+
+## Files to modify
+
+| File | Change |
+|------|--------|
+| `internal/attractor/style/stylesheet.go` | Add `"max_tokens"` to the `props` whitelist in `applyToNode` and to the property-name check in `parseRule` |
+| `internal/attractor/engine/codergen_router.go` | Read `max_tokens` attr, pass to Request and SessionConfig |
+| `internal/agent/session.go` | Add `MaxTokens *int` field to `SessionConfig`, apply to Request |
+| `internal/llm/providers/anthropic/adapter.go` | Bump default `maxTokens` from 4096 to 65536 (lines 97, 270) |
+
+## Testing
+
+1. Unit test: stylesheet with `max_tokens` property is parsed and applied to nodes
+2. Unit test: `max_tokens` node attr is read and converted to `*int` correctly
+3. Integration: run a pipeline with `max_tokens=32768` on a node and verify the API
+   request includes the correct value (check CXDB turn data or provider logs)
+
diff --git a/internal/agent/session.go b/internal/agent/session.go
index ebcf358b..f15fb251 100644
--- a/internal/agent/session.go
+++ b/internal/agent/session.go
@@ -33,6 +33,10 @@ type SessionConfig struct {
 	// Valid values are provider-dependent but typically include: low|medium|high.
 	ReasoningEffort string
 
+	// MaxTokens overrides the provider adapter's default max_tokens when non-nil.
+	// Use this to allow larger outputs (e.g., large write_file tool calls).
+	MaxTokens *int
+
 	// ProviderOptions is merged into every LLM request as provider_options.
 	// Use this for provider-specific parameters (e.g., Cerebras clear_thinking).
 	ProviderOptions map[string]any
@@ -473,6 +477,9 @@ func (s *Session) processOneInput(ctx context.Context, input string) (string, er
 			v := strings.TrimSpace(s.cfg.ReasoningEffort)
 			req.ReasoningEffort = &v
 		}
+		if s.cfg.MaxTokens != nil {
+			req.MaxTokens = s.cfg.MaxTokens
+		}
 		if len(s.cfg.ProviderOptions) > 0 {
 			req.ProviderOptions = s.cfg.ProviderOptions
 		}
diff --git a/internal/attractor/engine/codergen_router.go b/internal/attractor/engine/codergen_router.go
index 094b0b82..b11f72a3 100644
--- a/internal/attractor/engine/codergen_router.go
+++ b/internal/attractor/engine/codergen_router.go
@@ -175,6 +175,14 @@ func (r *CodergenRouter) runAPI(ctx context.Context, execCtx *Execution, node *m
 		reasoningPtr = &reasoning
 	}
 
+	maxTokensStr := strings.TrimSpace(node.Attr("max_tokens", ""))
+	var maxTokensPtr *int
+	if maxTokensStr != "" {
+		if v, err := strconv.Atoi(maxTokensStr); err == nil && v > 0 {
+			maxTokensPtr = &v
+		}
+	}
+
 	switch mode {
 	case "one_shot":
 		text, used, err := r.withFailoverText(ctx, execCtx, node, client, provider, modelID, func(prov string, mid string) (string, error) {
@@ -183,6 +191,7 @@ func (r *CodergenRouter) runAPI(ctx context.Context, execCtx *Execution, node *m
 				Model:           mid,
 				Messages:        []llm.Message{llm.User(prompt)},
 				ReasoningEffort: reasoningPtr,
+				MaxTokens:       maxTokensPtr,
 			}
 			if err := writeJSON(filepath.Join(stageDir, "api_request.json"), req); err != nil {
 				warnEngine(execCtx, fmt.Sprintf("write api_request.json: %v", err))
@@ -234,6 +243,9 @@ func (r *CodergenRouter) runAPI(ctx context.Context, execCtx *Execution, node *m
 			if reasoning != "" {
 				sessCfg.ReasoningEffort = reasoning
 			}
+			if maxTokensPtr != nil {
+				sessCfg.MaxTokens = maxTokensPtr
+			}
 			// Cerebras GLM 4.7: preserve reasoning across agent-loop turns.
 			// clear_thinking defaults to true on the API, which strips prior
 			// reasoning context — counterproductive for multi-step agentic work.
diff --git a/internal/attractor/style/stylesheet.go b/internal/attractor/style/stylesheet.go
index f1f16dc5..04689647 100644
--- a/internal/attractor/style/stylesheet.go
+++ b/internal/attractor/style/stylesheet.go
@@ -48,7 +48,7 @@ func ApplyStylesheet(g *model.Graph, rules []Rule) error {
 
 func applyToNode(g *model.Graph, n *model.Node, rules []Rule) {
 	// Only set properties that are missing.
-	props := []string{"llm_model", "llm_provider", "reasoning_effort"}
+	props := []string{"llm_model", "llm_provider", "reasoning_effort", "max_tokens"}
 	for _, prop := range props {
 		if _, ok := n.Attrs[prop]; ok {
 			continue
@@ -145,7 +145,7 @@ func (p *ssParser) parseRule() (Rule, error) {
 		if err != nil {
 			return Rule{}, err
 		}
-		if prop != "llm_model" && prop != "llm_provider" && prop != "reasoning_effort" {
+		if prop != "llm_model" && prop != "llm_provider" && prop != "reasoning_effort" && prop != "max_tokens" {
 			return Rule{}, p.errf("unknown property %q", prop)
 		}
 		p.skipSpace()
diff --git a/internal/llm/providers/anthropic/adapter.go b/internal/llm/providers/anthropic/adapter.go
index 88614bdc..9170b65a 100644
--- a/internal/llm/providers/anthropic/adapter.go
+++ b/internal/llm/providers/anthropic/adapter.go
@@ -94,7 +94,7 @@ func (a *Adapter) Complete(ctx context.Context, req llm.Request) (llm.Response,
 	}
 	autoCache := anthropicAutoCacheEnabled(a.Name(), req.ProviderOptions)
 
-	maxTokens := 4096
+	maxTokens := 65536
 	if req.MaxTokens != nil && *req.MaxTokens > 0 {
 		maxTokens = *req.MaxTokens
 	}
@@ -267,7 +267,7 @@ func (a *Adapter) Stream(ctx context.Context, req llm.Request) (llm.Stream, erro
 	}
 	autoCache := anthropicAutoCacheEnabled(a.Name(), req.ProviderOptions)
 
-	maxTokens := 4096
+	maxTokens := 65536
 	if req.MaxTokens != nil && *req.MaxTokens > 0 {
 		maxTokens = *req.MaxTokens
 	}
diff --git a/internal/llm/providers/anthropic/adapter_test.go b/internal/llm/providers/anthropic/adapter_test.go
index 4b090e0b..4a58c64c 100644
--- a/internal/llm/providers/anthropic/adapter_test.go
+++ b/internal/llm/providers/anthropic/adapter_test.go
@@ -1583,7 +1583,7 @@ func TestAdapter_UsageCacheTokens_Mapped(t *testing.T) {
 	}
 }
 
-func TestAdapter_Complete_DefaultMaxTokens_Is4096(t *testing.T) {
+func TestAdapter_Complete_DefaultMaxTokens_Is65536(t *testing.T) {
 	var gotBody map[string]any
 
 	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
@@ -1620,8 +1620,8 @@ func TestAdapter_Complete_DefaultMaxTokens_Is4096(t *testing.T) {
 	if !ok {
 		t.Fatalf("max_tokens not found or not a number: %#v", gotBody["max_tokens"])
 	}
-	if int(mt) != 4096 {
-		t.Fatalf("max_tokens: got %d want 4096", int(mt))
+	if int(mt) != 65536 {
+		t.Fatalf("max_tokens: got %d want 65536", int(mt))
 	}
 }